In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from scipy.stats import mode
import matplotlib.pyplot as plt
from tqdm import tqdm


# Function to generate a bootstrapped dataset
def create_bootstrap_data(inputs, targets):
    """Draw a bootstrap resample of a dataset.

    Row positions are drawn uniformly at random, with replacement, using
    NumPy's global random state; as many positions are drawn as ``inputs``
    has rows. Both arrays are indexed with the same positions so every
    resampled row keeps its own target.

    Args:
        inputs: Array whose first axis enumerates samples.
        targets: Array indexable by the same row positions as ``inputs``.

    Returns:
        A tuple ``(resampled_inputs, resampled_targets)``.
    """
    sample_count = inputs.shape[0]
    row_picks = np.random.choice(
        range(sample_count),
        size=sample_count,
        replace=True,
    )
    # Apply the identical picks to both arrays to keep rows and targets aligned.
    return tuple(array[row_picks] for array in (inputs, targets))


# Function to train multiple decision trees on bootstrapped datasets
def train_decision_forest(feature_set, target_set, num_models):
    """Train a bagged ensemble of decision trees.

    Each tree is a default-configured ``DecisionTreeClassifier`` fitted on
    its own bootstrap resample produced by ``create_bootstrap_data``.

    Args:
        feature_set: Training features, one row per sample.
        target_set: Training targets aligned with ``feature_set``.
        num_models: Number of trees to train.

    Returns:
        A list of ``num_models`` fitted trees (empty when ``num_models`` is 0).
    """
    # ``fit`` returns the estimator itself, so each element is the fitted tree.
    return [
        DecisionTreeClassifier().fit(
            *create_bootstrap_data(feature_set, target_set)
        )
        for _ in range(num_models)
    ]


# Function to aggregate predictions from a decision forest
def forest_vote_predict(models, data_to_predict):
    """Predict by majority vote over a collection of fitted models.

    Every model's ``predict`` is called on ``data_to_predict``; the
    per-sample most frequent prediction (``scipy.stats.mode`` along the
    model axis) is returned.

    Args:
        models: Iterable of objects exposing ``predict(data)``.
        data_to_predict: Samples passed unchanged to each model.

    Returns:
        A flattened array holding the winning prediction for each sample.
    """
    per_model_predictions = []
    for estimator in models:
        per_model_predictions.append(estimator.predict(data_to_predict))

    # Rows are models, columns are samples; vote down each column.
    vote_matrix = np.array(per_model_predictions)
    vote_result = mode(vote_matrix, axis=0)
    return vote_result.mode.flatten()


# Function to read data from files and prepare inputs and targets
def load_and_prepare_data(file_positive, file_negative):
    """Load two tab-separated files as one shuffled binary dataset.

    Every non-blank line yields one sample: the first tab-separated field
    is discarded and the remaining fields are parsed as floats. Samples
    from ``file_positive`` are labelled 1 and samples from
    ``file_negative`` are labelled 0. Blank or whitespace-only lines (for
    example a trailing empty line at the end of a file) are skipped so
    they cannot produce an empty, ragged feature row.

    Args:
        file_positive: Path of the file holding the positive class.
        file_negative: Path of the file holding the negative class.

    Returns:
        A tuple ``(labels, features)`` of NumPy arrays, jointly shuffled
        with ``np.random.permutation``.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    def parse_file(file_path, class_label):
        """Return ``(labels, features)`` arrays for a single file."""
        features = []
        with open(file_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Skip blank lines instead of appending an empty row.
                    continue
                features.append([float(value) for value in stripped.split('\t')[1:]])
        labels = [class_label] * len(features)
        return np.array(labels), np.array(features)

    positive_labels, positive_features = parse_file(file_positive, 1)
    negative_labels, negative_features = parse_file(file_negative, 0)

    all_labels = np.concatenate([positive_labels, negative_labels], axis=0)
    all_features = np.vstack([positive_features, negative_features])

    # One permutation applied to both arrays keeps labels and rows aligned.
    shuffle_order = np.random.permutation(all_labels.shape[0])
    return all_labels[shuffle_order], all_features[shuffle_order]


# File paths and data processing
classification_labels, classification_features = load_and_prepare_data('sun_exposed.txt', 'not_sun_exposed.txt')


# Train the decision forest
trained_forest = train_decision_forest(classification_features, classification_labels, num_models=5)

# Predict and evaluate the model (accuracy is measured on the training data itself)
predicted_outputs = forest_vote_predict(trained_forest, classification_features)
print("Sample Predictions:", predicted_outputs[:10])
print("Training Accuracy:", np.mean(predicted_outputs == classification_labels))

# Analyze forest performance with varying numbers of models
tree_numbers = range(1, 21)
performance_metrics = []

# Train the largest forest once. Each bagged tree is fitted independently on
# its own bootstrap sample, so the first k trees form a valid k-tree forest.
# This trains max(tree_numbers) trees instead of one fresh forest per tree
# count (1 + 2 + ... + 20 = 210 trees).
largest_forest = train_decision_forest(
    classification_features, classification_labels, num_models=max(tree_numbers)
)

for trees in tqdm(tree_numbers):
    forest_model = largest_forest[:trees]
    training_predictions = forest_vote_predict(forest_model, classification_features)
    accuracy = np.mean(training_predictions == classification_labels)
    performance_metrics.append(accuracy)

# Plot the performance analysis
plt.figure(figsize=(10, 6))
plt.plot(tree_numbers, performance_metrics, marker='o', linestyle='-', color='purple', label='Training Accuracy')
plt.title("Forest Accuracy vs Number of Decision Trees")
plt.xlabel("Number of Trees")
plt.ylabel("Accuracy")
plt.xticks(tree_numbers)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.show()


     0      1      2     3      4      5      6      7     8     9   ...  \
0  0.08   0.18   0.00  0.03   0.00   0.00   0.09   0.00  0.00  0.00  ...   
1  0.15   0.16   0.00  0.10   0.07   0.11   0.09   0.13  0.09  0.05  ...   
2  9.76  11.13  15.09  6.94  13.32  11.25  10.37  10.92  7.70  9.26  ...   
3  0.18   0.14   0.02  0.08   0.09   0.11   0.12   0.08  0.02  0.11  ...   
4  0.00   0.00   0.00  0.00   0.00   0.00   0.00   0.00  0.00  0.00  ...   

     90    91     92     93    94     95     96    97     98     99  
0  0.05  0.00   0.00   0.08  0.00   0.04   0.00  0.03   0.07   0.00  
1  0.07  0.26   0.06   0.19  0.17   0.07   0.17  0.08   0.24   0.08  
2  9.51  7.05  11.77  12.75  8.11  13.24  11.30  9.30  13.04  10.99  
3  0.10  0.05   0.00   0.09  0.06   0.06   0.16  0.05   0.10   0.17  
4  0.00  0.00   0.00   0.00  0.00   0.00   0.00  0.00   0.00   0.00  

[5 rows x 100 columns]
   0
0  1
1  0
2  1
3  1
4  0
Sample Predictions: [1 0 1 1 1 1 0 0 1 1]
Training Accuracy: 0.870732

 15%|█▌        | 3/20 [02:25<15:12, 53.70s/it]

Removing bootstrapping from the sampling step and making the choice of split features deterministic increases a random forest's risk of overfitting. Without bootstrapping, every tree is built on the identical dataset, which makes the trees highly correlated. Without randomness in feature selection, tree construction becomes deterministic, resulting in:

- Higher similarity among the trees, which reduces the diversity of the ensemble.
- An increased risk of overfitting to the training data, as the trees become overly specialized to the shared patterns in the data. 
- A loss of the robustness and generalization advantages typically offered by Random Forest. * -1
- 

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
def my_forest_ada_train(features, labels, n_trees):
    """Fit up to ``n_trees`` depth-1 trees with AdaBoost-style reweighting.

    Sample weights start uniform. Each round fits a stump with the current
    weights, measures its weighted error, derives a vote weight from that
    error, and rescales the sample weights by
    ``exp(-vote * labels * predictions)`` before renormalising. The update
    multiplies labels by predictions, so it is only meaningful for labels
    encoded as -1 / +1.

    Args:
        features: Training features; ``features.shape[0]`` is the sample count.
        labels: Training labels aligned with ``features``.
        n_trees: Maximum number of boosting rounds.

    Returns:
        A list of ``(stump, vote_weight)`` tuples, possibly shorter than
        ``n_trees`` because training stops once a stump's weighted error
        exceeds 0.5.
    """
    n_rows = features.shape[0]
    weights = np.ones(n_rows) / n_rows
    ensemble = []

    for _round in range(n_trees):
        # Train a weak learner on the current sample weighting
        learner = DecisionTreeClassifier(max_depth=1)
        learner.fit(features, labels, sample_weight=weights)
        guesses = learner.predict(features)

        missed = guesses != labels
        weighted_error = (weights * missed).sum() / weights.sum()
        if weighted_error > 0.5:
            break

        # A zero error would make the log ratio infinite; use a fixed vote of 1.
        vote = 1 if weighted_error == 0 else 0.5 * np.log((1 - weighted_error) / weighted_error)

        weights = weights * np.exp(-vote * labels * guesses)
        weights = weights / weights.sum()

        ensemble.append((learner, vote))

    return ensemble
def my_forest_ada_predict(trained_trees, features):
    """Predict with a weighted vote over ``(tree, weight)`` pairs.

    Each tree's predictions are scaled by its weight and summed per sample;
    the sign of the total is returned.

    Args:
        trained_trees: Iterable of ``(tree, weight)`` tuples, where ``tree``
            exposes ``predict(features)``.
        features: Samples to classify; ``features.shape[0]`` is the count.

    Returns:
        Float array with the sign of the weighted vote per sample. A total
        of exactly zero (including an empty ensemble) yields 0.
    """
    score = np.zeros(features.shape[0])
    for learner, alpha in trained_trees:
        contribution = alpha * learner.predict(features)
        np.add(score, contribution, out=score)
    return np.sign(score)
def my_forest_ada_train_equal_weights(features, labels, n_trees):
    """Fit ``n_trees`` depth-1 trees without any sample reweighting.

    Every stump is fitted on the same, unweighted data and paired with a
    constant vote weight of 1, so the result has the same
    ``(tree, weight)`` layout as ``my_forest_ada_train``.

    Args:
        features: Training features.
        labels: Training labels aligned with ``features``.
        n_trees: Number of stumps to fit.

    Returns:
        A list of ``n_trees`` tuples ``(stump, 1)``.
    """
    # ``fit`` returns the estimator itself; 1 is the equal weight for all trees.
    return [
        (DecisionTreeClassifier(max_depth=1).fit(features, labels), 1)
        for _ in range(n_trees)
    ]
def my_forest_ada_predict_equal_weights(trained_trees, features):
    """Predict with an unweighted vote over ``(tree, weight)`` pairs.

    The stored weight of each pair is ignored; raw predictions are summed
    per sample and the sign of the total is returned.

    Args:
        trained_trees: Iterable of ``(tree, weight)`` tuples, where ``tree``
            exposes ``predict(features)``.
        features: Samples to classify; ``features.shape[0]`` is the count.

    Returns:
        Float array with the sign of the vote total per sample. A tied
        vote (total of exactly zero) yields 0.
    """
    tally = np.zeros(features.shape[0])
    for learner, _ignored_weight in trained_trees:
        np.add(tally, learner.predict(features), out=tally)
    return np.sign(tally)
def load_data(file1, file2):
    """Load two tab-separated files as one shuffled -1/+1 labelled dataset.

    Every non-blank line yields one sample: the first tab-separated field
    is discarded and the remaining fields are parsed as floats. Samples
    from ``file1`` are labelled 1 and samples from ``file2`` are labelled
    -1. Blank or whitespace-only lines (for example a trailing empty line
    at the end of a file) are skipped so they cannot produce an empty,
    ragged feature row.

    Args:
        file1: Path of the file holding the +1 class.
        file2: Path of the file holding the -1 class.

    Returns:
        A tuple ``(features, targets)`` of NumPy arrays, jointly shuffled
        with ``np.random.permutation``.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    def read_file(file_path, label):
        """Return ``(features, targets)`` arrays for a single file."""
        features = []
        with open(file_path, 'r') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    # Skip blank lines instead of appending an empty row.
                    continue
                features.append([float(value) for value in stripped.split('\t')[1:]])
        targets = [label] * len(features)
        return np.array(features), np.array(targets)

    features1, targets1 = read_file(file1, 1)
    features2, targets2 = read_file(file2, -1)

    features = np.vstack([features1, features2])
    targets = np.concatenate([targets1, targets2])

    # One permutation applied to both arrays keeps rows and targets aligned.
    shuffle_indices = np.random.permutation(len(targets))
    return features[shuffle_indices], targets[shuffle_indices]
X, Y = load_data('sun_exposed.txt', 'not_sun_exposed.txt')
n_trees_range = range(1, 21)
standard_accuracies = []

# Boosting rounds run in sequence and round k never depends on how many
# rounds come after it, so the first k learners of one long run are the
# k-round model. Train once with the largest round count and score prefixes
# instead of retraining from scratch for every n_trees. If training stops
# early, slicing simply returns the shorter ensemble, just as a fresh run
# would.
full_ada_model = my_forest_ada_train(X, Y, max(n_trees_range))

for n_trees in tqdm(n_trees_range):
    ada_model = full_ada_model[:n_trees]
    predictions = my_forest_ada_predict(ada_model, X)
    accuracy = np.mean(predictions == Y)
    standard_accuracies.append(accuracy)


equal_weights_accuracies = []

# Same idea for the baseline: every stump is fitted on the same unweighted
# data, so one batch of stumps serves every ensemble size.
full_equal_model = my_forest_ada_train_equal_weights(X, Y, max(n_trees_range))

for n_trees in tqdm(n_trees_range):
    equal_model = full_equal_model[:n_trees]
    predictions = my_forest_ada_predict_equal_weights(equal_model, X)
    accuracy = np.mean(predictions == Y)
    equal_weights_accuracies.append(accuracy)


# Compare training accuracy of both variants across ensemble sizes
plt.figure(figsize=(12, 6))
plt.plot(n_trees_range, standard_accuracies, marker='o', label='Standard AdaBoost')
plt.plot(n_trees_range, equal_weights_accuracies, marker='s', label='Equal Weights AdaBoost')
plt.title('Accuracy vs Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()


Observations
Standard AdaBoost:

Accuracy improves as the number of trees increases, but may plateau or decrease slightly due to overfitting.
Dynamic weighting allows the algorithm to focus on harder-to-classify samples, improving robustness.
Equal Weights AdaBoost:

Accuracy improves initially but plateaus much earlier than Standard AdaBoost.
Without adaptive weights, the model cannot emphasize harder-to-classify samples, leading to reduced performance.
Impact of Adaptive Weighting:

Adaptive weighting enhances AdaBoost's ability to address difficult samples, improving overall accuracy.
The equal-weight version essentially mimics a simple ensemble method without the boosting aspect.
These differences demonstrate the importance of adaptive sample weighting in AdaBoost's performance.