# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the feature table from disk (the file name suggests TF-IDF features).
data = pd.read_csv('data/train_tfidf_features.csv')

# Targets come from the 'label' column; every column other than 'label' and
# 'id' is used as a feature. Both are pulled out as plain arrays.
y = data['label'].values
X = data.drop(columns=['label', 'id']).values

# Search grid: the seeds used for splitting/boosting, the AdaBoost learning
# rates to compare, and the (fixed) number of boosting rounds.
n_estimators = 10
learning_rates = [0.01, 0.1, 0.5, 1.0]
random_seeds = [42, 52, 62]

# Best configuration found so far. These are updated inside the search loop
# whenever a (learning rate, seed) run beats the current best mean CV accuracy.
best_accuracy = 0
best_lr = None
best_seed = None

# Mean cross-validation accuracy per learning rate, one entry per seed.
lr_accuracies = {lr: [] for lr in learning_rates}

# Evaluate every (learning rate, seed) combination.
for lr in learning_rates:
    for seed in random_seeds:
        # Hold out 20% of the data; cross-validation below only uses the
        # remaining 80%. The held-out part (X_test, y_test) is not evaluated
        # in this block.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        # Depth-1 decision trees (stumps) as the weak learner.
        base_estimator = DecisionTreeClassifier(max_depth=1)

        # AdaBoost ensemble for this learning rate, seeded for reproducibility.
        adaboost = AdaBoostClassifier(estimator=base_estimator, n_estimators=n_estimators, learning_rate=lr, random_state=seed)

        # 5-fold cross-validation with shuffling, tied to the same seed.
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)

        fold_accuracies = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Fit on the training folds, then score on the validation fold.
            adaboost.fit(X_train_fold, y_train_fold)
            y_pred = adaboost.predict(X_val_fold)
            fold_accuracies.append(accuracy_score(y_val_fold, y_pred))

        # Average over the folds and record it for this learning rate.
        avg_fold_accuracy = np.mean(fold_accuracies)
        lr_accuracies[lr].append(avg_fold_accuracy)
        print(f"Accuracy for seed {seed}, learning rate {lr}: {avg_fold_accuracy:.2f}")

        # Keep the best configuration up to date; previously these variables
        # were initialised but never assigned, so they stayed at 0/None.
        if avg_fold_accuracy > best_accuracy:
            best_accuracy = avg_fold_accuracy
            best_lr = lr
            best_seed = seed

# Report the winning configuration of the search.
print(f"Best configuration: learning rate {best_lr}, seed {best_seed}, accuracy {best_accuracy:.2f}")

# Aggregate the per-seed accuracies: mean and standard deviation for every
# learning rate, keyed by learning rate.
lr_means = {}
lr_stds = {}
for lr in learning_rates:
    lr_means[lr] = np.mean(lr_accuracies[lr])
    lr_stds[lr] = np.std(lr_accuracies[lr])

# Flatten the dictionaries into parallel sequences for plotting.
mean_values = list(lr_means.values())
std_values = list(lr_stds.values())
rate_values = list(lr_means.keys())

# Error-bar plot: mean accuracy per learning rate, +/- one standard deviation.
fig, ax = plt.subplots(figsize=(12, 8))
ax.errorbar(rate_values, mean_values, yerr=std_values, fmt='-o', capsize=5)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Mean Accuracy')
ax.set_title('Mean Accuracy vs. Learning Rate with Error Bars')
ax.grid(True)
plt.show()

# Text summary of the same numbers, one line per learning rate.
for lr in learning_rates:
    print(f"Learning Rate: {lr}, Mean Accuracy: {lr_means[lr]:.2f}, Std: {lr_stds[lr]:.2f}")