In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Load your dataset
data = pd.read_csv('data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values  # Convert to NumPy array
y = data['label'].values  # Convert to NumPy array

# List of random seeds, learning rates, and number of estimators
random_seeds = [42, 52, 62]
learning_rates = [0.01, 0.1, 0.5, 1.0]

# Initialize variables to track the best configuration
best_accuracy = 0
best_lr = None
best_n_estimators = None
best_seed = None

# Iterate over each combination of learning rate and number of estimators
for lr in learning_rates:
    test_accuracies = []
    
    for seed in random_seeds:
        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

        # Initialize the base estimator
        base_estimator = DecisionTreeClassifier(max_depth=1)

        # Initialize the AdaBoost classifier
        adaboost = AdaBoostClassifier(estimator=base_estimator, n_estimators=10, learning_rate=lr, random_state=seed)

        # Define the k-fold cross-validator
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)

        # Prepare an array to collect predictions
        all_predictions = np.zeros((X_train.shape[0], kf.get_n_splits()), dtype=int)

        # Perform cross-validation
        for fold_index, (train_index, test_index) in enumerate(kf.split(X_train)):
            X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]
            
            # Train the classifier
            adaboost.fit(X_train_fold, y_train_fold)
            
            # Predict on the validation set
            y_pred = adaboost.predict(X_val_fold)
            
            # Store predictions in the appropriate rows
            all_predictions[test_index, fold_index] = y_pred

        # Calculate majority vote for each sample
        majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()

        # Train the classifier on the entire training set
        adaboost.fit(X_train, y_train)

        # Predict on the test set
        y_test_pred = adaboost.predict(X_test)

        # Calculate the accuracy on the test set
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_accuracies.append(test_accuracy)
        print(f"Test Accuracy for seed {seed}, learning rate {lr}, n_estimators {n_estimators}: {test_accuracy:.2f}")

    # Calculate and print the average accuracy for this learning rate and number of estimators
    average_accuracy = np.mean(test_accuracies)
    print(f"Average Test Accuracy for learning rate {lr} and n_estimators {n_estimators}: {average_accuracy:.2f}")

    # Update best parameters if current average accuracy is better
    if average_accuracy > best_accuracy:
        best_accuracy = average_accuracy
        best_lr = lr
        best_n_estimators = n_estimators
        best_seed = seed

# Print the best configuration
print(f"Best Accuracy: {best_accuracy:.2f}")
print(f"Best Learning Rate: {best_lr}")
print(f"Best Number of Estimators: {best_n_estimators}")
print(f"Best Seed: {best_seed}")

  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 42, learning rate 0.01, n_estimators 100: 0.65


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 52, learning rate 0.01, n_estimators 100: 0.63


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 62, learning rate 0.01, n_estimators 100: 0.63
Average Test Accuracy for learning rate 0.01 and n_estimators 100: 0.64


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 42, learning rate 0.1, n_estimators 100: 0.65


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 52, learning rate 0.1, n_estimators 100: 0.64


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 62, learning rate 0.1, n_estimators 100: 0.64
Average Test Accuracy for learning rate 0.1 and n_estimators 100: 0.64


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 42, learning rate 0.5, n_estimators 100: 0.67


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 52, learning rate 0.5, n_estimators 100: 0.66


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 62, learning rate 0.5, n_estimators 100: 0.66
Average Test Accuracy for learning rate 0.5 and n_estimators 100: 0.66


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 42, learning rate 1.0, n_estimators 100: 0.68


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 52, learning rate 1.0, n_estimators 100: 0.67


  majority_vote_predictions_train = mode(all_predictions, axis=1).mode.flatten()


Test Accuracy for seed 62, learning rate 1.0, n_estimators 100: 0.68
Average Test Accuracy for learning rate 1.0 and n_estimators 100: 0.68
Best Accuracy: 0.68
Best Learning Rate: 1.0
Best Number of Estimators: 100
Best Seed: 62


In [14]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)
y_test_pred_logistic = logistic_model.predict(X_test)
test_accuracy_logistic = (y_test_pred_logistic == y_test).mean()
print(f"Test Accuracy with Logistic Regression: {test_accuracy_logistic:.2f}")

Test Accuracy with Logistic Regression: 1.00


In [5]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score

# Load your dataset
data = pd.read_csv('data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values  # Convert to NumPy array
y = data['label'].values  # Convert to NumPy array

# List of random seeds, number of estimators, boosting types, number of leaves, max depth, subsample, colsample_bytree
random_seeds = [42, 52, 62]
n_estimators_list = [50, 100]
boosting_types = ['gbdt', 'dart']
num_leaves_list = [15, 31, 63]
max_depth_list = [-1, 10, 20]  # -1 means no limit
subsample_list = [0.8, 0.9, 1.0]
colsample_bytree_list = [0.8, 0.9, 1.0]

# Initialize variables to track the best configuration
best_accuracy = 0
best_params = {}

# Store results for all configurations
results = []

# Iterate over each combination of boosting type, number of estimators, number of leaves, max depth, subsample, and colsample_bytree
for boosting_type in boosting_types:
    for n_estimators in n_estimators_list:
        for num_leaves in num_leaves_list:
            for max_depth in max_depth_list:
                for subsample in subsample_list:
                    for colsample_bytree in colsample_bytree_list:
                        test_accuracies = []

                        for seed in random_seeds:
                            # Split data into training and testing sets
                            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

                            # Define the k-fold cross-validator
                            kf = KFold(n_splits=5, shuffle=True, random_state=seed)

                            # Prepare an array to collect predictions
                            all_predictions = np.zeros((X_train.shape[0], kf.get_n_splits()), dtype=int)

                            # Perform cross-validation
                            for fold_index, (train_index, val_index) in enumerate(kf.split(X_train)):
                                X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
                                y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

                                # Initialize the LightGBM classifier
                                lgbm = lgb.LGBMClassifier(
                                    n_estimators=n_estimators,
                                    learning_rate=0.1,
                                    boosting_type=boosting_type,
                                    num_leaves=num_leaves,
                                    max_depth=max_depth,
                                    subsample=subsample,
                                    colsample_bytree=colsample_bytree,
                                    random_state=seed
                                )

                                # Train the classifier
                                lgbm.fit(X_train_fold, y_train_fold)

                                # Predict on the validation set
                                y_pred = lgbm.predict(X_val_fold)

                                # Store predictions in the appropriate rows
                                all_predictions[val_index, fold_index] = y_pred

                            # Calculate majority vote for each sample
                            majority_vote_predictions_train = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=all_predictions)

                            # Train the classifier on the entire training set
                            lgbm.fit(X_train, y_train)

                            # Predict on the test set
                            y_test_pred = lgbm.predict(X_test)

                            # Calculate the accuracy on the test set
                            test_accuracy = accuracy_score(y_test, y_test_pred)
                            test_accuracies.append(test_accuracy)

                        # Calculate and store the average accuracy for this combination of parameters
                        average_accuracy = np.mean(test_accuracies)
                        results.append((boosting_type, n_estimators, num_leaves, max_depth, subsample, colsample_bytree, average_accuracy))

                        # Update best parameters if current average accuracy is better
                        if average_accuracy > best_accuracy:
                            best_accuracy = average_accuracy
                            best_params = {
                                'boosting_type': boosting_type,
                                'n_estimators': n_estimators,
                                'num_leaves': num_leaves,
                                'max_depth': max_depth,
                                'subsample': subsample,
                                'colsample_bytree': colsample_bytree
                            }

# Print all results at the end
print("\nAll Results:")
for boosting_type, n_estimators, num_leaves, max_depth, subsample, colsample_bytree, accuracy in results:
    print(f"Boosting Type: {boosting_type}, Number of Estimators: {n_estimators}, Num Leaves: {num_leaves}, Max Depth: {max_depth}, Subsample: {subsample}, Colsample_bytree: {colsample_bytree}, Average Accuracy: {accuracy:.2f}")

# Print the best configuration
print(f"\nBest Accuracy: {best_accuracy:.2f}")
print(f"Best Parameters: {best_params}")

[LightGBM] [Info] Number of positive: 4203, number of negative: 6794
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18899
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 814
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382195 -> initscore=-0.480241
[LightGBM] [Info] Start training from score -0.480241
[LightGBM] [Info] Number of positive: 4205, number of negative: 6792
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18842
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 801
[LightGBM] [Info] 

KeyboardInterrupt: 