# Task 3: Try other machine learning models and race to the top

The final model used is made of 5 other models:
1) Logistic regression

2) SVM

3) Naive Bayes

4) Bagging

5) Boosting

Each model will make their own prediction given the data point. The final model will then make its prediction based on the majority prediction of the 5 models.

Each model creates their own prediction file that can be found in task3_predictions. Alternatively, we can uncomment the code that creates the file and run the code to obtain the predictions for each model.

Let's see the code implementation of all 5 models and their hyperparameter optimisation.

## Logistic Regression

The hyperparameters to optimise is Learning rate (lr) and Batch size (bs)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# Define your train, predict, and accuracy functions
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def loss(y, y_hat):
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

def gradient_descent(X, y, y_hat):
    m = X.shape[0]
    dw = (1 / m) * np.dot(X.T, (y_hat - y))
    db = (1 / m) * np.sum(y_hat - y)
    return dw, db

def train(X, y, bs, epochs, lr, X_val, y_val):
    n_samples, n_features = X.shape
    w = np.zeros((n_features, 1))
    b = 0
    y = y.reshape(n_samples, 1)
    
    training_loss = []
    validation_f1 = []
    validation_loss = []

    for epoch in range(epochs):
        epoch_loss = 0
        for i in range(0, n_samples, bs):
            X_batch = X[i:i + bs]
            y_batch = y[i:i + bs]
            y_hat = sigmoid(np.dot(X_batch, w) + b)
            dw, db = gradient_descent(X_batch, y_batch, y_hat)
            w -= lr * dw
            b -= lr * db

            epoch_loss += loss(y_batch, y_hat)
        
        epoch_loss /= (n_samples // bs)
        training_loss.append(epoch_loss)

        # Calculate validation F1 score and loss
        y_hat_val = sigmoid(np.dot(X_val, w) + b)
        y_pred_val = (y_hat_val > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred_val, average='macro')
        validation_f1.append(f1)
        val_loss = loss(y_val, y_hat_val)
        validation_loss.append(val_loss)

        if epoch % 100 == 0:
            print(f'Epoch {epoch}, Training Loss: {epoch_loss:.4f}, Validation F1: {f1:.4f}, Validation Loss: {val_loss:.4f}')

    return w, b, training_loss, validation_f1, validation_loss

def predict(X, w, b):
    y_hat = sigmoid(np.dot(X, w) + b)
    pred = [1 if i > 0.5 else 0 for i in y_hat]
    return np.array(pred)

def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

Epoch 0, Training Loss: 0.6916, Validation F1: 0.3812, Validation Loss: 0.6875
Epoch 100, Training Loss: 0.6604, Validation F1: 0.3812, Validation Loss: 0.6661
Epoch 200, Training Loss: 0.6551, Validation F1: 0.3812, Validation Loss: 0.6663
Epoch 300, Training Loss: 0.6501, Validation F1: 0.3821, Validation Loss: 0.6667
Epoch 400, Training Loss: 0.6454, Validation F1: 0.3829, Validation Loss: 0.6672
Epoch 0, Training Loss: 0.6946, Validation F1: 0.3812, Validation Loss: 0.6901
Epoch 100, Training Loss: 0.6646, Validation F1: 0.3812, Validation Loss: 0.6660
Epoch 200, Training Loss: 0.6618, Validation F1: 0.3812, Validation Loss: 0.6661


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score
import pandas as pd

def optimize(X_features, Y_label):
    epochs = 1000
    learning_rate_options = [0.001, 0.01, 0.1, 0.2]
    batch_size_options = [16, 32, 64]
    k = 5  # Number of folds for cross-validation

    f1_results = []
    best_w = None
    best_b = None
    best_f1 = 0
    best_params = {}

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    for lr in learning_rate_options:
        for bs in batch_size_options:
            f1_scores = []

            for train_index, val_index in kf.split(X_features):
                X_train_split, X_val_split = X_features[train_index], X_features[val_index]
                y_train_split, y_val_split = Y_label[train_index], Y_label[val_index]
                
                w, b, training_loss, validation_f1, validation_loss = train(X_train_split, y_train_split, bs, epochs, lr, X_val_split, y_val_split)
                y_pred = predict(X_val_split, w, b)
                f1 = f1_score(y_val_split, y_pred, average='macro')
                f1_scores.append(f1)

            avg_f1 = sum(f1_scores) / len(f1_scores)

            if avg_f1 > best_f1:
                best_f1 = avg_f1
                best_params = {'learning_rate': lr, 'batch_size': bs}
                best_w = w
                best_b = b
            
            # Store the results
            f1_results.append((lr, bs, avg_f1))

    f1_df = pd.DataFrame(f1_results, columns=['Learning Rate', 'Batch Size', 'F1 Score'])

    # Identify the best hyperparameters
    best_params_idx = f1_df['F1 Score'].idxmax()
    best_params = f1_df.iloc[best_params_idx]

    print(f'Best Hyperparameters: Learning Rate = {best_params["Learning Rate"]}, Batch Size = {best_params["Batch Size"]}, F1 Score = {best_params["F1 Score"]:.4f}')

    return best_params, f1_df, best_w, best_b

In [None]:
# Load your data
data = pd.read_csv("Data/train_tfidf_features.csv")
X_features = data.drop(['label', 'id'], axis=1).values
Y_label = data['label'].values

# Optimize hyperparameters
best_params, f1_df, best_w, best_b = optimize(X_features, Y_label)

In [None]:
best_lr = best_params['Learning Rate']
best_bs = best_params['Batch Size']

In [None]:
def plot_f1_vs_batch_size(f1_df):
    learning_rate_options = f1_df['Learning Rate'].unique()
    plt.figure(figsize=(12, 6))
    for lr in learning_rate_options:
        subset = f1_df[f1_df['Learning Rate'] == lr]
        plt.plot(subset['Batch Size'], subset['F1 Score'], marker='o', label=f'LR: {lr}')
    plt.title('F1 Score vs Batch Size')
    plt.xlabel('Batch Size')
    plt.ylabel('F1 Score')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_f1_vs_learning_rate(f1_df):
    batch_size_options = f1_df['Batch Size'].unique()
    plt.figure(figsize=(12, 6))
    for bs in batch_size_options:
        subset = f1_df[f1_df['Batch Size'] == bs]
        plt.plot(subset['Learning Rate'], subset['F1 Score'], marker='o', label=f'BS: {bs}')
    plt.title('F1 Score vs Learning Rate')
    plt.xlabel('Learning Rate')
    plt.ylabel('F1 Score')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_pred_vs_actual(y_val, y_pred):
    plt.figure(figsize=(12, 6))
    plt.scatter(range(len(y_val)), y_val, color='blue', alpha=0.6, label='Actual')
    plt.scatter(range(len(y_pred)), y_pred, color='red', alpha=0.6, label='Predicted')
    plt.plot(range(len(y_pred)), y_pred, color='red', alpha=0.6)
    plt.title('Predicted vs Actual Values')
    plt.xlabel('Sample Index')
    plt.ylabel('Class Label')
    plt.legend()
    plt.show()

In [None]:
# Plot the results
plot_f1_vs_batch_size(f1_df)
plot_f1_vs_learning_rate(f1_df)

# Validate with the best parameters
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_features, Y_label, test_size=0.2, random_state=42)
y_pred = predict(X_val_split, best_w, best_b)

# Plot predicted vs actual values
plot_pred_vs_actual(y_val_split, y_pred)

In [None]:
# Load the test data and first set of predictions
test = pd.read_csv("Data/test_tfidf_features.csv")

# Predict on the test data using the best model
X_test = test.drop(columns=['id']).values
y_pred_best = predict(X_test, best_w, best_b)

# Save the predictions to a CSV file
predictions_df = pd.DataFrame({
    'id': test['id'],
    'label': y_pred_best
})

# Display the first few rows of the final predictions
print(predictions_df.head())

#predictions_df.to_csv('Optimal_LogReg.csv', index=False) #uncomment to get file, otherwise, predictions can be found in task3_predictions

# Completed Print
print("Final predictions saved to logreg_predictions.csv")

The optimal hyperparater values are
Learning rate: 0.1
Batch sizeL: 32

## SVM

SVM with different kernel functions were investigated:
- Linear SVM (investigated both LinearSVC and SVC module)
- Radial Basis function SVM*
- Sigmoid SVM
- Poly SVM

#### Linear SVM

Using SVC module

Hyperparameters:
1) C

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PLAIN AND SIMPLE LINEAR SVM USING SVC KERNEL = LINEAR

import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

pipeline = Pipeline([
    ('svc', SVC(kernel='linear',
               gamma=0.1))
])
param_grid = {
    'svc__C': [0.1, 1, 10, 50, 100]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro') 


print(f'Accuracy with SGDClassifier: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report with SGDClassifier:')
print(report)

# Result
# F1: 0.6926448445963982

Using LinearSVC module,

Hyperparameters:
1) C
2) Max Iteration
3) loss


In [None]:
# LINEAR SVM WITH DIFFERING REGULARISATION VALUE USING LINEARSVC
# https://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html#sphx-glr-auto-examples-svm-plot-svm-scale-c-py

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score


""" import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)# uncomment in case prev is disrupted """


pipeline = Pipeline([
    ('svc', LinearSVC(penalty="l1", loss="squared_hinge", dual=False, tol=1e-3))
])

param_grid = {
    'svc__C': [0.17385, 0.173875, 0.1739, 0.173925, 0.17395],
    'svc__max_iter': [6000, 6200, 6300, 6400, 6500],
    'svc__loss': ['hinge','squared_hinge']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=7, scoring='f1_macro')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f'Best parameters: {best_params}')


# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)


""" Best parameters: {'svc__C': 0.17385, 'svc__loss': 'squared_hinge', 'svc__max_iter': 6000}
Accuracy: 0.722432353796916
F1 Score: 0.6880534155931336 with cv=7."""

#### Sigmoid SVM

Hyperparameters:
- C: regularisation term
- coef: independent term added to the product

In [None]:
# Sigmoid SVM
# https://stats.stackexchange.com/questions/90736/the-difference-of-kernels-in-svm


import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,f1_score

""" import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)# uncomment in case prev is disrupted """


svm_sigmoid = SVC(kernel='sigmoid', C=1.0, coef0=1)
svm_sigmoid.fit(X_train, y_train)
y_pred = svm_sigmoid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)

'''
Accuracy: 0.5292406168169915
F1 Score: 0.49756365081946474
Noting the poor performance, sigmoid SVM was not investigated further
'''



#### Poly SVM

Hyperparameters:
- C: regularisation term
- coef: independent term added to the product

In [None]:
# POLY SVM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

""" import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)# uncomment in case prev is disrupted """

svm_poly = SVC(kernel='poly', degree=5, C=1.0, coef0=1) # here we will explore different degrees

svm_poly.fit(X_train, y_train)

# Make predictions
y_pred = svm_poly.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)
'''
Accuracy: 0.6264183881291824
F1 Score: 0.3851520572450805
Noting the poor performance, Poly SVM was not investigated further
'''

#### Radial basis function SVM 

This was chosen as the final model for SVM.

Hyperparamters:
- c: regularisation term 
- gamma: kernel coefficient 

Additional Reference for the rationale of choosing RBF:
https://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf

In [None]:
# Radial Basis function SVM
# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
Y = data['label'].values

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,f1_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold

pipeline = Pipeline([
    ('rbf', SVC(kernel='rbf'))
])

param_grid = {
    'rbf__C': [ 5, 10, 20,30, 50],
    'rbf__gamma': ['scale', 0.001, 0.01, 0.1, 1]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1) 

grid_search.fit(X, Y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


print(f'Best parameters: {best_params}')

# Modified this cell to train with 100% of training data to enhance the model's performance, to better perform of predicting test data.
# Previous modification of the cell utilised 80-20% split, the following was its performance
# F1 score was a 0.69934251368587 with gamma= 'scale' and C = 10
# F1 score was a 0.7002338328283246 with gamma= 1 and C = 5
# However took 300 mins to train each time.......

Predicting Test Data using RBF SVM Model

In [None]:
test_dataset = pd.read_csv('Data/test_tfidf_features.csv')
x_test_dataset = test_dataset.drop(columns=['id']).values
y_test_pred = best_model.predict(x_test_dataset)
predictions_df = pd.DataFrame({'id': test_dataset['id'], 'label': y_test_pred})
# Save the DataFrame to a CSV file
#predictions_df.to_csv('Optimal_svm.csv', index=False)  #uncomment to get file, otherwise file is found in /task3_predictions

Validation of SVM using RBF Kernel compared to other Kernels:

Compare performance with other kernels to validate that it is better than other kernels

In [None]:


import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score

""" import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)# uncomment in case prev is disrupted """


svm = SVC()
pipeline = Pipeline([
    ('feature_selection', SelectKBest(chi2)), 
    ('rbf', SVC(kernel='rbf'))
])

param_grid = {
    'classification__C': np.logspace(-3, 3, 7),
    'classification__kernel': ['linear', 'rbf', 'sigmoid'],
    'classification__gamma': ['scale', 0.001, 0.01, 0.1, 1],
    'feature_selection__k': [3000, 4000, 5000]
}

#cross-validation strategy
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f'Best parameters: {best_params}')

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)
'''

Best parameters: {'feature_selection__k': 5000, 'rbf__C': 5, 'rbf__gamma': 1}
Accuracy: 0.7299970904858889
F1 Score: 0.7002338328283246
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      2153
           1       0.67      0.56      0.61      1284

    accuracy                           0.73      3437
   macro avg       0.71      0.69      0.70      3437
weighted avg       0.72      0.73      0.72      3437

Note this will take 600 mins
'''

Based on the investigation above, both the Linear SVM and RBF SVM models achieved similar accuracy and F1 scores, with the RBF SVM having slightly higher values. However, the RBF SVM takes significantly more time to train, requiring 30 minutes compared to just 2 minutes for the Linear SVM. Ultimately, the SVM with the RBF kernel was chosen.

## Naive Bayes

The hyperparameter optimised is alpha, also known as smoothing factor.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
import joblib

# Loading the training data
data = pd.read_csv('Data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label'])
Y = data['label']


'''
MultinomialNB has 2 main parameters (alpha and fit_prior)
alpha is the smoothing parameter, preventing the model from assigning 0 probability to unseen words
adds alpha to count when calculating conditional probability. (default =1)
fit_prior is a boolean that sets whether the model will calculate prior probabilities from the training set. (default = True)
Also important to note that SciKit uses log likelihood when calculating 
'''
#Set Up a Detailed Hyperparameter Grid for Alpha
alpha_values = np.arange(0.01, 1, 0.01)  # More detailed range of alpha values

#Perform K-Fold Cross-Validation for Each Alpha
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score, average='macro')

results = []

for alpha in alpha_values:
    model = MultinomialNB(alpha=alpha)
    cv_scores = cross_val_score(model, X, Y, cv=kf, scoring=scorer)
    results.append((alpha, np.mean(cv_scores)))
    print(f"Alpha: {alpha}, F1 Score: {np.mean(cv_scores)}")

# Convert results to a DataFrame for easier plotting
results_df = pd.DataFrame(results, columns=['Alpha', 'F1 Score'])

#Plot the Results to Find the Optimal Alpha
plt.figure(figsize=(10, 6))
plt.plot(results_df['Alpha'], results_df['F1 Score'], marker='o')
plt.xlabel('Alpha')
plt.ylabel('F1 Score (Macro)')
plt.title('Alpha vs. F1 Score (Macro)')
plt.grid(True)
plt.show()

#Find the Exact Optimal Alpha Value
optimal_alpha = results_df.loc[results_df['F1 Score'].idxmax()]['Alpha']
optimal_f1_score = results_df.loc[results_df['F1 Score'].idxmax()]['F1 Score']

print(f"Optimal alpha value: {optimal_alpha}")
print(f"F1 Score at optimal alpha: {optimal_f1_score}")


# Initialize the model
model = MultinomialNB(alpha = 0.26)

# Load the test data
test = pd.read_csv("Data/test_tfidf_features.csv")
X_test = test.drop(['id'], axis=1)

# Fit the model on the entire training data
model.fit(X, Y)

# Make predictions on the test data
y_test_pred = model.predict(X_test)

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({'id': test['id'], 'label': y_test_pred})

# Save predictions to a CSV file
#predictions_df.to_csv('Optimal_NB.csv', index=False) #uncomment to get file, otherwise file is found in /task3_predictions




Alpha: 0.01, F1 Score: 0.6706924170645149
Alpha: 0.02, F1 Score: 0.6726259323487608
Alpha: 0.03, F1 Score: 0.6727110494395248
Alpha: 0.04, F1 Score: 0.6735146478858398


KeyboardInterrupt: 

The optimal alpha value is alpha = 0.26

## Bagging

Bagging was investigated using SKlearn's random forest classifier.

The hyperparameters to optimise were 
- n_estimators: The number of trees in the forest.
- Criterion: The function to measure the quality of a split.
- max_depth: The maximum depth of the tree.  
- min_samples_split: The minimum number of samples required to split an internal node
- min_samples_leaf: The minimum number of samples required to be at a leaf node.
- max_features: The number of features to consider when looking for the best split
- max_samples: The number of samples to draw to train each tree.
- class_weight: Weights associated with classes.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# explicitly require this experimental feature
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

df = pd.read_csv('Data/train_tfidf_features.csv')
k = 5

# Separate features and labels
X = df.drop(columns=['id', 'label'])
y = df['label']

model = RandomForestClassifier(n_estimators=400, random_state=42, criterion="entropy", min_samples_split=80, max_features='sqrt', max_samples=0.6, 
                               class_weight='balanced_subsample', max_depth=150, min_samples_leaf=2)

if False:  # Change this to True to activate hyperparameter optimization
    param_grid = {
        'n_estimators': [350,360,370,380,390]
    }
    
    print("Doing hyperparameter optimization")
    halving_grid_search = HalvingGridSearchCV(
        estimator=model,
        param_grid=param_grid,
        factor=3,  # The factor by which the number of candidates is reduced at each iteration
        scoring='f1_macro',
        n_jobs=-1,
        cv=5,
        verbose=1,
        random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    halving_grid_search.fit(X_train, y_train)
    print("Best parameters found: ", halving_grid_search.best_params_)

    # Collecting results
    results = pd.DataFrame(halving_grid_search.cv_results_)

    # Best model evaluation
    best_model = halving_grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    f1 = f1_score(y_test, y_pred, zero_division=1)
    print(f"F1 Score: {f1}")

if True:
    print("Doing Cross Validation")
    scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
    print(f"Cross-Validation F1 Scores for {k} folds: {scores}")
    print("Mean Accuracy:", scores.mean())
    print("Standard Deviation:", scores.std())

if True:
    test = pd.read_csv("Data/test_tfidf_features.csv")
    X_test = test.drop(['id'], axis=1).values
    model.fit(X, y)
    y_pred = model.predict(X_test)
    predictions_df = pd.DataFrame({'id': test['id'], 'label': y_pred})

    # Display the first few rows of the predictions
    print(predictions_df.head())

    # Save predictions to a CSV file
    #predictions_df.to_csv('Optimal_bagging', index=False) #uncomment to get file, otherwise file is found in /task3_predictions


if False: #creating graphs for each hyperparameter
    min_samples_split_range = range(2, 100, 1)

    # Store the mean and standard deviation of the F1 Macro scores for each n_estimators
    mean_scores = []
    std_scores = []

    for min_samples_split in min_samples_split_range:
        print(f"Evaluating model with min_samples_split={min_samples_split}")
        model = RandomForestClassifier(n_estimators=100, random_state=42, criterion="entropy",
                                       min_samples_split=min_samples_split, max_features='sqrt', max_samples=0.7,
                                       class_weight='balanced_subsample', max_depth=80, min_samples_leaf=1)
        scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        print(f"Mean F1 Macro Score: {scores.mean()} | Std Dev: {scores.std()}")

    # Save the plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(min_samples_split_range, mean_scores, yerr=std_scores, fmt='-o')
    plt.xlabel('min_samples_split')
    plt.ylabel('Mean F1 Macro Score')
    plt.title('F1 Macro Score vs. min_samples_split')
    plt.grid(True)
    plt.savefig('min_samples_split_vs_f1_macro.png')
    plt.close()

    max_depth_range = range(1, 200, 1)

    # Store the mean and standard deviation of the F1 Macro scores for each n_estimators
    mean_scores = []
    std_scores = []

    for max_depth in max_depth_range:
        print(f"Evaluating model with max_depth={max_depth}")
        model = RandomForestClassifier(n_estimators=100, random_state=42, criterion="entropy",
                                       min_samples_split=7, max_features='sqrt', max_samples=0.7,
                                       class_weight='balanced_subsample', max_depth=max_depth, min_samples_leaf=1)
        scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        print(f"Mean F1 Macro Score: {scores.mean()} | Std Dev: {scores.std()}")

    # Save the plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(max_depth_range, mean_scores, yerr=std_scores, fmt='-o')
    plt.xlabel('max_depth')
    plt.ylabel('Mean F1 Macro Score')
    plt.title('F1 Macro Score vs. max_depth')
    plt.grid(True)
    plt.savefig('max_depth_vs_f1_macro.png')
    plt.close()

    min_samples_leaf_range = range(1, 100, 1)

    # Store the mean and standard deviation of the F1 Macro scores for each n_estimators
    mean_scores = []
    std_scores = []

    for min_samples_leaf in min_samples_leaf_range:
        print(f"Evaluating model with min_samples_leaf={min_samples_leaf}")
        model = RandomForestClassifier(n_estimators=100, random_state=42, criterion="entropy",
                                       min_samples_split=7, max_features='sqrt', max_samples=0.7,
                                       class_weight='balanced_subsample', max_depth=80, min_samples_leaf=min_samples_leaf)
        scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        print(f"Mean F1 Macro Score: {scores.mean()} | Std Dev: {scores.std()}")

    # Save the plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(min_samples_leaf_range, mean_scores, yerr=std_scores, fmt='-o')
    plt.xlabel('min_samples_leaf')
    plt.ylabel('Mean F1 Macro Score')
    plt.title('F1 Macro Score vs. min_samples_leaf')
    plt.grid(True)
    plt.savefig('min_samples_leaf_vs_f1_macro.png')
    plt.close()

    max_samples_range = np.arange(0.01, 1, 0.01)

    # Store the mean and standard deviation of the F1 Macro scores for each n_estimators
    mean_scores = []
    std_scores = []

    for max_samples in max_samples_range:
        print(f"Evaluating model with max_samples={max_samples}")
        model = RandomForestClassifier(n_estimators=100, random_state=42, criterion="entropy",
                                       min_samples_split=7, max_features='sqrt', max_samples=max_samples,
                                       class_weight='balanced_subsample', max_depth=80, min_samples_leaf=1)
        scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        print(f"Mean F1 Macro Score: {scores.mean()} | Std Dev: {scores.std()}")

    # Save the plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(max_samples_range, mean_scores, yerr=std_scores, fmt='-o')
    plt.xlabel('max_samples')
    plt.ylabel('Mean F1 Macro Score')
    plt.title('F1 Macro Score vs. max_samples')
    plt.grid(True)
    plt.savefig('max_samples_vs_f1_macro.png')
    plt.close()

    n_estimators_range = range(10, 510, 10)

    # Store the mean and standard deviation of the F1 Macro scores for each n_estimators
    mean_scores = []
    std_scores = []

    for n_estimators in n_estimators_range:
        print(f"Evaluating model with n_estimators={n_estimators}")
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, criterion="entropy",
                                       min_samples_split=7, max_features='sqrt', max_samples=0.7,
                                       class_weight='balanced_subsample', max_depth=80, min_samples_leaf=1)
        scores = cross_val_score(model, X, y, cv=k, scoring='f1_macro')
        mean_scores.append(scores.mean())
        std_scores.append(scores.std())
        print(f"Mean F1 Macro Score: {scores.mean()} | Std Dev: {scores.std()}")

    # Save the plot
    plt.figure(figsize=(10, 6))
    plt.errorbar(n_estimators_range, mean_scores, yerr=std_scores, fmt='-o')
    plt.xlabel('n_estimators')
    plt.ylabel('Mean F1 Macro Score')
    plt.title('F1 Macro Score vs. n_estimators')
    plt.grid(True)
    plt.savefig('n_estimators_vs_f1_macro.png')
    plt.close()

The optimal values for each hyperparameters are:
- n_estimators: 400
- Criterion: "entropy" 
- max_depth: 150  
- min_samples_split: 80
- min_samples_leaf: 2
- max_features: 'sqrt'
- max_samples: 0.6
- class_weight: 'balanced_subsample'

## Boosting

Boosting was investigated using Light Gradient Boosting Machine (LightGBM)

The hyperparameters to optimise were
- num_leaves: The maximum number of leaves in one tree
- n_estimators: The number of boosting iterations (trees).
- max_depth: The maximum depth of each tree.
- min_child_samples: The minimum number of data points needed in a leaf.
- colsample_bytree: The fraction of features to consider when building each tree.
- scale_pos_weight: The weight for balancing the positive and negative classes in the binary classification task.
- reg_alpha:  L1 regularization term on weights
- reg_lambda: L2 regularization term on weights

Deciding between models:
- Decision Stump
- Logistic Regression
- Linear SVM
- Gaussian Naive Bayes
- GBDT
- DART
- GOSS

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values  # Convert to NumPy array
y = data['label'].values  # Convert to NumPy array

# List of random seeds
random_seeds = [42, 52, 62]

# List of boosting algorithms and weak learners
boosting_algorithms = {
    'LightGBM GBDT': LGBMClassifier(boosting_type='gbdt'),
    'LightGBM DART': LGBMClassifier(boosting_type='dart'),
    'LightGBM GOSS': LGBMClassifier(boosting_type='goss')
}

weak_learners = {
    'Decision Stump': DecisionTreeClassifier(max_depth=1),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Gaussian Naive Bayes': GaussianNB()
}

# Initialize dictionaries to store F1 scores and times for each algorithm/learner
boosting_results = {name: {'f1_scores': [], 'times': []} for name in boosting_algorithms}
weak_learner_results = {name: {'f1_scores': [], 'times': []} for name in weak_learners}

def evaluate_boosting(name, model, seed):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Initialize the model
    model.set_params(random_state=seed)
    
    # Measure the start time
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Measure the end time
    end_time = time.time()
    
    # Calculate the time taken
    elapsed_time = end_time - start_time
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate the macro F1 score on the test set
    f1 = f1_score(y_test, y_pred, average='macro')
    
    # Return the F1 score and time taken
    return name, seed, f1, elapsed_time

def evaluate_learner(name, base_estimator, seed):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Initialize the AdaBoost classifier
    adaboost = AdaBoostClassifier(estimator=base_estimator, n_estimators=10, learning_rate=1.0, random_state=seed)
    
    # Measure the start time
    start_time = time.time()
    
    # Train the classifier
    adaboost.fit(X_train, y_train)
    
    # Measure the end time
    end_time = time.time()
    
    # Calculate the time taken
    elapsed_time = end_time - start_time
    
    # Predict on the test set
    y_pred = adaboost.predict(X_test)
    
    # Calculate the macro F1 score on the test set
    f1 = f1_score(y_test, y_pred, average='macro')
    
    # Return the F1 score and time taken
    return name, seed, f1, elapsed_time

# Use Parallel and delayed to parallelize the evaluation
results_boosting = Parallel(n_jobs=-1)(delayed(evaluate_boosting)(name, model, seed)
                                       for name, model in boosting_algorithms.items()
                                       for seed in random_seeds)

results_weak_learners = Parallel(n_jobs=-1)(delayed(evaluate_learner)(name, base_estimator, seed)
                                            for name, base_estimator in weak_learners.items()
                                            for seed in random_seeds)

# Process results for boosting algorithms
for name, seed, f1, elapsed_time in results_boosting:
    boosting_results[name]['f1_scores'].append(f1)
    boosting_results[name]['times'].append(elapsed_time)

# Process results for weak learners
for name, seed, f1, elapsed_time in results_weak_learners:
    weak_learner_results[name]['f1_scores'].append(f1)
    weak_learner_results[name]['times'].append(elapsed_time)

# Calculate the mean and standard deviation of F1 scores and times for each algorithm/learner
boosting_means_f1 = {name: np.mean(boosting_results[name]['f1_scores']) for name in boosting_algorithms}
boosting_stds_f1 = {name: np.std(boosting_results[name]['f1_scores']) for name in boosting_algorithms}
boosting_means_time = {name: np.mean(boosting_results[name]['times']) for name in boosting_algorithms}
boosting_stds_time = {name: np.std(boosting_results[name]['times']) for name in boosting_algorithms}

learner_means_f1 = {name: np.mean(weak_learner_results[name]['f1_scores']) for name in weak_learners}
learner_stds_f1 = {name: np.std(weak_learner_results[name]['f1_scores']) for name in weak_learners}
learner_means_time = {name: np.mean(weak_learner_results[name]['times']) for name in weak_learners}
learner_stds_time = {name: np.std(weak_learner_results[name]['times']) for name in weak_learners}

# Combine the results for plotting
all_names = list(boosting_means_f1.keys()) + list(learner_means_f1.keys())
all_means_f1 = list(boosting_means_f1.values()) + list(learner_means_f1.values())
all_stds_f1 = list(boosting_stds_f1.values()) + list(learner_stds_f1.values())
all_means_time = list(boosting_means_time.values()) + list(learner_means_time.values())
all_stds_time = list(boosting_stds_time.values()) + list(learner_stds_time.values())

# Create a dual dot plot
fig, ax1 = plt.subplots(figsize=(16, 8))

# Plot F1 scores
color = 'tab:blue'
ax1.set_xlabel('Algorithm / Weak Learner')
ax1.set_ylabel('Mean Macro F1 Score', color=color)
ax1.errorbar(all_names, all_means_f1, yerr=all_stds_f1, fmt='o', capsize=5, color=color, label='Mean F1 Score')
ax1.tick_params(axis='y', labelcolor=color)

# Create a second y-axis to plot times
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Mean Time Taken (seconds)', color=color)
ax2.errorbar(all_names, all_means_time, yerr=all_stds_time, fmt='o', capsize=5, color=color, label='Mean Time Taken')
ax2.tick_params(axis='y', labelcolor=color)

# Add a title and grid
plt.title('Mean Macro F1 Score and Mean Time Taken vs. Algorithm with Error Bars')
ax1.grid(True)
fig.tight_layout()
plt.xticks(rotation=45)
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.show()

# Print the mean and standard deviation of F1 scores and times for each algorithm/learner
for name in all_names:
    if name in boosting_means_f1:
        print(f"Boosting Algorithm: {name}, Mean F1 Score: {boosting_means_f1[name]:.2f}, Std: {boosting_stds_f1[name]:.2f}")
        print(f"Boosting Algorithm: {name}, Mean Time Taken: {boosting_means_time[name]:.2f} seconds, Std: {boosting_stds_time[name]:.2f}")
    else:
        print(f"Weak Learner: {name}, Mean F1 Score: {learner_means_f1[name]:.2f}, Std: {learner_stds_f1[name]:.2f}")
        print(f"Weak Learner: {name}, Mean Time Taken: {learner_means_time[name]:.2f} seconds, Std: {learner_stds_time[name]:.2f}")

GBDT was selected as the model to undergo further optimisation of hyperparameters.

#### Optimisation 1

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# List of hyperparameters and their ranges
hyperparameters = {
    'num_leaves': [15, 31, 63, 127, 255],
    'n_estimators': [50, 100, 200, 400, 800],
    'max_depth': [-1, 3, 5, 7, 9, 12],
    'min_child_samples': [10, 20, 50, 100, 150],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 5, 10, 20],
    'reg_alpha': [0, 0.1, 0.5, 1, 2],
    'reg_lambda': [0, 0.1, 0.5, 1, 2]
}

# Initialize a dictionary to store results
results = {param: {'train_f1_scores': [], 'test_f1_scores': []} for param in hyperparameters}

def evaluate_hyperparameter(param_name, param_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for value in param_values:
        print(f"Evaluating {param_name} with value {value}...")
        train_f1_scores = []
        test_f1_scores = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            
            params = {
                'boosting_type': 'gbdt',
                'num_leaves': 31,  # Default
                'n_estimators': 100,  # Default
                'max_depth': -1,  # Default
                'min_child_samples': 20,  # Default
                'subsample': 1.0,  # Default
                'colsample_bytree': 1.0,  # Default
                'scale_pos_weight': 1,  # Default
                'reg_alpha': 0,  # Default
                'reg_lambda': 0,  # Default
                'random_state': 123
            }
            params[param_name] = value
            
            model = LGBMClassifier(**params)
            
            start_time = time.time()
            try:
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                y_test_pred = model.predict(X_test)
                
                train_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))
                test_f1_scores.append(f1_score(y_test, y_test_pred, average='macro'))
            except Exception as e:
                print(f"Error while training or predicting with {param_name}={value}: {e}")

        if train_f1_scores and test_f1_scores:
            results[param_name]['train_f1_scores'].append(np.mean(train_f1_scores))
            results[param_name]['test_f1_scores'].append(np.mean(test_f1_scores))
        else:
            print(f"No scores collected for {param_name} with value {value}")

# Evaluate each hyperparameter
for param_name, param_values in hyperparameters.items():
    evaluate_hyperparameter(param_name, param_values)

# Plotting results for each hyperparameter
for param_name in hyperparameters:
    if results[param_name]['train_f1_scores']:
        plt.figure(figsize=(12, 6))
        plt.plot(hyperparameters[param_name], results[param_name]['train_f1_scores'], marker='o', label='Train F1 Score')
        plt.plot(hyperparameters[param_name], results[param_name]['test_f1_scores'], marker='o', label='Test F1 Score')
        plt.xlabel(param_name)
        plt.ylabel('F1 Score')
        plt.title(f'Macro F1 Score vs. {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print(f"No results for hyperparameter {param_name}")

# Print results
for param_name in hyperparameters:
    print(f"Hyperparameter: {param_name}")
    print(f"Values: {hyperparameters[param_name]}")
    print(f"Train F1 Scores: {results[param_name]['train_f1_scores']}")
    print(f"Test F1 Scores: {results[param_name]['test_f1_scores']}")
    print()

#### Optimisation 2

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# List of hyperparameters and their ranges
hyperparameters = {
    'num_leaves': [31, 40, 50, 63, 70, 80, 90, 100],
    'n_estimators': [75, 100, 125, 150, 175, 200],
    'max_depth': [-1, 10, 20, 30, 40, 50, 60, 70],
    'min_child_samples': [1, 4, 8, 12, 16, 20],
    'scale_pos_weight': [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5],
    'reg_alpha': [0, 0.4, 0.8, 1.2, 1.6, 2],
    'reg_lambda': [0, 0.4, 0.8, 1.2, 1.6, 2]
}

# Initialize a dictionary to store results
results = {param: {'train_f1_scores': [], 'test_f1_scores': []} for param in hyperparameters}

def evaluate_hyperparameter(param_name, param_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for value in param_values:
        print(f"Evaluating {param_name} with value {value}...")
        train_f1_scores = []
        test_f1_scores = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            
            params = {
                'boosting_type': 'gbdt',
                'num_leaves': 63,  # Default
                'n_estimators': 100,  # Default
                'max_depth': -1,  # Default
                'min_child_samples': 10,  # Default
                'subsample': 1.0,  # Default
                'colsample_bytree': 1.0,  # Default
                'scale_pos_weight': 1,  # Default
                'reg_alpha': 0,  # Default
                'reg_lambda': 0,  # Default
                'random_state': 123
            }
            params[param_name] = value
            
            model = LGBMClassifier(**params)
            
            start_time = time.time()
            try:
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                y_test_pred = model.predict(X_test)
                
                train_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))
                test_f1_scores.append(f1_score(y_test, y_test_pred, average='macro'))
            except Exception as e:
                print(f"Error while training or predicting with {param_name}={value}: {e}")

        if train_f1_scores and test_f1_scores:
            results[param_name]['train_f1_scores'].append(np.mean(train_f1_scores))
            results[param_name]['test_f1_scores'].append(np.mean(test_f1_scores))
        else:
            print(f"No scores collected for {param_name} with value {value}")

# Evaluate each hyperparameter
for param_name, param_values in hyperparameters.items():
    evaluate_hyperparameter(param_name, param_values)

# Plotting results for each hyperparameter
for param_name in hyperparameters:
    if results[param_name]['train_f1_scores']:
        plt.figure(figsize=(12, 6))
        plt.plot(hyperparameters[param_name], results[param_name]['train_f1_scores'], marker='o', label='Train F1 Score')
        plt.plot(hyperparameters[param_name], results[param_name]['test_f1_scores'], marker='o', label='Test F1 Score')
        plt.xlabel(param_name)
        plt.ylabel('F1 Score')
        plt.title(f'Macro F1 Score vs. {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print(f"No results for hyperparameter {param_name}")

# Print results
for param_name in hyperparameters:
    print(f"Hyperparameter: {param_name}")
    print(f"Values: {hyperparameters[param_name]}")
    print(f"Train F1 Scores: {results[param_name]['train_f1_scores']}")
    print(f"Test F1 Scores: {results[param_name]['test_f1_scores']}")
    print()

#### Optimisation 3

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# List of hyperparameters and their ranges
hyperparameters = {
    'num_leaves': [31, 40, 50, 63, 70, 80, 90, 100, 110, 120, 130],
    'n_estimators': [75, 85, 95, 105, 115, 125],
    'max_depth': [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_child_samples': [1, 2, 4, 6, 8, 10],
    'reg_lambda': [1, 1.2, 1.4, 1.6, 1.8, 2.0]
}

# Initialize a dictionary to store results
results = {param: {'train_f1_scores': [], 'test_f1_scores': []} for param in hyperparameters}

def evaluate_hyperparameter(param_name, param_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for value in param_values:
        print(f"Evaluating {param_name} with value {value}...")
        train_f1_scores = []
        test_f1_scores = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            
            params = {
                'boosting_type': 'gbdt',
                'num_leaves': 63,  # Default
                'n_estimators': 100,  # Default
                'max_depth': -1,  # Default
                'min_child_samples': 10,  # Default
                'subsample': 1.0,  # Default
                'colsample_bytree': 1.0,  # Default
                'scale_pos_weight': 1.5,  # DONE
                'reg_alpha': 1.2,  # DONE
                'reg_lambda': 0,  # Default
                'random_state': 123
            }
            params[param_name] = value
            
            model = LGBMClassifier(**params)
            
            start_time = time.time()
            try:
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                y_test_pred = model.predict(X_test)
                
                train_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))
                test_f1_scores.append(f1_score(y_test, y_test_pred, average='macro'))
            except Exception as e:
                print(f"Error while training or predicting with {param_name}={value}: {e}")

        if train_f1_scores and test_f1_scores:
            results[param_name]['train_f1_scores'].append(np.mean(train_f1_scores))
            results[param_name]['test_f1_scores'].append(np.mean(test_f1_scores))
        else:
            print(f"No scores collected for {param_name} with value {value}")

# Evaluate each hyperparameter
for param_name, param_values in hyperparameters.items():
    evaluate_hyperparameter(param_name, param_values)

# Plotting results for each hyperparameter
for param_name in hyperparameters:
    if results[param_name]['train_f1_scores']:
        plt.figure(figsize=(12, 6))
        plt.plot(hyperparameters[param_name], results[param_name]['train_f1_scores'], marker='o', label='Train F1 Score')
        plt.plot(hyperparameters[param_name], results[param_name]['test_f1_scores'], marker='o', label='Test F1 Score')
        plt.xlabel(param_name)
        plt.ylabel('F1 Score')
        plt.title(f'Macro F1 Score vs. {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print(f"No results for hyperparameter {param_name}")

# Print results
for param_name in hyperparameters:
    print(f"Hyperparameter: {param_name}")
    print(f"Values: {hyperparameters[param_name]}")
    print(f"Train F1 Scores: {results[param_name]['train_f1_scores']}")
    print(f"Test F1 Scores: {results[param_name]['test_f1_scores']}")
    print()

#### Optimisation 4

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# List of hyperparameters and their ranges
hyperparameters = {
    'min_child_samples': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'colsample_bytree': [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
}

# Initialize a dictionary to store results
results = {param: {'train_f1_scores': [], 'test_f1_scores': []} for param in hyperparameters}

def evaluate_hyperparameter(param_name, param_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for value in param_values:
        print(f"Evaluating {param_name} with value {value}...")
        train_f1_scores = []
        test_f1_scores = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            
            params = {
                'boosting_type': 'gbdt',
                'num_leaves': 70,
                'n_estimators': 85,
                'max_depth': 30,
                'min_child_samples': 2,
                'subsample': 1.0,  # Default
                'colsample_bytree': 1.0,  # Default
                'scale_pos_weight': 1.5,
                'reg_alpha': 1.2,
                'reg_lambda': 1,
                'random_state': 123
            }
            params[param_name] = value
            
            model = LGBMClassifier(**params)
            
            start_time = time.time()
            try:
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                y_test_pred = model.predict(X_test)
                
                train_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))
                test_f1_scores.append(f1_score(y_test, y_test_pred, average='macro'))
                
                # Check the impact on training
                print(f"Trained with {param_name}={value} in {time.time() - start_time:.2f} seconds")
            except Exception as e:
                print(f"Error while training or predicting with {param_name}={value}: {e}")

        if train_f1_scores and test_f1_scores:
            results[param_name]['train_f1_scores'].append(np.mean(train_f1_scores))
            results[param_name]['test_f1_scores'].append(np.mean(test_f1_scores))
        else:
            print(f"No scores collected for {param_name} with value {value}")

# Evaluate each hyperparameter
for param_name, param_values in hyperparameters.items():
    evaluate_hyperparameter(param_name, param_values)

# Plotting results for each hyperparameter
for param_name in hyperparameters:
    if results[param_name]['train_f1_scores']:
        plt.figure(figsize=(12, 6))
        plt.plot(hyperparameters[param_name], results[param_name]['train_f1_scores'], marker='o', label='Train F1 Score')
        plt.plot(hyperparameters[param_name], results[param_name]['test_f1_scores'], marker='o', label='Test F1 Score')
        plt.xlabel(param_name)
        plt.ylabel('F1 Score')
        plt.title(f'Macro F1 Score vs. {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print(f"No results for hyperparameter {param_name}")

# Print results
for param_name in hyperparameters:
    print(f"Hyperparameter: {param_name}")
    print(f"Values: {hyperparameters[param_name]}")
    print(f"Train F1 Scores: {results[param_name]['train_f1_scores']}")
    print(f"Test F1 Scores: {results[param_name]['test_f1_scores']}")
    print()

#### Optimisation 5

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values
y = data['label'].values

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# List of hyperparameters and their ranges
hyperparameters = {
    'colsample_bytree': [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
}

# Initialize a dictionary to store results
results = {param: {'train_f1_scores': [], 'test_f1_scores': []} for param in hyperparameters}

def evaluate_hyperparameter(param_name, param_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    
    for value in param_values:
        print(f"Evaluating {param_name} with value {value}...")
        train_f1_scores = []
        test_f1_scores = []

        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
            
            params = {
                'boosting_type': 'gbdt',
                'num_leaves': 70,
                'n_estimators': 85,
                'max_depth': 30,
                'min_child_samples': 4,
                'subsample': 1.0,  # Default
                'colsample_bytree': 1.0,  # Default
                'scale_pos_weight': 1.5,
                'reg_alpha': 1.2,
                'reg_lambda': 1,
                'random_state': 123
            }
            params[param_name] = value
            
            model = LGBMClassifier(**params)
            
            start_time = time.time()
            try:
                model.fit(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                y_test_pred = model.predict(X_test)
                
                train_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))
                test_f1_scores.append(f1_score(y_test, y_test_pred, average='macro'))
                
                # Check the impact on training
                print(f"Trained with {param_name}={value} in {time.time() - start_time:.2f} seconds")
            except Exception as e:
                print(f"Error while training or predicting with {param_name}={value}: {e}")

        if train_f1_scores and test_f1_scores:
            results[param_name]['train_f1_scores'].append(np.mean(train_f1_scores))
            results[param_name]['test_f1_scores'].append(np.mean(test_f1_scores))
        else:
            print(f"No scores collected for {param_name} with value {value}")

# Evaluate each hyperparameter
for param_name, param_values in hyperparameters.items():
    evaluate_hyperparameter(param_name, param_values)

# Plotting results for each hyperparameter
for param_name in hyperparameters:
    if results[param_name]['train_f1_scores']:
        plt.figure(figsize=(12, 6))
        plt.plot(hyperparameters[param_name], results[param_name]['train_f1_scores'], marker='o', label='Train F1 Score')
        plt.plot(hyperparameters[param_name], results[param_name]['test_f1_scores'], marker='o', label='Test F1 Score')
        plt.xlabel(param_name)
        plt.ylabel('F1 Score')
        plt.title(f'Macro F1 Score vs. {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print(f"No results for hyperparameter {param_name}")

# Print results
for param_name in hyperparameters:
    print(f"Hyperparameter: {param_name}")
    print(f"Values: {hyperparameters[param_name]}")
    print(f"Train F1 Scores: {results[param_name]['train_f1_scores']}")
    print(f"Test F1 Scores: {results[param_name]['test_f1_scores']}")
    print()

### Optimised model

In [None]:
import numpy as np
import pandas as pd
import time
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

# Load your training dataset
train_data = pd.read_csv('Data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X_train = train_data.drop(['label', 'id'], axis=1).values
y_train = train_data['label'].values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

# Load your test dataset
test_data = pd.read_csv('Data/test_tfidf_features.csv')

# Exclude the first column
X_test = test_data.drop(['id'], axis=1).values

print(f"X_test shape: {X_test.shape}")

# Define model parameters
params = {
    'boosting_type': 'gbdt',
    'num_leaves': 70,
    'n_estimators': 85,
    'max_depth': 30,
    'min_child_samples': 4,
    'subsample': 1.0,  # Default
    'colsample_bytree': 0.55,
    'scale_pos_weight': 1.5,
    'reg_alpha': 1.2,
    'reg_lambda': 1,
    'random_state': 123
}

# Train the model on the entire training dataset
model = LGBMClassifier(**params)
start_time = time.time()
model.fit(X_train, y_train)
print(f"Model trained in {time.time() - start_time:.2f} seconds")

# Make predictions on the test dataset
y_test_pred = model.predict(X_test)

# Output the predictions
output = pd.DataFrame({'id': test_data['id'], 'label': y_test_pred})
#output.to_csv('Optimal_boosting.csv', index=False) #uncomment to get file, otherwise file is found in /task3_predictions


print("Predictions saved to Optimal_boosting.csv")

The optimal hyperparameters are:
- num_leaves: 70
- n_estimators: 85
- max_depth:30
- min_child_samples: 4
- colsample_bytree: 0.55
- scale_pos_weight: 1.5
- reg_alpha: 1
- reg_lambda: 123

## Final Model
Avengers Ensemble Method

In [None]:
import pandas as pd
import numpy as np

test = pd.read_csv("Data/test_tfidf_features.csv")

# Read the first set of predictions
y_pred1 = pd.read_csv("Task3_predictions/Optimal_LogReg.csv")
y_pred1 = y_pred1['label'].values

# Read the second set of predictions
y_pred2 = pd.read_csv("Task3_predictions/Optimal_gbdt.csv")
y_pred2 = y_pred2['label'].values

# Read the third set of predictions
y_pred3 = pd.read_csv("Task3_predictions/Optimal_forest.csv")
y_pred3 = y_pred3['label'].values

# Read the fourth set of predictions
y_pred4 = pd.read_csv("Task3_predictions/Optimal_NB.csv")
y_pred4 = y_pred4['label'].values

# Read the fifth set of predictions
y_pred5 = pd.read_csv("Task3_predictions/Optimal_svm.csv")
y_pred5 = y_pred5['label'].values

# Combine predictions into a DataFrame for easier manipulation
predictions_df = pd.DataFrame({
    'id': test['id'],
    'pred1': y_pred1,
    'pred2': y_pred2,
    'pred3': y_pred3,
    'pred4': y_pred4,
    'pred5': y_pred5
})

# Perform majority voting
predictions_df['label'] = (predictions_df[['pred1', 'pred2', 'pred3', 'pred4', 'pred5']].sum(axis=1) > 1).astype(int)

#Display the first few rows of the final predictions
print(predictions_df.head())

# Save the final predictions to a CSV file
final_df = predictions_df[['id', 'label']]
#final_df.to_csv('final_predictions.csv', index=False) #uncomment to get file, otherwise file is found in root folder