In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
import random


import warnings
warnings.filterwarnings('ignore')


data = pd.read_pickle("ass2.pickle")
train = data["train"].to_numpy()

X_train = train[:, :-1]
y_train = train[:, -1]

test = data["test"].to_numpy()
X_test = test[:, :-1]
y_test = test[:, -1]

dev = data["dev"].to_numpy()
X_dev = dev[:, :-1]
y_dev = dev[:, -1]



Preliminary data analysis:

In [23]:
print(f"number of attributes: {X_train.shape[1] - 1}\n")
print(f"number of samples:\n\ttrain: {X_train.shape[0]}, test: {X_test.shape[0]}, dev: {X_dev.shape[0]}\n")

train_num_of_nulls = np.sum(np.isnan(train))
print(f"number of nulls in train: {train_num_of_nulls}")

test_num_of_nulls = np.sum(np.isnan(test))
print(f"number of nulls in test: {test_num_of_nulls}")

dev_num_of_nulls = np.sum(np.isnan(dev))
print(f"number of nulls in dev: {dev_num_of_nulls}")

# Scale the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_dev_scaled = scaler.fit_transform(X_dev)

# Merge the train and dev datasets
X_train_and_dev_scaled = np.vstack((X_train_scaled, X_dev_scaled))
y_train_and_dev = np.hstack((y_train, y_dev))

percentage_of_each_category = {category: count/len(y_train) for category, count in Counter(y_train).items()}
print("\npercentage of each category in the training data (check for balance):\n")
print(percentage_of_each_category)


number of attributes: 41

number of samples:
	train: 40533, test: 13512, dev: 13512

number of nulls in train: 0
number of nulls in test: 0
number of nulls in dev: 0

percentage of each category in the training data (check for balance):

{2: 0.6595613450768509, 1: 0.2438013470505514, 0: 0.09663730787259764}


-------------------------------------- flow --------------------------------------

1. Running the models on the original training data (unbalanced) with default hyperparameters
2. Running the models on the original training data (unbalanced) with optimized hyperparameters
3. Running the models on the balanced training data with optimized hyperparameters

Running the models on the original training data (unbalanced) with default hyperparameters:

In [24]:
models = {
            'KNN': KNeighborsClassifier,
            'Decision Tree': DecisionTreeClassifier,
            'Random Forest': RandomForestClassifier,
            'Logistic Regression': LogisticRegression, 
            'Adaptive Boosting': AdaBoostClassifier,
            'Gradient Boosting': GradientBoostingClassifier,
            'Gaussian Naiive Bayes': GaussianNB,            
        }

hyperparameters = {
            'KNN': {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['ball_tree', 'kd_tree']
                    },
            'Decision Tree': {
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10],
                                'criterion': ['gini', 'entropy']
                            },
            'Random Forest': {
                                'n_estimators': [100, 200, 300],
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10]
                            },
            'Logistic Regression': {
                                'C': [0.1, 1.0, 10.0],
                                'penalty': ['l1', 'l2'],
                                'solver': ['liblinear', 'saga']
                                    }, 
            'Adaptive Boosting': {
                                'n_estimators': [50, 100, 150],
                                'learning_rate': [0.1, 0.01, 0.001],
                                'base_estimator__max_depth': [1, 3, 5]
                                },
            'Gradient Boosting': {
                                'n_estimators': [50, 100, 150],
                                'learning_rate': [0.1, 0.01, 0.001],
                                'max_depth': [3, 5, 7]
                                },
            'Gaussian Naiive Bayes': {'var_smoothing': [1e-9, 1e-8, 1e-7]},            
        }

best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data (unbalanced) with default hyperparameters:\n\t{best_model_name}")


KNN:
              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.63      0.52      0.57      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.56     13512
weighted avg       0.71      0.73      0.72     13512

Decision Tree:
              precision    recall  f1-score   support

           0       0.23      0.24      0.23      1262
           1       0.61      0.60      0.61      3380
           2       0.82      0.82      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.55      0.55      0.55     13512
weighted avg       0.72      0.71      0.71     13512

Random Forest:
              precision    recall  f1-score   support

           0       0.55      0.07      0.12      1262
           1       0.79      0.65      0.71      3380
           2       0.81      0.96      0

Running the models on the original training data (unbalanced) with optimized hyperparameters:

* When using 'r2' as the scoring metric in GridSearchCV, it means that the grid search will evaluate different parameter combinations based on how well they maximize the R-squared value. The goal is to find the parameter combination that yields the highest R-squared score, indicating the best fit of the model to the data.*

In [25]:
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='r2', n_jobs=-1)
    # grid_search = RandomizedSearchCV(model, model_hyperparameters, n_iter=10, cv=5, scoring='r2', n_jobs=-1)


    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)

    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print("best combination chosen: ", best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data (unbalanced) with optimized hyperparameters:\n\t{best_model_name}")


KNN:
{'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree']}
best combination chosen:  {'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.34      0.16      0.22      1262
           1       0.67      0.53      0.59      3380
           2       0.79      0.92      0.85      8870

    accuracy                           0.75     13512
   macro avg       0.60      0.54      0.56     13512
weighted avg       0.72      0.75      0.73     13512

Decision Tree:
{'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10], 'criterion': ['gini', 'entropy']}


KeyboardInterrupt: 

Creating a balanced dataset

In [None]:
# Create the balanced dataset
oversampler = RandomOverSampler()
X_train_and_dev_balanced, y_train_and_dev_balanced = oversampler.fit_resample(X_train_and_dev_scaled, y_train_and_dev)

balanced_percentage_of_each_category = {category: count/len(y_train_and_dev_balanced) for category, count in Counter(y_train_and_dev_balanced).items()}
print("\npercentage of each category in the balanced training data (check for balance):\n")
print(balanced_percentage_of_each_category)

Running the models on a balanced subset of the training data with optimized hyperparameters:

In [None]:
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='r2', n_jobs=-1)
    # grid_search = RandomizedSearchCV(model, model_hyperparameters, n_iter=10, cv=5, scoring='r2', n_jobs=-1)


    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print(best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_balanced, y_train_and_dev_balanced)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to a balanced subset of the original data with optimized hyperparameters:\n\t{best_model_name}")
