In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
import random
import itertools


import warnings
warnings.filterwarnings('ignore')


data = pd.read_pickle("ass2.pickle")
train = data["train"].to_numpy()

X_train = train[:, :-1]
y_train = train[:, -1]

test = data["test"].to_numpy()
X_test = test[:, :-1]
y_test = test[:, -1]

dev = data["dev"].to_numpy()
X_dev = dev[:, :-1]
y_dev = dev[:, -1]



Preliminary data analysis:

In [107]:
print(f"number of attributes: {X_train.shape[1] - 1}\n")
print(f"number of samples:\n\ttrain: {X_train.shape[0]}, test: {X_test.shape[0]}, dev: {X_dev.shape[0]}\n")

train_num_of_nulls = np.sum(np.isnan(train))
print(f"number of nulls in train: {train_num_of_nulls}")

test_num_of_nulls = np.sum(np.isnan(test))
print(f"number of nulls in test: {test_num_of_nulls}")

dev_num_of_nulls = np.sum(np.isnan(dev))
print(f"number of nulls in dev: {dev_num_of_nulls}")

# Scale the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_dev_scaled = scaler.fit_transform(X_dev)

# Merge the train and dev datasets
X_train_and_dev_scaled = np.vstack((X_train_scaled, X_dev_scaled))
y_train_and_dev = np.hstack((y_train, y_dev))

percentage_of_each_category = {category: count/len(y_train) for category, count in Counter(y_train).items()}
print("\npercentage of each category in the training data (check for balance):\n")
print(percentage_of_each_category)

dev_percentage_of_each_category = {category: count/len(y_dev) for category, count in Counter(y_dev).items()}
print("\npercentage of each category in the dev data (check for balance):\n")
print(dev_percentage_of_each_category)

test_percentage_of_each_category = {category: count/len(y_test) for category, count in Counter(y_test).items()}
print("\npercentage of each category in the dev data (check for balance):\n")
print(test_percentage_of_each_category)

categories = list(percentage_of_each_category.keys())
print(f"\ncategories: {categories}")


number of attributes: 41

number of samples:
	train: 40533, test: 13512, dev: 13512

number of nulls in train: 0
number of nulls in test: 0
number of nulls in dev: 0

percentage of each category in the training data (check for balance):

{2: 0.6595613450768509, 1: 0.2438013470505514, 0: 0.09663730787259764}

percentage of each category in the dev data (check for balance):

{2: 0.6563795145056246, 1: 0.2496299585553582, 0: 0.09399052693901717}

percentage of each category in the dev data (check for balance):

{2: 0.6564535227945529, 0: 0.09339846062759029, 1: 0.25014801657785674}

categories: [2, 1, 0]


-------------------------------------- flow --------------------------------------

1. Running the models on the original training+dev data (unbalanced) with default hyperparameters
2. Running the models on the original training+dev data (unbalanced) with optimized hyperparameters
3. Running the models on the original training data, testing their roc auc scores on the dev data to determine every model's amount of say in the ensambles
4. Testing all combinations of models in search for the best ensemble

Running the models on the original training data (unbalanced) with default hyperparameters:

In [108]:
models = {
            'KNN': KNeighborsClassifier,
            'Decision Tree': DecisionTreeClassifier,
            'Random Forest': RandomForestClassifier,
            'Logistic Regression': LogisticRegression, 
            'Gradient Boosting': GradientBoostingClassifier,
            'Gaussian Naiive Bayes': GaussianNB,            
        }

best_model = None
best_model_name = ""
best_roc_auc = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    # fit the model on the train_and_dev data
    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
        
    # performance report
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
        
    # roc auc score
    y_pred = model.predict_proba(X_test_scaled)
    model_roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovo', )
    print(f"roc auc = {model_roc_auc}\n")
    
    if (model_roc_auc > best_roc_auc):
        best_roc_auc = model_roc_auc
        best_model = model
        best_model_name = name
                
print(f"\nBest model when fit to the original data (unbalanced) with default hyperparameters:\n\t{best_model_name}")
print(f"roc auc: {best_roc_auc}")


KNN:
              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.63      0.52      0.57      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.56     13512
weighted avg       0.71      0.73      0.72     13512

roc auc = 0.7461567089445434

Decision Tree:
              precision    recall  f1-score   support

           0       0.22      0.24      0.23      1262
           1       0.61      0.59      0.60      3380
           2       0.82      0.82      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.55      0.55      0.55     13512
weighted avg       0.71      0.71      0.71     13512

roc auc = 0.6639066160300829

Random Forest:
              precision    recall  f1-score   support

           0       0.52      0.06      0.11      1262
           1       0.78      0.65 

Running the models on the original training data (unbalanced) with optimized hyperparameters:

When using 'roc_auc' as the scoring metric in GridSearchCV, it means that the grid search will evaluate different parameter combinations based on how well they maximize roc auc value. The goal is to find the parameter combination that yields the highest roc auc score, indicating the best fit of the model to the data.

We give weights to each class based on its percentage of the training set (the more it appears in the training data, the lower is it's weight). This helps offset the inbalance in the data a bit without undersampling (to not lose precious data).

In [None]:
hyperparameters = {
            'KNN': {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['ball_tree', 'kd_tree']
                    },
            'Decision Tree': {
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10],
                                'criterion': ['gini', 'entropy']
                            },
            'Random Forest': {
                                'n_estimators': [100, 200, 300],
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10]
                            },
            'Logistic Regression': {
                                'C': [0.1, 1.0, 10.0],
                                'penalty': ['l1', 'l2'],
                                'solver': ['liblinear', 'saga']
                                    }, 
            'Gradient Boosting': {
                                'n_estimators': [50, 100, 150],
                                'learning_rate': [0.1, 0.01, 0.001],
                                'max_depth': [3, 5, 7]
                                },
            'Gaussian Naiive Bayes': {'var_smoothing': [1e-9, 1e-8, 1e-7]},            
        }

class_weights = {
    0: 1 / percentage_of_each_category[0], 
    1: 1 / percentage_of_each_category[1], 
    2: 1 / percentage_of_each_category[2] 
} # inversely proportional to the category's percentage in the dataset

# give every sample it's weight
sample_weights = np.array([class_weights[label] for label in y_train_and_dev])

models_iteration_2 = {}
best_model = None
best_model_name = ""
best_roc_auc = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='roc_auc', n_jobs=-1)

    # Fit the GridSearchCV object to your data
    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)
    else:
        grid_search.fit(X_train_and_dev_scaled, y_train_and_dev, sample_weight=sample_weights)
   
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print("best combination chosen: ", best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    else:
        clf = model.fit(X_train_and_dev_scaled, y_train_and_dev, sample_weight=sample_weights)
    y_pred = clf.predict(X_test_scaled)
    
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)

    y_pred = model.predict_proba(X_test_scaled)
    model_roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
    print(f"roc auc = {model_roc_auc}\n")
    
    if (model_roc_auc > best_roc_auc):
        best_roc_auc = model_roc_auc
        best_model = model
        best_model_name = name
        
    models_iteration_2[name] = model
                
print(f"\nBest model when fit to the original data (unbalanced) with optimized hyperparameters:\n\t{best_model_name}")
print(f"roc auc: {best_roc_auc}")


KNN:
{'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree']}
best combination chosen:  {'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'uniform'}
              precision    recall  f1-score   support

           0       0.26      0.31      0.28      1262
           1       0.63      0.47      0.54      3380
           2       0.80      0.85      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.56      0.55      0.55     13512
weighted avg       0.71      0.71      0.70     13512

roc auc = 0.7151836820976949

Decision Tree:
{'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10], 'criterion': ['gini', 'entropy']}
best combination chosen:  {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.24      0.26      0.25      1262
           1       0.62      0.63      0.62      3380
           2       0.83 

Fit the models on the training set with default hyperparameters and test roc auc on the dev set.
We save the roc auc of each model on the dev set. We later use this accuracy to set the "amount of say" of each model in its ensemble

In [None]:
models_iteration_3 = {}
training_roc_auc_iteration_3 = {}
best_model = None
best_model_name = ""
best_roc_auc = 0

sample_weights = np.array([class_weights[label] for label in y_train])

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='roc_auc', n_jobs=-1)

    # Fit the GridSearchCV object to your data
    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        grid_search.fit(X_train_scaled, y_train)
    else:
        grid_search.fit(X_train_scaled, y_train, sample_weight=sample_weights)
        
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print(best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)
    
    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        clf = model.fit(X_train_scaled, y_train)
    else:
        clf = model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
    y_pred = clf.predict(X_dev_scaled)
        
    str_report = classification_report(y_dev, y_pred, output_dict=False)
    print(str_report)
    
    y_pred = model.predict_proba(X_dev_scaled)
    model_roc_auc = roc_auc_score(y_dev, y_pred, multi_class='ovo')
    print(f"roc auc = {model_roc_auc}\n")
    
    if (model_roc_auc > best_roc_auc):
        best_roc_auc = model_roc_auc
        best_model = model
        best_model_name = name
                
    models_iteration_3[name] = clf
    training_roc_auc_iteration_3[name] = model_roc_auc
        
print(f"\nBest model when fit to the original data (unbalanced) with default hyperparameters:\n\t{best_model_name}")
print(f"roc auc: {best_roc_auc}")


Creating all possible combinations of models:

In [None]:
ensembles = []

for r in range(2, len(models_iteration_3) + 1):
    subsets = itertools.combinations(models_iteration_3.items(), r)
    # subsets is a list of tuples (name, model)
    for subset in subsets:
        ensemble = {}
        for name, model in subset:
            ensemble[name] = model
        ensembles.append(ensemble)
                
print(f"num of subsets = {len(ensembles)}")


Testing all possible combinations of models as ensembles:

In [None]:
best_ensamble = None
best_ensamble_name = ""
best_accuracy = 0

for ensemble in ensembles:
    ensemble_name = ", ".join([name for name in ensemble.keys()]) # join all the model names    
    print(f"{ensemble_name}:") 
    
    preds = np.zeros((len(y_test), len(categories)))
    # sum of the roc_aucs of this ensamble's models
    sum_of_roc_aucs = np.sum([training_roc_auc_iteration_3[name] for name in ensemble.keys()])
    for name, model in ensemble.items():   
        pred = model.predict_proba(X_test_scaled)
        # multiply the prediction by the "amount of say" of this model
        pred *= training_roc_auc_iteration_3[name] / sum_of_roc_aucs
#         pred /= len(ensemble)
        preds += pred
    
    # Determine the class with the maximum overall weighted votes
    majority_vote = (np.argmax(preds, axis=1)).astype(int)
        
    str_report = classification_report(y_test, majority_vote, output_dict=False)
    print(str_report)
    
    model_roc_auc = roc_auc_score(y_dev, preds, multi_class='ovo')
    print(f"roc auc = {model_roc_auc}\n")
    
    if (model_roc_auc > best_roc_auc):
        best_roc_auc = model_roc_auc
        best_model = model
        best_model_name = name
        
print(f"\nBest ensemble when fit to the original data (unbalanced) with optimized hyperparameters:\n\t{best_ensemble_name}")
print(f"roc auc: {best_roc_auc}")
