In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
import random
import itertools


import warnings
warnings.filterwarnings('ignore')


data = pd.read_pickle("ass2.pickle")
train = data["train"].to_numpy()

X_train = train[:, :-1]
y_train = train[:, -1]

test = data["test"].to_numpy()
X_test = test[:, :-1]
y_test = test[:, -1]

dev = data["dev"].to_numpy()
X_dev = dev[:, :-1]
y_dev = dev[:, -1]



Preliminary data analysis:

In [63]:
print(f"number of attributes: {X_train.shape[1] - 1}\n")
print(f"number of samples:\n\ttrain: {X_train.shape[0]}, test: {X_test.shape[0]}, dev: {X_dev.shape[0]}\n")

train_num_of_nulls = np.sum(np.isnan(train))
print(f"number of nulls in train: {train_num_of_nulls}")

test_num_of_nulls = np.sum(np.isnan(test))
print(f"number of nulls in test: {test_num_of_nulls}")

dev_num_of_nulls = np.sum(np.isnan(dev))
print(f"number of nulls in dev: {dev_num_of_nulls}")

# Scale the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_dev_scaled = scaler.fit_transform(X_dev)

# Merge the train and dev datasets
X_train_and_dev_scaled = np.vstack((X_train_scaled, X_dev_scaled))
y_train_and_dev = np.hstack((y_train, y_dev))

percentage_of_each_category = {category: count/len(y_train) for category, count in Counter(y_train).items()}
print("\npercentage of each category in the training data (check for balance):\n")
print(percentage_of_each_category)

dev_percentage_of_each_category = {category: count/len(y_dev) for category, count in Counter(y_dev).items()}
print("\npercentage of each category in the dev data (check for balance):\n")
print(dev_percentage_of_each_category)

test_percentage_of_each_category = {category: count/len(y_test) for category, count in Counter(y_test).items()}
print("\npercentage of each category in the dev data (check for balance):\n")
print(test_percentage_of_each_category)


number of attributes: 41

number of samples:
	train: 40533, test: 13512, dev: 13512

number of nulls in train: 0
number of nulls in test: 0
number of nulls in dev: 0

percentage of each category in the training data (check for balance):

{2: 0.6595613450768509, 1: 0.2438013470505514, 0: 0.09663730787259764}

percentage of each category in the dev data (check for balance):

{2: 0.6563795145056246, 1: 0.2496299585553582, 0: 0.09399052693901717}

percentage of each category in the dev data (check for balance):

{2: 0.6564535227945529, 0: 0.09339846062759029, 1: 0.25014801657785674}


-------------------------------------- flow --------------------------------------

1. Running the models on the original training data (unbalanced) with default hyperparameters
2. Running the models on the original training data (unbalanced) with optimized hyperparameters
3. Running the models on the balanced training data with optimized hyperparameters

Running the models on the original training data (unbalanced) with default hyperparameters:

In [62]:
models = {
            'KNN': KNeighborsClassifier,
            'Decision Tree': DecisionTreeClassifier,
            'Random Forest': RandomForestClassifier,
            'Logistic Regression': LogisticRegression, 
            'Adaptive Boosting': AdaBoostClassifier,
            'Gradient Boosting': GradientBoostingClassifier,
            'Gaussian Naiive Bayes': GaussianNB,            
        }

hyperparameters = {
            'KNN': {
                    'n_neighbors': [3, 5, 7],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['ball_tree', 'kd_tree']
                    },
            'Decision Tree': {
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10],
                                'criterion': ['gini', 'entropy']
                            },
            'Random Forest': {
                                'n_estimators': [100, 200, 300],
                                'max_depth': [None, 5, 10],
                                'min_samples_split': [2, 5, 10]
                            },
            'Logistic Regression': {
                                'C': [0.1, 1.0, 10.0],
                                'penalty': ['l1', 'l2'],
                                'solver': ['liblinear', 'saga']
                                    }, 
            'Adaptive Boosting': {
                                'n_estimators': [50, 100, 150],
                                'learning_rate': [0.1, 0.01, 0.001],
                                'base_estimator__max_depth': [1, 3, 5]
                                },
            'Gradient Boosting': {
                                'n_estimators': [50, 100, 150],
                                'learning_rate': [0.1, 0.01, 0.001],
                                'max_depth': [3, 5, 7]
                                },
            'Gaussian Naiive Bayes': {'var_smoothing': [1e-9, 1e-8, 1e-7]},            
        }

models_iteration_1 = {}
training_accuracies_iteration_1 = {}
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
        
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    models_iteration_1[name] = clf
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data (unbalanced) with default hyperparameters:\n\t{best_model_name}")


KNN:
              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.63      0.52      0.57      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.56     13512
weighted avg       0.71      0.73      0.72     13512

Decision Tree:
              precision    recall  f1-score   support

           0       0.23      0.25      0.24      1262
           1       0.61      0.60      0.61      3380
           2       0.83      0.82      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.56      0.56      0.56     13512
weighted avg       0.72      0.71      0.72     13512

Random Forest:
              precision    recall  f1-score   support

           0       0.53      0.06      0.12      1262
           1       0.78      0.65      0.71      3380
           2       0.81      0.96      0

Running the models on the original training data (unbalanced) with optimized hyperparameters:

* When using 'r2' as the scoring metric in GridSearchCV, it means that the grid search will evaluate different parameter combinations based on how well they maximize the R-squared value. The goal is to find the parameter combination that yields the highest R-squared score, indicating the best fit of the model to the data.*

In [42]:
models_iteration_2 = {}
accuracies_iteration_2 = {}
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='r2', n_jobs=-1)
    # grid_search = RandomizedSearchCV(model, model_hyperparameters, n_iter=10, cv=5, scoring='r2', n_jobs=-1)


    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)

    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print("best combination chosen: ", best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    models_iteration_2[name] = clf
    accuracies_iteration_2[name] = dict_report["accuracy"]
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data (unbalanced) with optimized hyperparameters:\n\t{best_model_name}")


KNN:
{'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree']}


KeyboardInterrupt: 

Creating a balanced dataset

In [None]:
# Create the balanced dataset
oversampler = RandomOverSampler()
X_train_and_dev_balanced, y_train_and_dev_balanced = oversampler.fit_resample(X_train_and_dev_scaled, y_train_and_dev)

balanced_percentage_of_each_category = {category: count/len(y_train_and_dev_balanced) for category, count in Counter(y_train_and_dev_balanced).items()}
print("\npercentage of each category in the balanced training data (check for balance):\n")
print(balanced_percentage_of_each_category)

Running the models on a balanced subset of the training data with optimized hyperparameters:

In [None]:
models_iteration_3 = {}
accuracies_iteration_3 = {}
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='r2', n_jobs=-1)
    # grid_search = RandomizedSearchCV(model, model_hyperparameters, n_iter=10, cv=5, scoring='r2', n_jobs=-1)


    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print(best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_balanced, y_train_and_dev_balanced)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    models_iteration_3[name] = clf
    accuracies_iteration_3[name] = dict_report["accuracy"]
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to a balanced subset of the original data with optimized hyperparameters:\n\t{best_model_name}")


Fit the models on the training set with default hyperparameters and test accuracy on the dev set
We gave weights to each class based on its percentage of the training set
We saved the accuracy of each model on the dev set. We later use this accuracy to set the "amount of say" of each model in its ensemble

In [65]:
models_iteration_4 = {}
training_accuracies_iteration_4 = {}
best_model = None
best_model_name = ""
best_accuracy = 0

class_weights = {
    0: 1 / percentage_of_each_category[0], 
    1: 1 / percentage_of_each_category[1], 
    2: 1 / percentage_of_each_category[2] 
} # inversely proportional to the category's percentage in the dataset

sample_weights = np.array([class_weights[label] for label in y_train])

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    model_hyperparameters = hyperparameters.get(name)
    print(model_hyperparameters)
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid=model_hyperparameters, cv=5, scoring='r2', n_jobs=-1)
    # grid_search = RandomizedSearchCV(model, model_hyperparameters, n_iter=10, cv=5, scoring='r2', n_jobs=-1)


    # Fit the GridSearchCV object to your data
    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        grid_search.fit(X_train_scaled, y_train)
    else:
        grid_search.fit(X_train_scaled, y_train, sample_weight=sample_weights)
        
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    print(best_params)
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)
    
    if (name == 'KNN' or name == 'Gaussian Naiive Bayes'): # only these models don't take the argument sample_weight
        clf = model.fit(X_train_scaled, y_train)
    else:
        clf = model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
    y_pred = clf.predict(X_dev_scaled)
        
    dict_report = classification_report(y_dev, y_pred, output_dict=True)
    str_report = classification_report(y_dev, y_pred, output_dict=False)
    print(str_report)
    
    models_iteration_4[name] = clf
    training_accuracies_iteration_4[name] = dict_report["accuracy"]
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data (unbalanced) with default hyperparameters:\n\t{best_model_name}")
print(f"accuracy: {best_accuracy}")


KNN:
{'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree']}


ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\etzio\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
TypeError: KNeighborsClassifier.fit() got an unexpected keyword argument 'sample_weight'


Creating all possible combinations of models:

In [58]:
ensembles = []

for r in range(2, len(models_iteration_4) + 1):
    subsets = itertools.combinations(models_iteration_4.items(), r)
    # subsets is a list of tuples (name, model)
    for subset in subsets:
        ensemble = {}
        for name, model in subset:
            ensemble[name] = model
        ensembles.append(ensemble)
                
print(f"num of subsets = {len(ensembles)}")


num of subsets = 120


Testing all possible combinations of models as ensembles:

In [64]:
best_ensamble = None
best_ensamble_name = ""
best_accuracy = 0

for ensemble in ensembles:
    ensemble_name = ", ".join([name for name in ensemble.keys()]) # join all the model names    
    print(f"{ensemble_name}:")
    
    start = True
    
    sum_of_accs = np.sum(np.array([training_accuracies_iteration_4[name] for name in ensemble.keys()]))
    preds = np.zeros(len(y_test))
    for name, model in ensemble.items():
        
        pred = model.predict(X_test_scaled)
        # multiply the prediction by the "amount of say" of this model
        pred *= training_accuracies_iteration_4[name] / sum_of_accs
        preds += pred
        start = False
    preds = (np.rint(preds)).astype(int)
    
    dict_report = classification_report(y_test, preds, output_dict=True)
    str_report = classification_report(y_test, preds, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_ensemble = models
        best_ensemble_name = ensemble_name
        
print(f"\nBest ensemble when fit to the original data (unbalanced) with optimized hyperparameters:\n\t{best_ensemble_name}")
print(f"accuracy: {best_accuracy}")


KNN, Decision Tree:
              precision    recall  f1-score   support

           0       0.34      0.18      0.23      1262
           1       0.52      0.71      0.60      3380
           2       0.86      0.80      0.83      8870

    accuracy                           0.72     13512
   macro avg       0.57      0.56      0.55     13512
weighted avg       0.73      0.72      0.72     13512

KNN, Random Forest:
              precision    recall  f1-score   support

           0       0.49      0.08      0.14      1262
           1       0.68      0.69      0.69      3380
           2       0.83      0.93      0.88      8870

    accuracy                           0.79     13512
   macro avg       0.67      0.57      0.57     13512
weighted avg       0.76      0.79      0.76     13512

KNN, Logistic Regression:
              precision    recall  f1-score   support

           0       0.31      0.19      0.23      1262
           1       0.38      0.65      0.48      3380
         

              precision    recall  f1-score   support

           0       0.46      0.08      0.14      1262
           1       0.57      0.73      0.64      3380
           2       0.85      0.86      0.86      8870

    accuracy                           0.76     13512
   macro avg       0.63      0.56      0.55     13512
weighted avg       0.75      0.76      0.74     13512

KNN, Decision Tree, Logistic Regression:
              precision    recall  f1-score   support

           0       0.36      0.18      0.24      1262
           1       0.50      0.68      0.58      3380
           2       0.86      0.80      0.83      8870

    accuracy                           0.71     13512
   macro avg       0.57      0.55      0.55     13512
weighted avg       0.72      0.71      0.71     13512

KNN, Decision Tree, Adaptive Boosting:
              precision    recall  f1-score   support

           0       0.36      0.18      0.24      1262
           1       0.39      0.78      0.52      

              precision    recall  f1-score   support

           0       0.29      0.33      0.31      1262
           1       0.43      0.73      0.54      3380
           2       0.90      0.65      0.76      8870

    accuracy                           0.64     13512
   macro avg       0.54      0.57      0.53     13512
weighted avg       0.73      0.64      0.66     13512

Decision Tree, Logistic Regression, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.25      0.11      0.15      1262
           1       0.43      0.59      0.50      3380
           2       0.80      0.75      0.78      8870

    accuracy                           0.65     13512
   macro avg       0.50      0.48      0.48     13512
weighted avg       0.66      0.65      0.65     13512

Decision Tree, Adaptive Boosting, Gradient Boosting:
              precision    recall  f1-score   support

           0       0.28      0.34      0.31      1262
           1     

              precision    recall  f1-score   support

           0       0.37      0.17      0.23      1262
           1       0.41      0.79      0.54      3380
           2       0.89      0.64      0.75      8870

    accuracy                           0.63     13512
   macro avg       0.56      0.53      0.51     13512
weighted avg       0.72      0.63      0.65     13512

KNN, Decision Tree, Logistic Regression, Gradient Boosting:
              precision    recall  f1-score   support

           0       0.38      0.17      0.24      1262
           1       0.42      0.81      0.56      3380
           2       0.91      0.66      0.76      8870

    accuracy                           0.65     13512
   macro avg       0.57      0.55      0.52     13512
weighted avg       0.74      0.65      0.66     13512

KNN, Decision Tree, Logistic Regression, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.42      0.08      0.14      1262
     

              precision    recall  f1-score   support

           0       0.44      0.13      0.20      1262
           1       0.45      0.81      0.58      3380
           2       0.90      0.71      0.79      8870

    accuracy                           0.68     13512
   macro avg       0.59      0.55      0.52     13512
weighted avg       0.74      0.68      0.68     13512

Decision Tree, Random Forest, Adaptive Boosting, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.53      0.09      0.15      1262
           1       0.52      0.77      0.62      3380
           2       0.86      0.80      0.83      8870

    accuracy                           0.73     13512
   macro avg       0.64      0.55      0.53     13512
weighted avg       0.74      0.73      0.71     13512

Decision Tree, Random Forest, Gradient Boosting, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.54      0.09      

              precision    recall  f1-score   support

           0       0.30      0.26      0.28      1262
           1       0.40      0.77      0.53      3380
           2       0.90      0.61      0.73      8870

    accuracy                           0.62     13512
   macro avg       0.53      0.54      0.51     13512
weighted avg       0.72      0.62      0.63     13512

KNN, Decision Tree, Logistic Regression, Adaptive Boosting, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.33      0.12      0.17      1262
           1       0.40      0.75      0.52      3380
           2       0.86      0.65      0.74      8870

    accuracy                           0.63     13512
   macro avg       0.53      0.51      0.48     13512
weighted avg       0.70      0.63      0.63     13512

KNN, Decision Tree, Logistic Regression, Gradient Boosting, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0     

              precision    recall  f1-score   support

           0       0.48      0.08      0.14      1262
           1       0.46      0.79      0.58      3380
           2       0.88      0.74      0.80      8870

    accuracy                           0.69     13512
   macro avg       0.61      0.54      0.51     13512
weighted avg       0.74      0.69      0.69     13512

KNN, Decision Tree, Logistic Regression, Adaptive Boosting, Gradient Boosting, Gaussian Naiive Bayes:
              precision    recall  f1-score   support

           0       0.36      0.13      0.19      1262
           1       0.41      0.82      0.54      3380
           2       0.89      0.63      0.74      8870

    accuracy                           0.63     13512
   macro avg       0.55      0.53      0.49     13512
weighted avg       0.72      0.63      0.64     13512

KNN, Random Forest, Logistic Regression, Adaptive Boosting, Gradient Boosting, Gaussian Naiive Bayes:
              precision    recall 