In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.model_selection import GridSearchCV
import random


import warnings
warnings.filterwarnings('ignore')


data = pd.read_pickle("ass2.pickle")
train = data["train"].to_numpy()
X_train = train[:, :-1]
y_train = train[:, -1]

test = data["test"].to_numpy()
X_test = test[:, :-1]
y_test = test[:, -1]

dev = data["dev"].to_numpy()
X_dev = dev[:, :-1]
y_dev = dev[:, -1]



Preliminary data analysis:

In [94]:
print(f"number of attributes: {X_train.shape[1] - 1}\n")
print(f"number of samples:\n\ttrain: {X_train.shape[0]}, test: {X_test.shape[0]}, dev: {X_dev.shape[0]}\n")

train_num_of_nulls = np.sum(np.isnan(train))
print(f"number of nulls in train: {train_num_of_nulls}")

test_num_of_nulls = np.sum(np.isnan(test))
print(f"number of nulls in test: {test_num_of_nulls}")

dev_num_of_nulls = np.sum(np.isnan(dev))
print(f"number of nulls in dev: {dev_num_of_nulls}")

# Scale the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_dev_scaled = scaler.fit_transform(X_dev)

# Merge the train and dev datasets
X_train_and_dev_scaled = np.vstack((X_train_scaled, X_dev_scaled))
y_train_and_dev = np.hstack((y_train, y_dev))

percentage_of_each_category = {category: count/len(y_train_and_dev) for category, count in Counter(y_train_and_dev).items()}
print("\npercentage of each category in the training data (check for balance):\n")
print(percentage_of_each_category)


number of attributes: 41

number of samples:
	train: 40533, test: 13512, dev: 13512

number of nulls in train: 0
number of nulls in test: 0
number of nulls in dev: 0

percentage of each category in the training data (check for balance):

{2: 0.6587658432787492, 1: 0.24525858081228605, 0: 0.09597557590896476}


Running the models on the original training data with default hyperparameters:

In [96]:
models = {
            'KNN': KNeighborsClassifier,
            'Decision Tree': DecisionTreeClassifier,
            'Random Forest': RandomForestClassifier,
            'Logistic Regression': LogisticRegression, 
            'Adaptive Boosting': AdaBoostClassifier,
            'Gradient Boosting': GradientBoostingClassifier,
#             'SVC': SVC, 
            'Gaussian Naiive Bayes': GaussianNB,            
        }

best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data with default hyperparameters:\n\t{best_model_name}")


KNN:
              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.63      0.52      0.57      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.56     13512
weighted avg       0.71      0.73      0.72     13512

Decision Tree:
              precision    recall  f1-score   support

           0       0.22      0.25      0.23      1262
           1       0.61      0.60      0.60      3380
           2       0.83      0.82      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.55      0.55      0.55     13512
weighted avg       0.72      0.71      0.71     13512

Random Forest:
              precision    recall  f1-score   support

           0       0.50      0.06      0.10      1262
           1       0.78      0.65      0.71      3380
           2       0.81      0.96      0

Running the models on the original training data with optimized hyperparameters:

In [97]:
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid={}, cv=5)

    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_scaled, y_train_and_dev)

    print("\nafter grid_search\n")
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_scaled, y_train_and_dev)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to the original data with optimized hyperparameters:\n\t{best_model_name}")


KNN:

after grid_search

              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.63      0.52      0.57      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.56     13512
weighted avg       0.71      0.73      0.72     13512

Decision Tree:

after grid_search

              precision    recall  f1-score   support

           0       0.22      0.24      0.23      1262
           1       0.61      0.60      0.61      3380
           2       0.82      0.82      0.82      8870

    accuracy                           0.71     13512
   macro avg       0.55      0.55      0.55     13512
weighted avg       0.71      0.71      0.71     13512

Random Forest:

after grid_search

              precision    recall  f1-score   support

           0       0.49      0.07      0.12      1262
           1       0.79      0.64 

Creating a balanced dataset

In [98]:
# Find the category with the smallest number of samples
smallest_category = min(percentage_of_each_category, key=percentage_of_each_category.get)
# Find the number of samples in that category
num_of_samples_of_smallest_category = int(percentage_of_each_category[smallest_category] * len(y_train_and_dev))

# Sample indices for every category
balanced_indices = np.empty(0)
for category in percentage_of_each_category.keys():
    category_indices = np.where(y_train_and_dev == category)[0]
    selected_indices = np.array(random.sample(list(category_indices), num_of_samples_of_smallest_category))
    balanced_indices = np.concatenate((balanced_indices, selected_indices))
balanced_indices = balanced_indices.astype(int)

print(balanced_indices)
    
# Create the balanced dataset
X_train_and_dev_balanced = X_train_and_dev_scaled[balanced_indices]
y_train_and_dev_balanced = y_train_and_dev[balanced_indices]

balanced_percentage_of_each_category = {category: count/len(y_train_and_dev_balanced) for category, count in Counter(y_train_and_dev_balanced).items()}
print("\npercentage of each category in the balanced training data (check for balance):\n")
print(balanced_percentage_of_each_category)

[26812 26583 12886 ...  8150 43818 43533]

percentage of each category in the balanced training data (check for balance):

{2: 0.3333333333333333, 1: 0.3333333333333333, 0: 0.3333333333333333}


Running the models on a balanced subset of the training data with optimized hyperparameters:

In [99]:
best_model = None
best_model_name = ""
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    print(f"{name}:")
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid={}, cv=5)

    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_and_dev_balanced, y_train_and_dev_balanced)

    print("\nafter grid_search\n")
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_and_dev_balanced, y_train_and_dev_balanced)
    y_pred = clf.predict(X_test_scaled)
    
    dict_report = classification_report(y_test, y_pred, output_dict=True)
    str_report = classification_report(y_test, y_pred, output_dict=False)
    print(str_report)
    
    if (dict_report["accuracy"] > best_accuracy):
        best_accuracy = dict_report["accuracy"]
        best_model = model
        best_model_name = name
        
print(f"\nBest model when fit to a balanced subset of the original data with optimized hyperparameters:\n\t{best_model_name}")


KNN:

after grid_search

              precision    recall  f1-score   support

           0       0.16      0.65      0.26      1262
           1       0.44      0.51      0.47      3380
           2       0.88      0.44      0.59      8870

    accuracy                           0.48     13512
   macro avg       0.49      0.53      0.44     13512
weighted avg       0.70      0.48      0.53     13512

Decision Tree:

after grid_search

              precision    recall  f1-score   support

           0       0.16      0.47      0.24      1262
           1       0.49      0.57      0.53      3380
           2       0.86      0.58      0.69      8870

    accuracy                           0.56     13512
   macro avg       0.51      0.54      0.49     13512
weighted avg       0.70      0.56      0.61     13512

Random Forest:

after grid_search

              precision    recall  f1-score   support

           0       0.23      0.59      0.33      1262
           1       0.64      0.68 