In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from collections import Counter
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')


data = pd.read_pickle("ass2.pickle")
train = data["train"].to_numpy()
X_train = train[:, :-1]
y_train = train[:, -1]

test = data["test"].to_numpy()
X_test = test[:, :-1]
y_test = test[:, -1]

dev = data["dev"].to_numpy()
X_dev = dev[:, :-1]
y_dev = dev[:, -1]



Preliminary data analysis:

In [52]:
print(f"number of attributes: {X_train.shape[1] - 1}\n")
print(f"number of samples:\n\ttrain: {X_train.shape[0]}, test: {X_test.shape[0]}, dev: {X_dev.shape[0]}\n")

train_num_of_nulls = np.sum(np.isnan(train))
print(f"number of nulls in train: {train_num_of_nulls}")

test_num_of_nulls = np.sum(np.isnan(test))
print(f"number of nulls in test: {test_num_of_nulls}")

dev_num_of_nulls = np.sum(np.isnan(dev))
print(f"number of nulls in dev: {dev_num_of_nulls}")

train_percentage_of_each_category = {category: count/len(y_train) for category, count in Counter(y_train).items()}
print("\npercentage of each category in the training data (check for balance):\n")
print(train_percentage_of_each_category)

number of attributes: 41

number of samples:
	train: 40533, test: 13512, dev: 13512

number of nulls in train: 0
number of nulls in test: 0
number of nulls in dev: 0

percentage of each category in the training data (check for balance):

{2: 0.6595613450768509, 1: 0.2438013470505514, 0: 0.09663730787259764}


In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_dev_scaled = scaler.fit_transform(X_dev)

models = {
            'KNN': KNeighborsClassifier,
            'Decision Tree': DecisionTreeClassifier,
            'Random Forest': RandomForestClassifier,
            'Logistic Regression': LogisticRegression, 
            'Adaptive Boosting': AdaBoostClassifier,
            'Gradient Boosting': GradientBoostingClassifier,
            'SVC': SVC, 
            'Gaussian Naiive Bayes': GaussianNB,            
        }

best_model = None
best_accuracy = 0

for name, model_class in models.items():
    model = model_class()
    
    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model, param_grid={}, cv=5)

    # Fit the GridSearchCV object to your data
    grid_search.fit(X_train_scaled, y_train)

    print("\nafter grid_search\n")
    
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    
    # Unpacking the best_params dict into the model constructor
    model = model_class(**best_params)

    clf = model.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print(f"{name}:")
    print(classification_report(y_test, y_pred))

KNN:
              precision    recall  f1-score   support

           0       0.29      0.23      0.26      1262
           1       0.62      0.51      0.56      3380
           2       0.80      0.88      0.84      8870

    accuracy                           0.73     13512
   macro avg       0.57      0.54      0.55     13512
weighted avg       0.71      0.73      0.71     13512

Decision Tree:
              precision    recall  f1-score   support

           0       0.22      0.24      0.23      1262
           1       0.63      0.62      0.62      3380
           2       0.83      0.83      0.83      8870

    accuracy                           0.72     13512
   macro avg       0.56      0.56      0.56     13512
weighted avg       0.72      0.72      0.72     13512

Random Forest:
              precision    recall  f1-score   support

           0       0.48      0.12      0.19      1262
           1       0.78      0.66      0.71      3380
           2       0.82      0.96      0