In [None]:
# Import required libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
from sklearn.metrics import confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
pd.set_option("future.no_silent_downcasting", True)

### Support Vector Machines

In [None]:
# Hyper-parameter search for the support vector classifier.
# Candidate grid: 3 values of C x 2 kernels x 2 gamma settings = 12 combinations.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)
svm_classifier = SVC()

# Exhaustive search over the grid with 8-fold cross-validation (n_jobs=-1
# lets the fits run in parallel; verbose=2 logs progress).
grid_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid,
    cv=8,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep both the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_svm_classifier = grid_search.best_estimator_
print(best_params)

Fitting 8 folds for each of 12 candidates, totalling 96 fits
{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}


In [None]:
# Evaluate the tuned SVM on both splits. Precision, recall and F1 are
# macro-averaged (unweighted mean over classes).
# NOTE: acc_*/prec_*/recall_*/f1_* are also assigned by the Gradient Boosting
# section, so the report cell prints whichever model was evaluated last.

# --- training split ---
pred_train_svm = best_svm_classifier.predict(X_train)
acc_train, prec_train, recall_train, f1_train = (
    accuracy_score(y_train, pred_train_svm),
    precision_score(y_train, pred_train_svm, average='macro'),
    recall_score(y_train, pred_train_svm, average='macro'),
    f1_score(y_train, pred_train_svm, average='macro'),
)

# --- held-out test split ---
pred_test_svm = best_svm_classifier.predict(X_test)
acc_test, prec_test, recall_test, f1_test = (
    accuracy_score(y_test, pred_test_svm),
    precision_score(y_test, pred_test_svm, average='macro'),
    recall_score(y_test, pred_test_svm, average='macro'),
    f1_score(y_test, pred_test_svm, average='macro'),
)

In [None]:
# Report the SVM scores computed in the previous cell, four decimals each.
# One print call per phase; sep="\n" puts every entry on its own line.
print(
    "*"*10 + "Training Phase" + "*"*10,
    "Train Acc: %.4f" % acc_train,
    "Train Precision: %.4f" % prec_train,
    "Train recall: %.4f" % recall_train,
    "Train f1-score: %.4f" % f1_train,
    sep="\n",
)

print(
    "*"*10 + "Test Phase" + "*"*10,
    "Test Acc: %.4f" % acc_test,
    "Test Precision: %.4f" % prec_test,
    "Test recall: %.4f" % recall_test,
    "Test f1-score: %.4f" % f1_test,
    sep="\n",
)

**********Training Phase**********
Train Acc: 0.7820
Train Precision: 0.7730
Train recall: 0.7155
Train f1-score: 0.7370
**********Test Phase**********
Test Acc: 0.7558
Test Precision: 0.7605
Test recall: 0.6632
Test f1-score: 0.6758


### Gradient Boosting

In [None]:
# Hyper-parameter search for the gradient-boosting classifier.
# random_state is pinned so that the boosted trees -- and therefore the
# selected parameters and every score reported below -- are reproducible
# after a kernel restart. Without it, repeated runs can pick different splits.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Candidate grid: 3 ensemble sizes x 3 learning rates x 3 tree depths
# = 27 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Exhaustive search over the grid with 8-fold cross-validation (n_jobs=-1
# lets the fits run in parallel; verbose=2 logs progress).
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=8, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(best_params)

# Get the best model (the estimator fitted with best_params)
best_gb_classifier = grid_search.best_estimator_

Fitting 8 folds for each of 27 candidates, totalling 216 fits
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [None]:
# Evaluate the tuned gradient-boosting model on both splits. Precision,
# recall and F1 are macro-averaged (unweighted mean over classes).
# NOTE: these assignments overwrite the acc_*/prec_*/recall_*/f1_* values
# produced by the SVM section, which uses the same names.

# --- training split ---
pred_train_gb = best_gb_classifier.predict(X_train)
acc_train, prec_train, recall_train, f1_train = (
    accuracy_score(y_train, pred_train_gb),
    precision_score(y_train, pred_train_gb, average='macro'),
    recall_score(y_train, pred_train_gb, average='macro'),
    f1_score(y_train, pred_train_gb, average='macro'),
)

# --- held-out test split ---
pred_test_gb = best_gb_classifier.predict(X_test)
acc_test, prec_test, recall_test, f1_test = (
    accuracy_score(y_test, pred_test_gb),
    precision_score(y_test, pred_test_gb, average='macro'),
    recall_score(y_test, pred_test_gb, average='macro'),
    f1_score(y_test, pred_test_gb, average='macro'),
)

In [None]:
# Report the gradient-boosting scores computed in the previous cell,
# four decimals each. sep="\n" puts every entry on its own line.
print(
    "*"*10 + "Training Phase" + "*"*10,
    "Train Acc: %.4f" % acc_train,
    "Train Precision: %.4f" % prec_train,
    "Train recall: %.4f" % recall_train,
    "Train f1-score: %.4f" % f1_train,
    sep="\n",
)

print(
    "*"*10 + "Test Phase" + "*"*10,
    "Test Acc: %.4f" % acc_test,
    "Test Precision: %.4f" % prec_test,
    "Test recall: %.4f" % recall_test,
    "Test f1-score: %.4f" % f1_test,
    sep="\n",
)

**********Training Phase**********
Train Acc: 0.9041
Train Precision: 0.9116
Train recall: 0.8522
Train f1-score: 0.8758
**********Test Phase**********
Test Acc: 0.6977
Test Precision: 0.6667
Test recall: 0.5923
Test f1-score: 0.6012


### K-Nearest Neighbors

In [None]:
# Hyper-parameter search for the k-nearest-neighbours model.
# NOTE(review): this section fits a KNeighborsRegressor and is scored with
# MSE / R^2, whereas the SVM and Gradient Boosting sections fit classifiers
# on the same y_train and report accuracy / precision / recall / F1. If
# y_train holds class labels, a KNeighborsClassifier is probably what was
# intended -- confirm before comparing the models.
param_grid = dict(
    n_neighbors=[3, 5, 7, 10, 15],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski power: 1 -> Manhattan distance, 2 -> Euclidean distance
)
knn_regressor = KNeighborsRegressor()

# Exhaustive search over the 5 x 2 x 2 = 20 combinations with 8-fold
# cross-validation (n_jobs=-1 lets the fits run in parallel).
grid_search = GridSearchCV(
    estimator=knn_regressor,
    param_grid=param_grid,
    cv=8,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep both the winning parameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_knn_regressor = grid_search.best_estimator_
print(best_params)

Fitting 8 folds for each of 20 candidates, totalling 160 fits
{'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}


In [None]:
# Predict on both splits with the tuned KNN regressor, then score each
# split with mean squared error and the coefficient of determination.
pred_train_knn = best_knn_regressor.predict(X_train)
pred_test_knn = best_knn_regressor.predict(X_test)

# --- training split ---
mse_train, r2_train = (
    mean_squared_error(y_train, pred_train_knn),
    r2_score(y_train, pred_train_knn),
)

# --- held-out test split ---
mse_test, r2_test = (
    mean_squared_error(y_test, pred_test_knn),
    r2_score(y_test, pred_test_knn),
)

In [None]:
# Report the KNN regression scores computed in the previous cell,
# four decimals each. sep="\n" puts every entry on its own line.
print(
    "*"*10 + "Training Phase" + "*"*10,
    "Train MSE: %.4f" % mse_train,
    "Train R^2: %.4f" % r2_train,
    sep="\n",
)

print(
    "*"*10 + "Test Phase" + "*"*10,
    "Test MSE: %.4f" % mse_test,
    "Test R^2: %.4f" % r2_test,
    sep="\n",
)

**********Training Phase**********
Train MSE: 0.1636
Train R^2: 0.6300
**********Test Phase**********
Test MSE: 0.2279
Test R^2: 0.5946
