In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including any convergence or failed-fit warnings raised while the
# hyper-parameter searches below run. Consider narrowing the filter to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [28]:
# Load the feature matrices (presumably PCA-transformed, going by the file
# names) for the train and test splits.
X_train = pd.read_csv('X_train_PCA.csv')
X_test = pd.read_csv('X_test_PCA.csv')

# Load the targets as 1-D Series rather than one-column DataFrames: several
# scikit-learn estimators emit a DataConversionWarning when given a column
# vector as y. squeeze("columns") collapses a single-column frame to a Series
# and leaves a wider frame untouched.
y_train = pd.read_csv('y_train.csv').squeeze("columns")
y_test = pd.read_csv('y_test.csv').squeeze("columns")

In [29]:
# Search space for LogisticRegression, split into sub-grids so that every
# sampled candidate is a penalty/solver combination scikit-learn accepts:
#   * lbfgs     -> l2 or no penalty
#   * liblinear -> l1 or l2
#   * saga      -> l1, l2, elasticnet or no penalty
# A single flat grid would also generate invalid pairs (e.g. l1 + lbfgs),
# whose fits fail and waste part of the search budget.
_logreg_C = [0.01, 0.1, 1, 10, 100]   # Inverse regularization strength
_logreg_max_iter = [100, 200, 500]

param_grid_logreg = [
    {
        "penalty": ["l2"],
        "solver": ["lbfgs", "liblinear", "saga"],
        "C": _logreg_C,
        "max_iter": _logreg_max_iter,
    },
    {
        "penalty": ["l1"],
        "solver": ["liblinear", "saga"],
        "C": _logreg_C,
        "max_iter": _logreg_max_iter,
    },
    {
        # elasticnet is only implemented by saga and needs an explicit l1_ratio.
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": [0.25, 0.5, 0.75],
        "C": _logreg_C,
        "max_iter": _logreg_max_iter,
    },
    {
        # No regularization: C has no effect, so it is left out of this grid.
        "penalty": [None],
        "solver": ["lbfgs", "saga"],
        "max_iter": _logreg_max_iter,
    },
]

In [30]:
# Randomized 5-fold search over the logistic-regression space. Both the search
# and the estimator are seeded so that the sampled candidates (and the
# liblinear/saga solvers, which use randomness) give the same results on
# every Restart & Run All.
rand_LR = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_distributions=param_grid_logreg,
    cv=5,
    verbose=3,
    random_state=42,
)
rand_LR.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END C=10, max_iter=100, penalty=l2, solver=saga;, score=0.777 total time=   0.0s
[CV 2/5] END C=10, max_iter=100, penalty=l2, solver=saga;, score=0.721 total time=   0.0s
[CV 3/5] END C=10, max_iter=100, penalty=l2, solver=saga;, score=0.810 total time=   0.0s
[CV 4/5] END C=10, max_iter=100, penalty=l2, solver=saga;, score=0.830 total time=   0.0s
[CV 5/5] END C=10, max_iter=100, penalty=l2, solver=saga;, score=0.844 total time=   0.0s
[CV 1/5] END C=10, max_iter=500, penalty=l2, solver=liblinear;, score=0.777 total time=   0.0s
[CV 2/5] END C=10, max_iter=500, penalty=l2, solver=liblinear;, score=0.721 total time=   0.0s
[CV 3/5] END C=10, max_iter=500, penalty=l2, solver=liblinear;, score=0.810 total time=   0.0s
[CV 4/5] END C=10, max_iter=500, penalty=l2, solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/5] END C=10, max_iter=500, penalty=l2, solver=liblinear;, score=0.844 total time=   0.0s
[CV 1/5] END C

In [31]:
# Report the winning logistic-regression configuration and its CV score.
print(f"Best parameters found: {rand_LR.best_params_}")
print(f"Best cross-validation score: {rand_LR.best_score_}")

Best parameters found: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 100, 'C': 10}
Best cross-validation score: 0.7962217319360178


In [32]:
# Search space for SVC, one sub-grid per kernel so that `gamma` is only
# explored together with the RBF kernel. Both sub-grids try a capped
# iteration budget (1000) and -1.
param_grid_SVM = [
    # Linear kernel: vary the regularization strength only.
    dict(
        kernel=['linear'],
        C=[0.01, 0.1, 1],
        max_iter=[1000, -1],
    ),
    # RBF kernel: vary regularization strength and kernel width.
    dict(
        kernel=['rbf'],
        C=[0.1, 1],
        gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
        max_iter=[1000, -1],
    ),
]

In [33]:
# Exhaustive 5-fold grid search over every SVC configuration in param_grid_SVM;
# verbose=3 logs the score of each individual fold.
gridSVM = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_SVM,
    cv=5,
    verbose=3,
)

In [34]:
# Run the SVC grid search on the training split.
gridSVM.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END C=0.01, kernel=linear, max_iter=1000;, score=0.770 total time=   0.0s
[CV 2/5] END C=0.01, kernel=linear, max_iter=1000;, score=0.735 total time=   0.0s
[CV 3/5] END C=0.01, kernel=linear, max_iter=1000;, score=0.782 total time=   0.0s
[CV 4/5] END C=0.01, kernel=linear, max_iter=1000;, score=0.816 total time=   0.0s
[CV 5/5] END C=0.01, kernel=linear, max_iter=1000;, score=0.803 total time=   0.0s
[CV 1/5] END C=0.01, kernel=linear, max_iter=-1;, score=0.770 total time=   0.0s
[CV 2/5] END C=0.01, kernel=linear, max_iter=-1;, score=0.735 total time=   0.0s
[CV 3/5] END C=0.01, kernel=linear, max_iter=-1;, score=0.782 total time=   0.0s
[CV 4/5] END C=0.01, kernel=linear, max_iter=-1;, score=0.816 total time=   0.0s
[CV 5/5] END C=0.01, kernel=linear, max_iter=-1;, score=0.803 total time=   0.0s
[CV 1/5] END C=0.1, kernel=linear, max_iter=1000;, score=0.784 total time=   0.0s
[CV 2/5] END C=0.1, kernel=linear, m

In [35]:
# Report the winning SVC configuration and its CV score.
print(f"Best parameters found: {gridSVM.best_params_}")
print(f"Best cross-validation score: {gridSVM.best_score_}")


Best parameters found: {'C': 0.1, 'kernel': 'linear', 'max_iter': 1000}
Best cross-validation score: 0.8043758043758042


In [36]:
# Search space for DecisionTreeClassifier: split quality measure, split
# strategy, and the size/shape limits on the tree.
param_grid_tree = dict(
    criterion=["gini", "entropy", "log_loss"],
    splitter=["best", "random"],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 10],
    max_features=[None, "sqrt", "log2"],
)


In [37]:
# Randomized 5-fold search over the decision-tree space. The tree itself is
# seeded because the grid includes splitter="random" and max_features subsets,
# and the search is seeded so the same candidates are sampled on every run.
model = DecisionTreeClassifier(random_state=42)
rand_tree = RandomizedSearchCV(
    model,
    param_distributions=param_grid_tree,
    cv=5,
    verbose=3,
    random_state=42,
)
rand_tree.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.709 total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.680 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.694 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.741 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, splitter=best;, score=0.803 total time=   0.0s
[CV 1/5] END criterion=log_loss, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, splitter=best;, score=0.642 total time=   0.0s
[CV 2/5] END criterion=log_loss, max

In [38]:
# Report the winning decision-tree configuration and its CV score.
print(f"Best parameters found: {rand_tree.best_params_}")
print(f"Best cross-validation score: {rand_tree.best_score_}")

Best parameters found: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5, 'criterion': 'log_loss'}
Best cross-validation score: 0.7487129987129988


In [39]:
# Search space for RandomForestClassifier: ensemble size, split quality
# measure, per-tree size limits, feature subsampling and bootstrapping.
param_grid_rf = dict(
    n_estimators=[100, 200, 500],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
    bootstrap=[True, False],
)


In [40]:
# Randomized 5-fold search over the random-forest space. The forest is seeded
# (bootstrapping and feature subsampling are random) and so is the search, so
# the sampled candidates and their scores are reproducible across runs.
model = RandomForestClassifier(random_state=42)
rand_rf = RandomizedSearchCV(
    model,
    param_distributions=param_grid_rf,
    cv=5,
    verbose=3,
    random_state=42,
)
rand_rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500;, score=0.676 total time=   4.5s
[CV 2/5] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500;, score=0.660 total time=   4.2s
[CV 3/5] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500;, score=0.728 total time=   4.5s
[CV 4/5] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500;, score=0.653 total time=   4.5s
[CV 5/5] END bootstrap=False, criterion=log_loss, max_depth=30, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500;, score=0.748 total time=   4.1s
[CV 1/5] END bootstrap=True, criterion=entropy, max_depth=None, max_f

In [41]:
# Report the winning random-forest configuration and its CV score.
print(f"Best parameters found: {rand_rf.best_params_}")
print(f"Best cross-validation score: {rand_rf.best_score_}")

Best parameters found: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 30, 'criterion': 'gini', 'bootstrap': True}
Best cross-validation score: 0.7948519948519948


In [42]:
# Collect the four fitted searches under display names, then pick the one
# with the highest best cross-validation score (first entry wins ties).
models = dict([
    ("Logistic Regression", rand_LR),
    ("Decision Tree", rand_tree),
    ("Random Forest", rand_rf),
    ("SVM", gridSVM),
])
best_model_name, best_model = max(
    models.items(),
    key=lambda entry: entry[1].best_score_,
)

# One line each for the winner's name, score and hyper-parameters.
print(
    f"Best model: {best_model_name}",
    f"Best CV score: {best_model.best_score_}",
    f"Best parameters: {best_model.best_params_}",
    sep="\n",
)

Best model: SVM
Best CV score: 0.8043758043758042
Best parameters: {'C': 0.1, 'kernel': 'linear', 'max_iter': 1000}
