In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from custom_bagging import CustomBaggingClassifier 
from custom_balanced_bagging import CustomImblearnBalancedBagging
from custom_xgboost import CustomXGBoost
from custom_decision_tree import CustomDecisionTree
from custom_random_forest import CustomRandomForest
from custom_lightgbm import CustomLightGBM
from custom_hist_gradient_boosting import CustomHGBoosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from custom_functions import handle_class_imbalance, split_data
from sklearn.model_selection import GridSearchCV

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [7]:
# Importing the dataset
file_path = 'P:\DATA_OCT_22\Expert_Eye\Dataset\Data\data_v6.csv'
df = pd.read_csv(file_path)
dataset = df.copy()
dataset.head()

Unnamed: 0,Gender,VINCQ32DDN,VINICODEX003,FROPCOM0001,FROPCOM0005,FROPCOM0006[S1],FROPCOM0006[S2],FROPCOM0006[S3],FROPCOM0006[S4],FROPCOM0006[S5],...,romberg_EyesClosed_SwayDensity,romberg_EyesOpen_LateralVariance,romberg_EyesClosed_LateralVariance,romberg_EyesOpen_Score,romberg_EyesClosed_Score,BMI,Weight_Diff,MFESCALE_SCORE,Frailty_Score,Frailty_State
0,0,76.0,0.0,0.0,3.0,,,,,1.0,...,29.88,1.6809989999999999e-34,1.0824340000000001e-33,99.0,99.0,32.029086,-2.0,140.0,7,0
1,0,75.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,27.130802,0.001338352,0.01246047,98.0,88.0,28.90625,0.0,123.0,4,1
2,0,67.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,29.88,0.0006738724,0.007077824,99.0,81.0,39.033376,0.0,135.0,9,0
3,0,72.0,1.0,0.0,1.0,,,,,,...,29.88,0.002395301,0.001533375,99.0,99.0,25.23634,2.0,129.0,7,0
4,1,69.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,29.88,0.002942435,0.00250574,99.0,99.0,22.460034,0.0,140.0,8,0


In [8]:
# Handle class imbalance
df_oversampled, df_undersampled = handle_class_imbalance(dataset)

In [9]:
estimator = HistGradientBoostingClassifier()
models = {
    'DecisionTree': CustomDecisionTree(),
    'XGBoost': CustomXGBoost(),
    'RandomForest': CustomRandomForest(),
    'LightGBM': CustomLightGBM(),
    'HistGradientBoosting' : CustomHGBoosting(), 
    'Bagging': CustomBaggingClassifier(estimator=estimator) ,
    'BalancedBagging': CustomImblearnBalancedBagging(estimator=estimator),
}


In [10]:
best_models = {}
metrics = {}

X_train, X_test, y_train, y_test = split_data(df_oversampled)

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_acc = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_prec = precision_score(y_train, y_train_pred, average='weighted')
    model_train_rec = recall_score(y_train, y_train_pred, average='weighted')

    # Test set performance
    model_test_acc = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_prec = precision_score(y_test, y_test_pred, average='weighted')
    model_test_rec = recall_score(y_test, y_test_pred, average='weighted')

    print(list(models.keys())[i])
    print('confusion matrix: \n', confusion_matrix(y_test, y_test_pred))
    print('Training set accuracy: ', model_train_acc)
    print('Training set F1 score: ', model_train_f1)
    print('Training set precision: ', model_train_prec)
    print('Training set recall: ', model_train_rec)
    print('Training ROC AUC score: ', roc_auc_score(y_train, y_train_pred))

    print('_______________________________')

    print('confusion matrix: \n', confusion_matrix(y_test, y_test_pred))
    print('Test set accuracy: ', model_test_acc)
    print('Test set F1 score: ', model_test_f1)
    print('Test set precision: ', model_test_prec)
    print('Test set recall: ', model_test_rec)
    print('Test ROC AUC score: ', roc_auc_score(y_test, y_test_pred))

    print('='*30)
    print('\n')


ValueError: Classification metrics can't handle a mix of binary and unknown targets

In [8]:
# PLotting ROC curve
auc_models = [
    {
        'label': 'Random Forest Classifier',
        'model': CustomRandomForest(),
        'auc': roc_auc_score(y_test, y_test_pred)
    },
    {
        'label': 'XGBoost Classifier',
        'model': CustomXGBoost(),
        'auc': roc_auc_score(y_test, y_test_pred)
    }
]

for m in auc_models:
    model = m['model']
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])

    auc = roc_auc_score(y_test, y_pred)

    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], m['auc']))

# custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()



AttributeError: 'CustomRandomForest' object has no attribute 'predict_proba'

In [7]:
param_grids = {
    'DecisionTree': {
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'XGBoost': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10]
    },
    'LightGBM': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10]
    },
    'Bagging': {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 1.0],
        'max_features': [0.5, 1.0]
    },
    'BalancedBagging': {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 1.0],
        'max_features': [0.5, 1.0]
    }
}


In [None]:
# list of models for hyperparameter tuning
gridsearch_models = [
    CustomDecisionTree(),
    CustomXGBoost(),
    CustomLightGBM(),
    CustomBaggingClassifier(base_estimator=estimator),
    CustomImblearnBalancedBagging(base_estimator=estimator)
]

In [None]:
def run_grid_search(model, params, X, y):
    grid_search = GridSearchCV(model, param_grid=params, cv=5, scoring='recall')
    grid_search.fit(X, y)
    
    print(f"Best parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    print(f"Best score for {model.__class__.__name__}: {grid_search.best_score_}")

In [None]:
for model_name, model in models.items():
    print(f"Performing GridSearch on {model_name}...")
    params = param_grids.get(model_name, {})
    run_grid_search(model, params, X_train, y_train)