In [2]:
import os
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'

import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix

# IMB Learn SMOTE implementation
from imblearn.over_sampling import SMOTE

# Import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

I want to try all of these using adasyn and gaussian noise augementation, compare the augmentation methods. And leave one out

In [5]:
# Load Datasets
data_train_raw = pd.read_csv('/media/matt/1TB/Projects/water-ml/datasets/sheet_1.csv')
data_test_raw = pd.read_csv('/media/matt/1TB/Projects/water-ml/datasets/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv('/media/matt/1TB/Projects/water-ml/datasets/sheet_3.csv')

# Remove DWDS_sim_rows from data
dwds = data_test_raw[data_test_raw['Location'] == 'DWDS Simulator (EPA, 2016)']

# Drop DWDS sim data from sheets 2&3 (test_data)
data_test = data_test_raw[data_test_raw['Location'] != 'DWDS Simulator (EPA, 2016)']

# Concatenate train data and dwds data
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = data_train_raw.drop(target_columns, axis=1)
y_train = data_train_raw['Scheme'].map({'Stable': 1, 'Failure': 0})
X_train.replace('ND', 0, inplace=True)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = data_test.drop(target_columns, axis=1)
y_test = data_test['Scheme'].map({'Stable': 1, 'Failure': 0})
X_test.replace('ND', 0, inplace=True)
X_test.fillna(0, inplace=True)


In [6]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    mcc = matthews_corrcoef(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")
        
def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

In [7]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [8]:
from imblearn.over_sampling import ADASYN

adasyn_oversampler = ADASYN(sampling_strategy='auto', random_state=42)
X_train_adasyn, y_train_adasyn = adasyn_oversampler.fit_resample(X_train, y_train)

In [9]:
from imblearn.over_sampling import RandomOverSampler

gaussian_oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_gaussian, y_train_gaussian = gaussian_oversampler.fit_resample(X_train, y_train)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Tuned Random Forest
# eec = EasyEnsembleClassifier(n_estimators=10, estimator=RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=15))
# Confusion Matrix:
# [[11 37]
#  [ 5 97]]

# True Negatives (TN): 11
# False Positives (FP): 37
# False Negatives (FN): 5
# True Positives (TP): 97

# Accuracy: 0.720
# Precision: 0.724
# Recall: 0.951
# Specificity: 0.229
# F1 Score: 0.822
# Matthews Correlation Coefficient (MCC): 0.272

# Vanilla Random Forest (default parameters)
eec = EasyEnsembleClassifier(n_estimators=10, estimator=RandomForestClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 7 41]
 [ 5 97]]

True Negatives (TN): 7
False Positives (FP): 41
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.693
Precision: 0.703
Recall: 0.951
Specificity: 0.146
F1 Score: 0.808
Matthews Correlation Coefficient (MCC): 0.166


In [7]:
from sklearn.tree import DecisionTreeClassifier

eec = EasyEnsembleClassifier(n_estimators=10, estimator=DecisionTreeClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  0  48]
 [  0 102]]

True Negatives (TN): 0
False Positives (FP): 48
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.680
Precision: 0.680
Recall: 1.000
Specificity: 0.000
F1 Score: 0.810
Matthews Correlation Coefficient (MCC): 0.000


In [8]:
from sklearn.linear_model import LogisticRegression

eec = EasyEnsembleClassifier(n_estimators=10, estimator=LogisticRegression())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 46   2]
 [101   1]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 101
True Positives (TP): 1

Accuracy: 0.313
Precision: 0.333
Recall: 0.010
Specificity: 0.958
F1 Score: 0.019
Matthews Correlation Coefficient (MCC): -0.106


In [9]:
from sklearn.svm import SVC

eec = EasyEnsembleClassifier(n_estimators=10, estimator=SVC())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[46  2]
 [99  3]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 99
True Positives (TP): 3

Accuracy: 0.327
Precision: 0.600
Recall: 0.029
Specificity: 0.958
F1 Score: 0.056
Matthews Correlation Coefficient (MCC): -0.032


In [10]:
from sklearn.naive_bayes import GaussianNB

eec = EasyEnsembleClassifier(n_estimators=10, estimator=GaussianNB())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[38 10]
 [12 90]]

True Negatives (TN): 38
False Positives (FP): 10
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.853
Precision: 0.900
Recall: 0.882
Specificity: 0.792
F1 Score: 0.891
Matthews Correlation Coefficient (MCC): 0.667


In [11]:
from sklearn.neighbors import KNeighborsClassifier

eec = EasyEnsembleClassifier(n_estimators=10, estimator=KNeighborsClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[31 17]
 [12 90]]

True Negatives (TN): 31
False Positives (FP): 17
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.807
Precision: 0.841
Recall: 0.882
Specificity: 0.646
F1 Score: 0.861
Matthews Correlation Coefficient (MCC): 0.545


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

eec = EasyEnsembleClassifier(n_estimators=10, estimator=GradientBoostingClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 6 42]
 [ 5 97]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.687
Precision: 0.698
Recall: 0.951
Specificity: 0.125
F1 Score: 0.805
Matthews Correlation Coefficient (MCC): 0.136


In [13]:
from sklearn.ensemble import ExtraTreesClassifier

eec = EasyEnsembleClassifier(n_estimators=10, estimator=ExtraTreesClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)    

Confusion Matrix:
[[29 19]
 [ 6 96]]

True Negatives (TN): 29
False Positives (FP): 19
False Negatives (FN): 6
True Positives (TP): 96

Accuracy: 0.833
Precision: 0.835
Recall: 0.941
Specificity: 0.604
F1 Score: 0.885
Matthews Correlation Coefficient (MCC): 0.601


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.ensemble import EasyEnsembleClassifier

def grid_search_augmented(X_train_smote, y_train_smote, X_train_gaussian, y_train_gaussian, X_train_adasyn, y_train_adasyn, X_test, y_test):
    # Define the parameter grid for each model
    param_grid = {
        'RandomForestClassifier': {
            'estimator__n_estimators': [50, 100, 200],
            'estimator__max_depth': [3, 5, 7],
            'estimator__min_samples_split': [2, 5, 10]
        },
        'DecisionTreeClassifier': {
            'estimator__max_depth': [3, 5, 7],
            'estimator__min_samples_split': [2, 5, 10]
        },
        'LogisticRegression': {
            'estimator__C': [0.1, 1, 10],
            'estimator__solver': ['liblinear', 'saga']
        },
        'SVC': {
            'estimator__C': [0.1, 1, 10, 15, 20, 25],
            'estimator__kernel': ['linear', 'rbf']
        },
        'GaussianNB': {
            'estimator__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
        },
        'KNeighborsClassifier': {
            'estimator__n_neighbors': [3, 5, 7],
            'estimator__weights': ['uniform', 'distance']
        },
        'GradientBoostingClassifier': {
            'estimator__n_estimators': [50, 100, 200],
            'estimator__learning_rate': [0.01, 0.1, 1]
        },
        'ExtraTreesClassifier': {
            'estimator__n_estimators': [50, 100, 200],
            'estimator__criterion': ['gini', 'entropy', 'log_loss'],
            'estimator__max_depth': [None, 5, 10, 20]
        }
    }

    # Define the models to evaluate
    models = {
        'RandomForestClassifier': RandomForestClassifier(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC(),
        'GaussianNB': GaussianNB(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier()
    }

    # Perform grid search for each model and augmentation method
    for model_name, estimator in models.items():
        print(f"Grid search for {model_name}:")

        # SMOTE
        print("SMOTE:")
        eec_smote = EasyEnsembleClassifier(n_estimators=10, estimator=estimator)
        grid_search_smote = GridSearchCV(
            estimator=eec_smote,
            param_grid=param_grid[model_name],
            scoring='accuracy',
            cv=10,
            n_jobs=-1,
            verbose=1
        )
        grid_search_smote.fit(X_train_smote, y_train_smote)
        best_model_smote = grid_search_smote.best_estimator_
        y_pred_smote = best_model_smote.predict(X_test)
        accuracy_smote = accuracy_score(y_test, y_pred_smote)
        print(f"Best parameters for {model_name} with SMOTE: {grid_search_smote.best_params_}")
        print(f"Test set accuracy for {model_name} with SMOTE: {accuracy_smote}")
        print()

        # Gaussian
        print("Gaussian:")
        eec_gaussian = EasyEnsembleClassifier(n_estimators=10, estimator=estimator)
        grid_search_gaussian = GridSearchCV(
            estimator=eec_gaussian,
            param_grid=param_grid[model_name],
            scoring='accuracy',
            cv=10,
            n_jobs=-1,
            verbose=1
        )
        grid_search_gaussian.fit(X_train_gaussian, y_train_gaussian)
        best_model_gaussian = grid_search_gaussian.best_estimator_
        y_pred_gaussian = best_model_gaussian.predict(X_test)
        accuracy_gaussian = accuracy_score(y_test, y_pred_gaussian)
        print(f"Best parameters for {model_name} with Gaussian: {grid_search_gaussian.best_params_}")
        print(f"Test set accuracy for {model_name} with Gaussian: {accuracy_gaussian}")
        print()

        # ADASYN
        print("ADASYN:")
        eec_adasyn = EasyEnsembleClassifier(n_estimators=10, estimator=estimator)
        grid_search_adasyn = GridSearchCV(
            estimator=eec_adasyn,
            param_grid=param_grid[model_name],
            scoring='accuracy',
            cv=10,
            n_jobs=-1,
            verbose=1
        )
        grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)
        best_model_adasyn = grid_search_adasyn.best_estimator_
        y_pred_adasyn = best_model_adasyn.predict(X_test)
        accuracy_adasyn = accuracy_score(y_test, y_pred_adasyn)
        print(f"Best parameters for {model_name} with ADASYN: {grid_search_adasyn.best_params_}")
        print(f"Test set accuracy for {model_name} with ADASYN: {accuracy_adasyn}")
        print()

In [11]:
grid_search_augmented(X_train_smote, y_train_smote, X_train_gaussian, y_train_gaussian, X_train_adasyn, y_train_adasyn, X_test, y_test)

Grid search for RandomForestClassifier:
SMOTE:
Fitting 10 folds for each of 27 candidates, totalling 270 fits
Best parameters for RandomForestClassifier with SMOTE: {'estimator__max_depth': 3, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50}
Test set accuracy for RandomForestClassifier with SMOTE: 0.6933333333333334

Gaussian:
Fitting 10 folds for each of 27 candidates, totalling 270 fits
Best parameters for RandomForestClassifier with Gaussian: {'estimator__max_depth': 3, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50}
Test set accuracy for RandomForestClassifier with Gaussian: 0.6933333333333334

ADASYN:
Fitting 10 folds for each of 27 candidates, totalling 270 fits
Best parameters for RandomForestClassifier with ADASYN: {'estimator__max_depth': 3, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50}
Test set accuracy for RandomForestClassifier with ADASYN: 0.68

Grid search for DecisionTreeClassifier:
SMOTE:
Fitting 10 folds for each o

In [15]:
# 'estimator__max_depth': 3, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, estimator=RandomForestClassifier(max_depth=3, min_samples_split=2, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 4 44]
 [ 3 99]]

True Negatives (TN): 4
False Positives (FP): 44
False Negatives (FN): 3
True Positives (TP): 99

Accuracy: 0.687
Precision: 0.692
Recall: 0.971
Specificity: 0.083
F1 Score: 0.808
Matthews Correlation Coefficient (MCC): 0.119


In [16]:
# 'estimator__max_depth': 3, 'estimator__min_samples_split': 2
eec = EasyEnsembleClassifier(n_estimators=10, estimator=DecisionTreeClassifier(max_depth=3, min_samples_split=2))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[14 34]
 [ 7 95]]

True Negatives (TN): 14
False Positives (FP): 34
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.727
Precision: 0.736
Recall: 0.931
Specificity: 0.292
F1 Score: 0.823
Matthews Correlation Coefficient (MCC): 0.300


In [17]:
# 'estimator__C': 10, 'estimator__solver': 'liblinear'
eec = EasyEnsembleClassifier(n_estimators=10, estimator=LogisticRegression(C=10, solver='liblinear'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 46   2]
 [100   2]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 100
True Positives (TP): 2

Accuracy: 0.320
Precision: 0.500
Recall: 0.020
Specificity: 0.958
F1 Score: 0.038
Matthews Correlation Coefficient (MCC): -0.064


In [18]:
eec = EasyEnsembleClassifier(n_estimators=10, estimator=SVC(C=10, kernel='rbf'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  2  46]
 [  0 102]]

True Negatives (TN): 2
False Positives (FP): 46
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.693
Precision: 0.689
Recall: 1.000
Specificity: 0.042
F1 Score: 0.816
Matthews Correlation Coefficient (MCC): 0.169


In [19]:
# GaussianNB
# Confusion Matrix:
# [[37 11]
#  [11 91]]

# True Negatives (TN): 37
# False Positives (FP): 11
# False Negatives (FN): 11
# True Positives (TP): 91

# Accuracy: 0.853
# Precision: 0.892
# Recall: 0.892
# Specificity: 0.771
# F1 Score: 0.892
# Matthews Correlation Coefficient (MCC): 0.663

# 'estimator__var_smoothing': 1e-09
eec = EasyEnsembleClassifier(n_estimators=10, estimator=GaussianNB())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[37 11]
 [11 91]]

True Negatives (TN): 37
False Positives (FP): 11
False Negatives (FN): 11
True Positives (TP): 91

Accuracy: 0.853
Precision: 0.892
Recall: 0.892
Specificity: 0.771
F1 Score: 0.892
Matthews Correlation Coefficient (MCC): 0.663


In [20]:
# 'estimator__n_neighbors': 3, 'estimator__weights': 'uniform'
eec = EasyEnsembleClassifier(n_estimators=10, estimator=KNeighborsClassifier(n_neighbors=3, weights='uniform'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  2  46]
 [  0 102]]

True Negatives (TN): 2
False Positives (FP): 46
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.693
Precision: 0.689
Recall: 1.000
Specificity: 0.042
F1 Score: 0.816
Matthews Correlation Coefficient (MCC): 0.169


In [21]:
# 'estimator__learning_rate': 0.01, 'estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, estimator=GradientBoostingClassifier(learning_rate=0.01, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[47  1]
 [97  5]]

True Negatives (TN): 47
False Positives (FP): 1
False Negatives (FN): 97
True Positives (TP): 5

Accuracy: 0.347
Precision: 0.833
Recall: 0.049
Specificity: 0.979
F1 Score: 0.093
Matthews Correlation Coefficient (MCC): 0.067


In [22]:
# 'estimator__max_depth': 3, 'estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, estimator=ExtraTreesClassifier(max_depth=3, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)    

Confusion Matrix:
[[25 23]
 [ 5 97]]

True Negatives (TN): 25
False Positives (FP): 23
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.813
Precision: 0.808
Recall: 0.951
Specificity: 0.521
F1 Score: 0.874
Matthews Correlation Coefficient (MCC): 0.550


In [12]:
# 'estimator__max_depth': 3, 'estimator__n_estimators': 50
# eec = EasyEnsembleClassifier(n_estimators=10, estimator=ExtraTreesClassifier(max_depth=3, n_estimators=50))
estimator=ExtraTreesClassifier(max_depth=3, n_estimators=50)

fit_model(estimator, X_train, y_train, X_test, y_test)    

Confusion Matrix:
[[27 21]
 [ 6 96]]

True Negatives (TN): 27
False Positives (FP): 21
False Negatives (FN): 6
True Positives (TP): 96

Accuracy: 0.820
Precision: 0.821
Recall: 0.941
Specificity: 0.562
F1 Score: 0.877
Matthews Correlation Coefficient (MCC): 0.567


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

def grid_search_augmented(X_train_smote, y_train_smote, X_train_gaussian, y_train_gaussian, X_train_adasyn, y_train_adasyn, X_test, y_test):
    # Define the parameter grid for ExtraTreesClassifier
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }

    # Define the ExtraTreesClassifier model
    model = ExtraTreesClassifier()

    # Perform grid search for each augmentation method
    print("Grid search for ExtraTreesClassifier:")

    # SMOTE
    print("SMOTE:")
    grid_search_smote = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    grid_search_smote.fit(X_train_smote, y_train_smote)
    best_model_smote = grid_search_smote.best_estimator_
    y_pred_smote = best_model_smote.predict(X_test)
    accuracy_smote = accuracy_score(y_test, y_pred_smote)
    print(f"Best parameters for ExtraTreesClassifier with SMOTE: {grid_search_smote.best_params_}")
    print(f"Test set accuracy for ExtraTreesClassifier with SMOTE: {accuracy_smote}")
    print()

    # Gaussian
    print("Gaussian:")
    grid_search_gaussian = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    grid_search_gaussian.fit(X_train_gaussian, y_train_gaussian)
    best_model_gaussian = grid_search_gaussian.best_estimator_
    y_pred_gaussian = best_model_gaussian.predict(X_test)
    accuracy_gaussian = accuracy_score(y_test, y_pred_gaussian)
    print(f"Best parameters for ExtraTreesClassifier with Gaussian: {grid_search_gaussian.best_params_}")
    print(f"Test set accuracy for ExtraTreesClassifier with Gaussian: {accuracy_gaussian}")
    print()

    # ADASYN
    print("ADASYN:")
    grid_search_adasyn = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)
    best_model_adasyn = grid_search_adasyn.best_estimator_
    y_pred_adasyn = best_model_adasyn.predict(X_test)
    accuracy_adasyn = accuracy_score(y_test, y_pred_adasyn)
    print(f"Best parameters for ExtraTreesClassifier with ADASYN: {grid_search_adasyn.best_params_}")
    print(f"Test set accuracy for ExtraTreesClassifier with ADASYN: {accuracy_adasyn}")
    print()

In [17]:
grid_search_augmented(X_train_smote, y_train_smote, X_train_gaussian, y_train_gaussian, X_train_adasyn, y_train_adasyn, X_test, y_test)

Grid search for ExtraTreesClassifier:
SMOTE:
Fitting 10 folds for each of 324 candidates, totalling 3240 fits
Best parameters for ExtraTreesClassifier with SMOTE: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Test set accuracy for ExtraTreesClassifier with SMOTE: 0.82

Gaussian:
Fitting 10 folds for each of 324 candidates, totalling 3240 fits
Best parameters for ExtraTreesClassifier with Gaussian: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Test set accuracy for ExtraTreesClassifier with Gaussian: 0.7933333333333333

ADASYN:
Fitting 10 folds for each of 324 candidates, totalling 3240 fits
Best parameters for ExtraTreesClassifier with ADASYN: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Test set accuracy for ExtraTreesClassifier with ADASYN: 0.8533333333333334



In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef, make_scorer, accuracy_score
import numpy as np
import pandas as pd
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    mcc = matthews_corrcoef(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")
    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")

def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

def grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test):
    # Define the expanded parameter grid for ExtraTreesClassifier
    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy']
    }

    # Define the ExtraTreesClassifier model
    model = ExtraTreesClassifier()

    # Define the scoring functions
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Perform grid search for ADASYN
    print("Grid search for ExtraTreesClassifier with ADASYN:")
    grid_search_adasyn = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        refit='accuracy',  # Specify the metric to use for refitting the best model
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)

    # Get the best model based on accuracy
    best_model_accuracy = grid_search_adasyn.best_estimator_
    best_params_accuracy = grid_search_adasyn.best_params_

    # Get the best model based on MCC score
    best_index_mcc = np.argmax(grid_search_adasyn.cv_results_['mean_test_mcc'])
    best_params_mcc = grid_search_adasyn.cv_results_['params'][best_index_mcc]
    best_model_mcc = ExtraTreesClassifier(**best_params_mcc)

    # Evaluate the best model based on accuracy
    print("\nBest Model based on Accuracy:")
    print(f"Best parameters: {best_params_accuracy}")
    fit_model(best_model_accuracy, X_train_adasyn, y_train_adasyn, X_test, y_test)

    # Evaluate the best model based on MCC score
    print("\nBest Model based on MCC Score:")
    print(f"Best parameters: {best_params_mcc}")
    fit_model(best_model_mcc, X_train_adasyn, y_train_adasyn, X_test, y_test)

# Example usage
grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test)

Grid search for ExtraTreesClassifier with ADASYN:
Fitting 10 folds for each of 2160 candidates, totalling 21600 fits

Best Model based on Accuracy:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[ 4 44]
 [ 4 98]]

True Negatives (TN): 4
False Positives (FP): 44
False Negatives (FN): 4
True Positives (TP): 98

Accuracy: 0.680
Precision: 0.690
Recall: 0.961
Specificity: 0.083
F1 Score: 0.803
Matthews Correlation Coefficient (MCC): 0.092

Best Model based on MCC Score:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[34 14]
 [ 7 95]]

True Negatives (TN): 34
False Positives (FP): 14
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.860
Precision: 0.872
Recall: 0.931
Specificity: 0.708
F1 Score: 0.900
Matthews 

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef, make_scorer, accuracy_score
import numpy as np
import pandas as pd
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    mcc = matthews_corrcoef(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")
    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")

def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

def grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test):
    # Frozen parameters
    frozen_params = {
        'bootstrap': [True],
        'criterion': ['gini'],
        'max_depth': [3],
        'max_features': ['sqrt'],
        'min_samples_leaf': [1],
        'min_samples_split': [2],
        'n_estimators': [50]
    }

    # Additional parameters to tune
    param_grid = {
        # 'class_weight': [None, 'balanced', 'balanced_subsample'],
        # 'ccp_alpha': [0.0, 0.1, 0.2],
        # 'max_samples': [None, 0.5, 0.7, 0.9],
        # 'oob_score': [True, False],
        # 'warm_start': [True, False],
        # 'n_jobs': [-1]
    }

    # Combine frozen parameters and additional parameters
    param_grid.update(frozen_params)

    # Define the ExtraTreesClassifier model
    model = ExtraTreesClassifier()

    # Define the scoring functions
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Perform grid search for ADASYN
    print("Grid search for ExtraTreesClassifier with ADASYN:")
    grid_search_adasyn = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        refit='accuracy',  # Specify the metric to use for refitting the best model
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)

    # Get the best model based on accuracy
    best_model_accuracy = grid_search_adasyn.best_estimator_
    best_params_accuracy = grid_search_adasyn.best_params_

    # Get the best model based on MCC score
    best_index_mcc = np.argmax(grid_search_adasyn.cv_results_['mean_test_mcc'])
    best_params_mcc = grid_search_adasyn.cv_results_['params'][best_index_mcc]
    best_model_mcc = ExtraTreesClassifier(**best_params_mcc)

    # Evaluate the best model based on accuracy
    print("\nBest Model based on Accuracy:")
    print(f"Best parameters: {best_params_accuracy}")
    fit_model(best_model_accuracy, X_train_adasyn, y_train_adasyn, X_test, y_test)

    # Evaluate the best model based on MCC score
    print("\nBest Model based on MCC Score:")
    print(f"Best parameters: {best_params_mcc}")
    fit_model(best_model_mcc, X_train_adasyn, y_train_adasyn, X_test, y_test)

# Example usage
grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test)

Grid search for ExtraTreesClassifier with ADASYN:
Fitting 10 folds for each of 1 candidates, totalling 10 fits

Best Model based on Accuracy:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[26 22]
 [ 7 95]]

True Negatives (TN): 26
False Positives (FP): 22
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.807
Precision: 0.812
Recall: 0.931
Specificity: 0.542
F1 Score: 0.868
Matthews Correlation Coefficient (MCC): 0.533

Best Model based on MCC Score:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[38 10]
 [10 92]]

True Negatives (TN): 38
False Positives (FP): 10
False Negatives (FN): 10
True Positives (TP): 92

Accuracy: 0.867
Precision: 0.902
Recall: 0.902
Specificity: 0.792
F1 Score: 0.902
Matthews Corr

In [13]:
fit_model(estimator, X_train_smote, y_train_smote, X_test, y_test)

Confusion Matrix:
[[22 26]
 [ 4 98]]

True Negatives (TN): 22
False Positives (FP): 26
False Negatives (FN): 4
True Positives (TP): 98

Accuracy: 0.800
Precision: 0.790
Recall: 0.961
Specificity: 0.458
F1 Score: 0.867
Matthews Correlation Coefficient (MCC): 0.516


In [14]:
fit_model(estimator, X_train_adasyn, y_train_adasyn, X_test, y_test)

Confusion Matrix:
[[35 13]
 [12 90]]

True Negatives (TN): 35
False Positives (FP): 13
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.833
Precision: 0.874
Recall: 0.882
Specificity: 0.729
F1 Score: 0.878
Matthews Correlation Coefficient (MCC): 0.615


In [15]:
fit_model(estimator, X_train_gaussian, y_train_gaussian, X_test, y_test)

Confusion Matrix:
[[28 20]
 [ 7 95]]

True Negatives (TN): 28
False Positives (FP): 20
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.820
Precision: 0.826
Recall: 0.931
Specificity: 0.583
F1 Score: 0.876
Matthews Correlation Coefficient (MCC): 0.568


In [23]:
from sklearn.ensemble import ExtraTreesClassifier

# Define the parameter grid for ExtraTreesClassifier
param_grid_etc = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [None, 10, 20, 30],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'n_jobs': [-1],
    'random_state': [42],
    'verbose': [0],
    'warm_start': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    'ccp_alpha': [0.0, 0.1, 0.2],
    'max_samples': [None, 0.5, 0.7, 0.9]
    # 'monotonic_cst': [[1] * X_train.shape[1], [0] * X_train.shape[1], [-1] * X_train.shape[1], None]
}

# Create an instance of ExtraTreesClassifier
etc = ExtraTreesClassifier()

# Perform grid search
grid_search_etc = GridSearchCV(
    estimator=etc,
    param_grid=param_grid_etc,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1
)

# Fit the grid search object
grid_search_etc.fit(X_train, y_train)

# Get the best model and its parameters
best_etc = grid_search_etc.best_estimator_
best_params_etc = grid_search_etc.best_params_

# Evaluate the best model on the test set
y_pred_etc = best_etc.predict(X_test)
accuracy_etc = accuracy_score(y_test, y_pred_etc)

print(f"Best parameters for ExtraTreesClassifier: {best_params_etc}")
print(f"Test set accuracy for ExtraTreesClassifier: {accuracy_etc}")

Fitting 10 folds for each of 10077696 candidates, totalling 100776960 fits


KeyboardInterrupt: 