In [102]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix

# IMB Learn SMOTE implementation
from imblearn.over_sampling import SMOTE

# Import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [103]:
# Load Datasets
data_train_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_1.csv')
data_test_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv('~/Projects/water-ml/datasets/sheet_3.csv')

# Remove DWDS_sim_rows from data
dwds = data_test_raw[data_test_raw['Location'] == 'DWDS Simulator (EPA, 2016)']

# Drop DWDS sim data from sheets 2&3 (test_data)
data_test = data_test_raw[data_test_raw['Location'] != 'DWDS Simulator (EPA, 2016)']

# Concatenate train data and dwds data
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = data_train_raw.drop(target_columns, axis=1)
y_train = data_train_raw['Scheme'].map({'Stable': 1, 'Failure': 0})
X_train.replace('ND', 0, inplace=True)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = data_test.drop(target_columns, axis=1)
y_test = data_test['Scheme'].map({'Stable': 1, 'Failure': 0})
X_test.replace('ND', 0, inplace=True)
X_test.fillna(0, inplace=True)


In [104]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    mcc = matthews_corrcoef(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")
        
def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

In [105]:
# Handle class imbalance, smote addresses class imbalance, no need to address the kfold split
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Handle class imbalance, smote address class imbalance, no need to address the 
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [106]:
from sklearn.ensemble import RandomForestClassifier

# Tuned Random Forest
# eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=15))
# Confusion Matrix:
# [[11 37]
#  [ 5 97]]

# True Negatives (TN): 11
# False Positives (FP): 37
# False Negatives (FN): 5
# True Positives (TP): 97

# Accuracy: 0.720
# Precision: 0.724
# Recall: 0.951
# Specificity: 0.229
# F1 Score: 0.822
# Matthews Correlation Coefficient (MCC): 0.272

# Vanilla Random Forest (default parameters)
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[ 6 42]
 [ 5 97]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.687
Precision: 0.698
Recall: 0.951
Specificity: 0.125
F1 Score: 0.805
Matthews Correlation Coefficient (MCC): 0.136


In [107]:
from sklearn.tree import DecisionTreeClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=DecisionTreeClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[13 35]
 [ 7 95]]

True Negatives (TN): 13
False Positives (FP): 35
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.720
Precision: 0.731
Recall: 0.931
Specificity: 0.271
F1 Score: 0.819
Matthews Correlation Coefficient (MCC): 0.277




In [108]:
from sklearn.linear_model import LogisticRegression

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=LogisticRegression())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 46   2]
 [101   1]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 101
True Positives (TP): 1

Accuracy: 0.313
Precision: 0.333
Recall: 0.010
Specificity: 0.958
F1 Score: 0.019
Matthews Correlation Coefficient (MCC): -0.106




In [109]:
from sklearn.svm import SVC

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=SVC())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  6  42]
 [  2 100]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 2
True Positives (TP): 100

Accuracy: 0.707
Precision: 0.704
Recall: 0.980
Specificity: 0.125
F1 Score: 0.820
Matthews Correlation Coefficient (MCC): 0.219




In [110]:
from sklearn.naive_bayes import GaussianNB

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GaussianNB())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[37 11]
 [11 91]]

True Negatives (TN): 37
False Positives (FP): 11
False Negatives (FN): 11
True Positives (TP): 91

Accuracy: 0.853
Precision: 0.892
Recall: 0.892
Specificity: 0.771
F1 Score: 0.892
Matthews Correlation Coefficient (MCC): 0.663




In [111]:
from sklearn.neighbors import KNeighborsClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=KNeighborsClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[31 17]
 [12 90]]

True Negatives (TN): 31
False Positives (FP): 17
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.807
Precision: 0.841
Recall: 0.882
Specificity: 0.646
F1 Score: 0.861
Matthews Correlation Coefficient (MCC): 0.545




In [112]:
from sklearn.ensemble import GradientBoostingClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GradientBoostingClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[33 15]
 [15 87]]

True Negatives (TN): 33
False Positives (FP): 15
False Negatives (FN): 15
True Positives (TP): 87

Accuracy: 0.800
Precision: 0.853
Recall: 0.853
Specificity: 0.688
F1 Score: 0.853
Matthews Correlation Coefficient (MCC): 0.540


In [113]:
from sklearn.ensemble import ExtraTreesClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=ExtraTreesClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)    



Confusion Matrix:
[[27 21]
 [ 6 96]]

True Negatives (TN): 27
False Positives (FP): 21
False Negatives (FN): 6
True Positives (TP): 96

Accuracy: 0.820
Precision: 0.821
Recall: 0.941
Specificity: 0.562
F1 Score: 0.877
Matthews Correlation Coefficient (MCC): 0.567


In [114]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid for each model
param_grid = {
    'RandomForestClassifier': {
        'base_estimator__n_estimators': [50, 100, 200],
        'base_estimator__max_depth': [3, 5, 7],
        'base_estimator__min_samples_split': [2, 5, 10]
    },
    'DecisionTreeClassifier': {
        'base_estimator__max_depth': [3, 5, 7],
        'base_estimator__min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'base_estimator__C': [0.1, 1, 10],
        'base_estimator__solver': ['liblinear', 'saga']
    },
    'SVC': {
        'base_estimator__C': [0.1, 1, 10],
        'base_estimator__kernel': ['linear', 'rbf']
    },
    'GaussianNB': {
        'base_estimator__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]    
    },  
    'KNeighborsClassifier': {
        'base_estimator__n_neighbors': [3, 5, 7],
        'base_estimator__weights': ['uniform', 'distance']
    },
    'GradientBoostingClassifier': {
        'base_estimator__n_estimators': [50, 100, 200],
        'base_estimator__learning_rate': [0.01, 0.1, 1]
    },
    'ExtraTreesClassifier': {
        'base_estimator__n_estimators': [50, 100, 200],
        'base_estimator__max_depth': [3, 5, 7]
    }
}

# Define the models to evaluate
models = {
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier()
}

# Perform grid search for each model
# for model_name, base_estimator in models.items():
#     print(f"Grid search for {model_name}:")
    
#     eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator)
    
#     grid_search = GridSearchCV(
#         estimator=eec,
#         param_grid=param_grid[model_name],
#         scoring='accuracy',
#         cv=10,
#         n_jobs=-1,
#         verbose=1
#     )
    
#     grid_search.fit(X_train, y_train)
    
#     print(f"Best parameters for {model_name}: {grid_search.best_params_}")
#     print(f"Best score for {model_name}: {grid_search.best_score_}")
#     print()

# Perform grid search for each model
for model_name, base_estimator in models.items():
    print(f"Grid search for {model_name}:")
    
    eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator)
    
    grid_search = GridSearchCV(
        estimator=eec,
        param_grid=param_grid[model_name],
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Test set accuracy for {model_name}: {accuracy}")
    print()

Grid search for RandomForestClassifier:
Fitting 10 folds for each of 27 candidates, totalling 270 fits




Best parameters for RandomForestClassifier: {'base_estimator__max_depth': 3, 'base_estimator__min_samples_split': 2, 'base_estimator__n_estimators': 50}
Test set accuracy for RandomForestClassifier: 0.6866666666666666

Grid search for DecisionTreeClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for DecisionTreeClassifier: {'base_estimator__max_depth': 3, 'base_estimator__min_samples_split': 5}
Test set accuracy for DecisionTreeClassifier: 0.6866666666666666

Grid search for LogisticRegression:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for LogisticRegression: {'base_estimator__C': 10, 'base_estimator__solver': 'liblinear'}
Test set accuracy for LogisticRegression: 0.31333333333333335

Grid search for SVC:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for SVC: {'base_estimator__C': 1, 'base_estimator__kernel': 'rbf'}
Test set accuracy for SVC: 0.7133333333333334

Grid search for GaussianNB:
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters for GaussianNB: {'base_estimator__var_smoothing': 1e-09}
Test set accuracy for GaussianNB: 0.8533333333333334

Grid search for KNeighborsClassifier:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for KNeighborsClassifier: {'base_estimator__n_neighbors': 3, 'base_estimator__weights': 'uniform'}
Test set accuracy for KNeighborsClassifier: 0.6933333333333334

Grid search for GradientBoostingClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for GradientBoostingClassifier: {'base_estimator__learning_rate': 0.01, 'base_estimator__n_estimators': 50}
Test set accuracy for GradientBoostingClassifier: 0.7266666666666667

Grid search for ExtraTreesClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for ExtraTreesClassifier: {'base_estimator__max_depth': 3, 'base_estimator__n_estimators': 50}
Test set accuracy for ExtraTreesClassifier: 0.84





In [115]:
# 'base_estimator__max_depth': 3, 'base_estimator__min_samples_split': 2, 'base_estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier(max_depth=3, min_samples_split=2, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[ 6 42]
 [ 5 97]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.687
Precision: 0.698
Recall: 0.951
Specificity: 0.125
F1 Score: 0.805
Matthews Correlation Coefficient (MCC): 0.136


In [116]:
# 'base_estimator__max_depth': 3, 'base_estimator__min_samples_split': 2
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=DecisionTreeClassifier(max_depth=3, min_samples_split=2))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  0  48]
 [  0 102]]

True Negatives (TN): 0
False Positives (FP): 48
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.680
Precision: 0.680
Recall: 1.000
Specificity: 0.000
F1 Score: 0.810
Matthews Correlation Coefficient (MCC): 0.000




In [117]:
# 'base_estimator__C': 10, 'base_estimator__solver': 'liblinear'
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=LogisticRegression(C=10, solver='liblinear'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 46   2]
 [100   2]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 100
True Positives (TP): 2

Accuracy: 0.320
Precision: 0.500
Recall: 0.020
Specificity: 0.958
F1 Score: 0.038
Matthews Correlation Coefficient (MCC): -0.064




In [118]:
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=SVC(C=10, kernel='rbf'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  0  48]
 [  0 102]]

True Negatives (TN): 0
False Positives (FP): 48
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.680
Precision: 0.680
Recall: 1.000
Specificity: 0.000
F1 Score: 0.810
Matthews Correlation Coefficient (MCC): 0.000




In [119]:
# GaussianNB
# Confusion Matrix:
# [[37 11]
#  [11 91]]

# True Negatives (TN): 37
# False Positives (FP): 11
# False Negatives (FN): 11
# True Positives (TP): 91

# Accuracy: 0.853
# Precision: 0.892
# Recall: 0.892
# Specificity: 0.771
# F1 Score: 0.892
# Matthews Correlation Coefficient (MCC): 0.663

# 'base_estimator__var_smoothing': 1e-09
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GaussianNB())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[38 10]
 [12 90]]

True Negatives (TN): 38
False Positives (FP): 10
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.853
Precision: 0.900
Recall: 0.882
Specificity: 0.792
F1 Score: 0.891
Matthews Correlation Coefficient (MCC): 0.667




In [120]:
# 'base_estimator__n_neighbors': 3, 'base_estimator__weights': 'uniform'
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=KNeighborsClassifier(n_neighbors=3, weights='uniform'))

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  1  47]
 [  0 102]]

True Negatives (TN): 1
False Positives (FP): 47
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.687
Precision: 0.685
Recall: 1.000
Specificity: 0.021
F1 Score: 0.813
Matthews Correlation Coefficient (MCC): 0.119




In [121]:
# 'base_estimator__learning_rate': 0.01, 'base_estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GradientBoostingClassifier(learning_rate=0.01, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[14 34]
 [ 7 95]]

True Negatives (TN): 14
False Positives (FP): 34
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.727
Precision: 0.736
Recall: 0.931
Specificity: 0.292
F1 Score: 0.823
Matthews Correlation Coefficient (MCC): 0.300


In [122]:
# 'base_estimator__max_depth': 3, 'base_estimator__n_estimators': 50
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=ExtraTreesClassifier(max_depth=3, n_estimators=50))

fit_model(eec, X_train, y_train, X_test, y_test)    



Confusion Matrix:
[[24 24]
 [ 5 97]]

True Negatives (TN): 24
False Positives (FP): 24
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.807
Precision: 0.802
Recall: 0.951
Specificity: 0.500
F1 Score: 0.870
Matthews Correlation Coefficient (MCC): 0.533
