In [204]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix

# IMB Learn SMOTE implementation
from imblearn.over_sampling import SMOTE

# Import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [205]:
# Load Datasets
data_train_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_1.csv')
data_test_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv('~/Projects/water-ml/datasets/sheet_3.csv')

# Remove DWDS_sim_rows from data
dwds = data_test_raw[data_test_raw['Location'] == 'DWDS Simulator (EPA, 2016)']

# Drop DWDS sim data from sheets 2&3 (test_data)
data_test = data_test_raw[data_test_raw['Location'] != 'DWDS Simulator (EPA, 2016)']

# Concatenate train data and dwds data
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = data_train_raw.drop(target_columns, axis=1)
y_train = data_train_raw['Scheme'].map({'Stable': 1, 'Failure': 0})
X_train.replace('ND', 0, inplace=True)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = data_test.drop(target_columns, axis=1)
y_test = data_test['Scheme'].map({'Stable': 1, 'Failure': 0})
X_test.replace('ND', 0, inplace=True)
X_test.fillna(0, inplace=True)


In [206]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    mcc = matthews_corrcoef(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")
        
def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

In [207]:
# Handle class imbalance, smote addresses class imbalance, no need to address the kfold split
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Handle class imbalance, smote address class imbalance, no need to address the 
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [208]:
from sklearn.ensemble import RandomForestClassifier

# Tuned Random Forest
# eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=15))
# Confusion Matrix:
# [[11 37]
#  [ 5 97]]

# True Negatives (TN): 11
# False Positives (FP): 37
# False Negatives (FN): 5
# True Positives (TP): 97

# Accuracy: 0.720
# Precision: 0.724
# Recall: 0.951
# Specificity: 0.229
# F1 Score: 0.822
# Matthews Correlation Coefficient (MCC): 0.272

# Vanilla Random Forest (default parameters)
eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 6 42]
 [ 4 98]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 4
True Positives (TP): 98

Accuracy: 0.693
Precision: 0.700
Recall: 0.961
Specificity: 0.125
F1 Score: 0.810
Matthews Correlation Coefficient (MCC): 0.160


In [209]:
from sklearn.tree import DecisionTreeClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=DecisionTreeClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[47  1]
 [97  5]]

True Negatives (TN): 47
False Positives (FP): 1
False Negatives (FN): 97
True Positives (TP): 5

Accuracy: 0.347
Precision: 0.833
Recall: 0.049
Specificity: 0.979
F1 Score: 0.093
Matthews Correlation Coefficient (MCC): 0.067


In [210]:
from sklearn.linear_model import LogisticRegression

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=LogisticRegression())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 47   1]
 [101   1]]

True Negatives (TN): 47
False Positives (FP): 1
False Negatives (FN): 101
True Positives (TP): 1

Accuracy: 0.320
Precision: 0.500
Recall: 0.010
Specificity: 0.979
F1 Score: 0.019
Matthews Correlation Coefficient (MCC): -0.045


In [211]:
from sklearn.svm import SVC

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=SVC())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[14 34]
 [ 7 95]]

True Negatives (TN): 14
False Positives (FP): 34
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.727
Precision: 0.736
Recall: 0.931
Specificity: 0.292
F1 Score: 0.823
Matthews Correlation Coefficient (MCC): 0.300


In [212]:
from sklearn.naive_bayes import GaussianNB

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GaussianNB())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[38 10]
 [12 90]]

True Negatives (TN): 38
False Positives (FP): 10
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.853
Precision: 0.900
Recall: 0.882
Specificity: 0.792
F1 Score: 0.891
Matthews Correlation Coefficient (MCC): 0.667


In [213]:
from sklearn.neighbors import KNeighborsClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=KNeighborsClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[30 18]
 [12 90]]

True Negatives (TN): 30
False Positives (FP): 18
False Negatives (FN): 12
True Positives (TP): 90

Accuracy: 0.800
Precision: 0.833
Recall: 0.882
Specificity: 0.625
F1 Score: 0.857
Matthews Correlation Coefficient (MCC): 0.527


In [214]:
from sklearn.ensemble import GradientBoostingClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=GradientBoostingClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  1  47]
 [  0 102]]

True Negatives (TN): 1
False Positives (FP): 47
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.687
Precision: 0.685
Recall: 1.000
Specificity: 0.021
F1 Score: 0.813
Matthews Correlation Coefficient (MCC): 0.119


In [215]:
from sklearn.ensemble import ExtraTreesClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=ExtraTreesClassifier())

fit_model(eec, X_train, y_train, X_test, y_test)    

Confusion Matrix:
[[30 18]
 [ 7 95]]

True Negatives (TN): 30
False Positives (FP): 18
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.833
Precision: 0.841
Recall: 0.931
Specificity: 0.625
F1 Score: 0.884
Matthews Correlation Coefficient (MCC): 0.602


In [216]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier

# Define the parameter grid for each model
param_grid = {
    'DecisionTreeClassifier': {
        'base_estimator__max_depth': [3, 5, 7],
        'base_estimator__min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'base_estimator__C': [0.1, 1, 10],
        'base_estimator__solver': ['liblinear', 'saga']
    },
    'SVC': {
        'base_estimator__C': [0.1, 1, 10],
        'base_estimator__kernel': ['linear', 'rbf']
    },
    'GaussianNB': {},  # No hyperparameters to tune for Gaussian Naive Bayes
    'KNeighborsClassifier': {
        'base_estimator__n_neighbors': [3, 5, 7],
        'base_estimator__weights': ['uniform', 'distance']
    },
    'GradientBoostingClassifier': {
        'base_estimator__n_estimators': [50, 100, 200],
        'base_estimator__learning_rate': [0.01, 0.1, 1]
    },
    'ExtraTreesClassifier': {
        'base_estimator__n_estimators': [50, 100, 200],
        'base_estimator__max_depth': [3, 5, 7]
    }
}

# Define the models to evaluate
models = {
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier()
}

# Perform grid search for each model
for model_name, base_estimator in models.items():
    print(f"Grid search for {model_name}:")
    
    eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=base_estimator)
    
    grid_search = GridSearchCV(
        estimator=eec,
        param_grid=param_grid[model_name],
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")
    print()

Grid search for DecisionTreeClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for DecisionTreeClassifier: {'base_estimator__max_depth': 3, 'base_estimator__min_samples_split': 2}
Best score for DecisionTreeClassifier: 1.0

Grid search for LogisticRegression:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for LogisticRegression: {'base_estimator__C': 10, 'base_estimator__solver': 'liblinear'}
Best score for LogisticRegression: 0.975

Grid search for SVC:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for SVC: {'base_estimator__C': 1, 'base_estimator__kernel': 'rbf'}
Best score for SVC: 1.0

Grid search for GaussianNB:
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best parameters for GaussianNB: {}
Best score for GaussianNB: 1.0

Grid search for KNeighborsClassifier:
Fitting 10 folds for each of 6 candidates, totalling 60 fits




Best parameters for KNeighborsClassifier: {'base_estimator__n_neighbors': 3, 'base_estimator__weights': 'uniform'}
Best score for KNeighborsClassifier: 1.0

Grid search for GradientBoostingClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for GradientBoostingClassifier: {'base_estimator__learning_rate': 0.01, 'base_estimator__n_estimators': 50}
Best score for GradientBoostingClassifier: 1.0

Grid search for ExtraTreesClassifier:
Fitting 10 folds for each of 9 candidates, totalling 90 fits




Best parameters for ExtraTreesClassifier: {'base_estimator__max_depth': 3, 'base_estimator__n_estimators': 50}
Best score for ExtraTreesClassifier: 1.0

