In [163]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix

# IMB Learn SMOTE implementation
from imblearn.over_sampling import SMOTE

# Import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [164]:
# Load datasets
DATA_DIR = '~/Projects/water-ml/datasets'
data_train_raw = pd.read_csv(f'{DATA_DIR}/sheet_1.csv')
data_test_raw = pd.read_csv(f'{DATA_DIR}/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv(f'{DATA_DIR}/sheet_3.csv')

# Shared constants for the preparation steps below.
DWDS_LOCATION = 'DWDS Simulator (EPA, 2016)'
SCHEME_LABELS = {'Stable': 1, 'Failure': 0}
ND_MARKER = 'ND'  # presumably "not detected" -- TODO confirm with the data owner


def prepare_features(frame, drop_columns):
    """Build a feature matrix from ``frame``.

    Drops ``drop_columns``, replaces the 'ND' marker with 0 and fills missing
    values with 0. The same cleaning is applied to train and test so the two
    feature sets cannot drift apart. Returns a new DataFrame; ``frame`` itself
    is left untouched, which keeps this cell safe to re-run.
    """
    return frame.drop(drop_columns, axis=1).replace(ND_MARKER, 0).fillna(0)


# Split the DWDS simulator rows out of sheets 2 & 3; the remainder is the test set.
is_dwds = data_test_raw['Location'] == DWDS_LOCATION
dwds = data_test_raw[is_dwds]
data_test = data_test_raw[~is_dwds]

# Concatenate train data and DWDS data.
# NOTE(review): data_train is not used by the preparation below -- X_train and
# y_train are built from data_train_raw, so the DWDS rows never reach training.
# Confirm whether that is intended before switching the source frame (the two
# sheets use different identifier columns).
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = prepare_features(data_train_raw, target_columns)
y_train = data_train_raw['Scheme'].map(SCHEME_LABELS)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = prepare_features(data_test, target_columns)
y_test = data_test['Scheme'].map(SCHEME_LABELS)


In [165]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    """Print a binary confusion matrix together with the metrics derived from it.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix. Flattened row by row it is read as TN, FP, FN, TP.
    y_true, y_pred : array-like
        True and predicted labels; they are only used to compute the
        Matthews correlation coefficient.

    Raises
    ------
    ValueError
        If ``cm`` is not a 2x2 matrix (for example when only one class is
        present and the matrix collapsed to 1x1).
    """
    cm = np.asarray(cm)
    if cm.shape != (2, 2):
        raise ValueError(
            f"Expected a 2x2 confusion matrix for a binary problem, got shape {cm.shape}."
        )

    tn, fp, fn, tp = cm.ravel()
    total = cm.sum()

    # Every ratio falls back to 0 when its denominator is empty.
    accuracy = (tp + tn) / total if total != 0 else 0
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    mcc = matthews_corrcoef(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

    # Flag the degenerate cases in which the printed zeros are placeholders.
    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")
        
def fit_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report its test-set performance.

    The model is fitted in place, predictions for ``X_test`` are compared with
    ``y_test`` and the resulting confusion-matrix report is printed.

    Returns
    -------
    The fitted ``model`` (the same object that was passed in).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Index the matrix by the classes the model was trained on, so it keeps its
    # full shape even when y_test / y_pred happen to contain a single class.
    # Models without a ``classes_`` attribute fall back to the default labels.
    labels = getattr(model, 'classes_', None)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print_confusion_matrix_report(cm, y_test, y_pred)
    return model

In [166]:
# Handle class imbalance: SMOTE addresses the class imbalance, so the k-fold split does not need to account for it
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Handle class imbalance: SMOTE addresses the class imbalance, so the k-fold split does not need to account for it
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [167]:
from sklearn.ensemble import RandomForestClassifier

# Tuned Random Forest
# eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=15))
# Confusion Matrix:
# [[11 37]
#  [ 5 97]]

# True Negatives (TN): 11
# False Positives (FP): 37
# False Negatives (FN): 5
# True Positives (TP): 97

# Accuracy: 0.720
# Precision: 0.724
# Recall: 0.951
# Specificity: 0.229
# F1 Score: 0.822
# Matthews Correlation Coefficient (MCC): 0.272

# Vanilla Random Forest (default parameters)
# A fixed random_state makes the random under-sampling (and the randomized base
# estimator) repeatable, so the reported metrics can be reproduced on re-run.
# NOTE(review): newer imbalanced-learn releases appear to rename `base_estimator`
# to `estimator` -- confirm against the installed version.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=RandomForestClassifier(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[ 6 42]
 [ 5 97]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.687
Precision: 0.698
Recall: 0.951
Specificity: 0.125
F1 Score: 0.805
Matthews Correlation Coefficient (MCC): 0.136


In [168]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree base estimator. A fixed random_state makes the random
# under-sampling and the tree construction repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=DecisionTreeClassifier(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[  1  47]
 [  0 102]]

True Negatives (TN): 1
False Positives (FP): 47
False Negatives (FN): 0
True Positives (TP): 102

Accuracy: 0.687
Precision: 0.685
Recall: 1.000
Specificity: 0.021
F1 Score: 0.813
Matthews Correlation Coefficient (MCC): 0.119




In [169]:
from sklearn.linear_model import LogisticRegression

# Logistic regression base estimator. A fixed random_state makes the random
# under-sampling repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=LogisticRegression(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 46   2]
 [101   1]]

True Negatives (TN): 46
False Positives (FP): 2
False Negatives (FN): 101
True Positives (TP): 1

Accuracy: 0.313
Precision: 0.333
Recall: 0.010
Specificity: 0.958
F1 Score: 0.019
Matthews Correlation Coefficient (MCC): -0.106




In [170]:
from sklearn.svm import SVC

# Support vector classifier base estimator. A fixed random_state makes the
# random under-sampling repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=SVC(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[ 9 39]
 [ 3 99]]

True Negatives (TN): 9
False Positives (FP): 39
False Negatives (FN): 3
True Positives (TP): 99

Accuracy: 0.720
Precision: 0.717
Recall: 0.971
Specificity: 0.188
F1 Score: 0.825
Matthews Correlation Coefficient (MCC): 0.272




In [171]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes base estimator. A fixed random_state makes the random
# under-sampling repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=GaussianNB(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[37 11]
 [11 91]]

True Negatives (TN): 37
False Positives (FP): 11
False Negatives (FN): 11
True Positives (TP): 91

Accuracy: 0.853
Precision: 0.892
Recall: 0.892
Specificity: 0.771
F1 Score: 0.892
Matthews Correlation Coefficient (MCC): 0.663




In [172]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours base estimator. A fixed random_state makes the random
# under-sampling repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=KNeighborsClassifier(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)

Confusion Matrix:
[[41  7]
 [38 64]]

True Negatives (TN): 41
False Positives (FP): 7
False Negatives (FN): 38
True Positives (TP): 64

Accuracy: 0.700
Precision: 0.901
Recall: 0.627
Specificity: 0.854
F1 Score: 0.740
Matthews Correlation Coefficient (MCC): 0.450




In [173]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting base estimator. A fixed random_state makes the random
# under-sampling and the boosting procedure repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=GradientBoostingClassifier(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[ 6 42]
 [ 5 97]]

True Negatives (TN): 6
False Positives (FP): 42
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.687
Precision: 0.698
Recall: 0.951
Specificity: 0.125
F1 Score: 0.805
Matthews Correlation Coefficient (MCC): 0.136


In [174]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees base estimator. A fixed random_state makes the random
# under-sampling and the randomized trees repeatable across runs.
eec = EasyEnsembleClassifier(
    n_estimators=10,
    base_estimator=ExtraTreesClassifier(),
    random_state=42,
)

fit_model(eec, X_train, y_train, X_test, y_test)



Confusion Matrix:
[[25 23]
 [ 6 96]]

True Negatives (TN): 25
False Positives (FP): 23
False Negatives (FN): 6
True Positives (TP): 96

Accuracy: 0.807
Precision: 0.807
Recall: 0.941
Specificity: 0.521
F1 Score: 0.869
Matthews Correlation Coefficient (MCC): 0.532
