In [7]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import seaborn as sns
from tqdm import tqdm
import pickle
from copy import deepcopy
from sklearn.metrics import confusion_matrix

# IMB Learn SMOTE implementation
from imblearn.over_sampling import SMOTE

# Import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

In [8]:
# Load Datasets
data_train_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_1.csv')
data_test_raw = pd.read_csv('~/Projects/water-ml/datasets/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv('~/Projects/water-ml/datasets/sheet_3.csv')

# Remove DWDS_sim_rows from data
dwds = data_test_raw[data_test_raw['Location'] == 'DWDS Simulator (EPA, 2016)']

# Drop DWDS sim data from sheets 2&3 (test_data)
data_test = data_test_raw[data_test_raw['Location'] != 'DWDS Simulator (EPA, 2016)']

# Concatenate train data and dwds data
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = data_train_raw.drop(target_columns, axis=1)
y_train = data_train_raw['Scheme'].map({'Stable': 1, 'Failure': 0})
X_train.replace('ND', 0, inplace=True)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = data_test.drop(target_columns, axis=1)
y_test = data_test['Scheme'].map({'Stable': 1, 'Failure': 0})
X_test.replace('ND', 0, inplace=True)
X_test.fillna(0, inplace=True)


In [9]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
import warnings

def print_confusion_matrix_report(cm, y_true, y_pred):
    tn, fp, fn, tp = cm.ravel()
    total = np.sum(cm)

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    mcc = matthews_corrcoef(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)
    print()
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")
    print(f"True Positives (TP): {tp}")
    print()
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print(f"F1 Score: {f1_score:.3f}")
    print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

    if precision == 0 and recall == 0:
        warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
    if tn + fp == 0:
        warnings.warn("No negative samples. Specificity may not be meaningful.")

In [10]:
# Handle class imbalance, smote addresses class imbalance, no need to address the kfold split
# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Handle class imbalance, smote address class imbalance, no need to address the 
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [11]:
from sklearn.ensemble import RandomForestClassifier

eec = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier())
eec_smote = EasyEnsembleClassifier(n_estimators=10, base_estimator=RandomForestClassifier())
eec.fit(X_train, y_train)
eec_smote.fit(X_train_smote, y_train_smote)

y_pred = eec.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

y_pred_smote = eec_smote.predict(X_test)
cm_smote = confusion_matrix(y_test, y_pred_smote)



In [12]:
# With SMOTE
print_confusion_matrix_report(cm_smote, y_test, y_pred)

Confusion Matrix:
[[ 5 43]
 [ 3 99]]

True Negatives (TN): 5
False Positives (FP): 43
False Negatives (FN): 3
True Positives (TP): 99

Accuracy: 0.693
Precision: 0.697
Recall: 0.971
Specificity: 0.104
F1 Score: 0.811
Matthews Correlation Coefficient (MCC): 0.166


In [13]:
# Without SMOTE
print_confusion_matrix_report(cm, y_test, y_pred)

Confusion Matrix:
[[ 7 41]
 [ 5 97]]

True Negatives (TN): 7
False Positives (FP): 41
False Negatives (FN): 5
True Positives (TP): 97

Accuracy: 0.693
Precision: 0.703
Recall: 0.951
Specificity: 0.146
F1 Score: 0.808
Matthews Correlation Coefficient (MCC): 0.166
