In [15]:
import warnings
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, make_scorer
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
from joblib import dump, load

In [16]:
# Load Datasets
data_train_raw = pd.read_csv('/root/projects/water-ml/datasets/sheet_1.csv')
data_test_raw = pd.read_csv('/root/projects/water-ml/datasets/sheet_2_3.csv')
data_train_unlabeled = pd.read_csv('/root/projects/water-ml/datasets/sheet_3.csv')

# Remove DWDS_sim_rows from data
dwds = data_test_raw[data_test_raw['Location'] == 'DWDS Simulator (EPA, 2016)']

# Drop DWDS sim data from sheets 2&3 (test_data)
data_test = data_test_raw[data_test_raw['Location'] != 'DWDS Simulator (EPA, 2016)']

# Concatenate train data and dwds data
data_train = pd.concat([data_train_raw, dwds])

# Prepare train data
target_columns = ['Scheme', 'Sample (reference)']
X_train = data_train_raw.drop(target_columns, axis=1)
y_train = data_train_raw['Scheme'].map({'Stable': 1, 'Failure': 0})
X_train.replace('ND', 0, inplace=True)

# Prepare test data
target_columns = ['Scheme', 'Sample', 'Location']
X_test = data_test.drop(target_columns, axis=1)
y_test = data_test['Scheme'].map({'Stable': 1, 'Failure': 0})
X_test.replace('ND', 0, inplace=True)
X_test.fillna(0, inplace=True)


In [17]:
adasyn_oversampler = ADASYN(sampling_strategy='auto', random_state=42)
X_train_adasyn, y_train_adasyn = adasyn_oversampler.fit_resample(X_train, y_train)

In [7]:
# def print_confusion_matrix(cm, y_true, y_pred):
#     tn, fp, fn, tp = cm.ravel()
#     total = np.sum(cm)
#     accuracy = (tp + tn) / total
#     precision = tp / (tp + fp) if (tp + fp) != 0 else 0
#     recall = tp / (tp + fn) if (tp + fn) != 0 else 0
#     specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
#     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
#     mcc = matthews_corrcoef(y_true, y_pred)
#     print("Confusion Matrix:")
#     print(cm)
#     print()
#     print(f"True Negatives (TN): {tn}")
#     print(f"False Positives (FP): {fp}")
#     print(f"False Negatives (FN): {fn}")
#     print(f"True Positives (TP): {tp}")
#     print()
#     print(f"Accuracy: {accuracy:.3f}")
#     print(f"Precision: {precision:.3f}")
#     print(f"Recall: {recall:.3f}")
#     print(f"Specificity: {specificity:.3f}")
#     print(f"F1 Score: {f1_score:.3f}")
#     print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")
#     if precision == 0 and recall == 0:
#         warnings.warn("Precision and Recall are both zero. F1 Score may not be meaningful.")
#     if tn + fp == 0:
#         warnings.warn("No negative samples. Specificity may not be meaningful.")


# def fit_model(model, X_train, y_train, X_test, y_test):
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     cm = confusion_matrix(y_test, y_pred)
#     print_confusion_matrix(cm, y_test, y_pred)
#     return model

In [10]:
# def grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test):
#     # Frozen parameters
#     frozen_params = {
#         'bootstrap': [True],
#         'criterion': ['gini'],
#         'max_depth': [3],
#         'max_features': ['sqrt'],
#         'min_samples_leaf': [1],
#         'min_samples_split': [2],
#         'n_estimators': [50]
#     }

#     # Additional parameters to tune
#     param_grid = {
#         'class_weight': [None, 'balanced', 'balanced_subsample'],
#         'ccp_alpha': [0.0, 0.1, 0.2],
#         'max_samples': [None, 0.5, 0.7, 0.9],
#         'oob_score': [True, False],
#         'warm_start': [True, False],
#         'n_jobs': [-1]
#     }

#     # Combine frozen parameters and additional parameters
#     param_grid.update(frozen_params)

#     # Define the ExtraTreesClassifier model
#     model = ExtraTreesClassifier()

#     # Define the scoring functions
#     scoring = {
#         'accuracy': make_scorer(accuracy_score),
#         'mcc': make_scorer(matthews_corrcoef)
#     }

#     # Perform grid search for ADASYN
#     print("Grid search for ExtraTreesClassifier with ADASYN:")
#     grid_search_adasyn = GridSearchCV(
#         estimator=model,
#         param_grid=param_grid,
#         scoring=scoring,
#         refit='accuracy',  # Specify the metric to use for refitting the best model
#         cv=10,
#         n_jobs=-1,
#         verbose=1
#     )
#     grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)

#     # Get the best model based on accuracy
#     best_model_accuracy = grid_search_adasyn.best_estimator_
#     best_params_accuracy = grid_search_adasyn.best_params_

#     # Get the best model based on MCC score
#     best_index_mcc = np.argmax(grid_search_adasyn.cv_results_['mean_test_mcc'])
#     best_params_mcc = grid_search_adasyn.cv_results_['params'][best_index_mcc]
#     best_model_mcc = ExtraTreesClassifier(**best_params_mcc)

#     # Evaluate the best model based on accuracy
#     print("\nBest Model based on Accuracy:")
#     print(f"Best parameters: {best_params_accuracy}")
#     fit_model(best_model_accuracy, X_train_adasyn, y_train_adasyn, X_test, y_test)

#     # Evaluate the best model based on MCC score
#     print("\nBest Model based on MCC Score:")
#     print(f"Best parameters: {best_params_mcc}")
#     fit_model(best_model_mcc, X_train_adasyn, y_train_adasyn, X_test, y_test)

# # Example usage
# grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test)

Grid search for ExtraTreesClassifier with ADASYN:
Fitting 10 folds for each of 1 candidates, totalling 10 fits



Best Model based on Accuracy:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[18 30]
 [ 6 96]]

True Negatives (TN): 18
False Positives (FP): 30
False Negatives (FN): 6
True Positives (TP): 96

Accuracy: 0.760
Precision: 0.762
Recall: 0.941
Specificity: 0.375
F1 Score: 0.842
Matthews Correlation Coefficient (MCC): 0.402

Best Model based on MCC Score:
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Confusion Matrix:
[[34 14]
 [ 7 95]]

True Negatives (TN): 34
False Positives (FP): 14
False Negatives (FN): 7
True Positives (TP): 95

Accuracy: 0.860
Precision: 0.872
Recall: 0.931
Specificity: 0.708
F1 Score: 0.900
Matthews Correlation Coefficient (MCC): 0.670


In [21]:
def grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test):
    # Define the parameter grid
    param_grid = {
        'class_weight': [None, 'balanced', 'balanced_subsample'],
        'ccp_alpha': [0.0, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.5],
        'max_samples': [None, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'oob_score': [False, True],
        'warm_start': [False, True],
        'n_estimators': [10, 50, 100, 150, 200, 250, 300, 350, 400],
        'max_depth': [None, 3, 5, 7, 10, 12, 15, 20],
        'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16],
        'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8],
        'max_features': ['sqrt', 'log2', 0.2, 0.4, 0.6, 0.8, None]
    }

    #  Define the scoring functions
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Perform grid search for ADASYN
    print("Grid search for ExtraTreesClassifier with ADASYN:")
    grid_search_adasyn = GridSearchCV(
        estimator=ExtraTreesClassifier(),
        param_grid=param_grid,
        scoring=scoring,
        refit='accuracy',  # Specify the metric to use for refitting the best model
        cv=10,
        n_jobs=-1,
        verbose=1
    )

    # Perform the grid search
    print("Grid search for ExtraTreesClassifier with ADASYN:")
    grid_search_adasyn.fit(X_train_adasyn, y_train_adasyn)

    # Retrieve the best model based on accuracy
    best_model_accuracy = grid_search_adasyn.best_estimator_
    best_params_accuracy = grid_search_adasyn.best_params_

    # Evaluate and print the best model's performance
    print("\nBest Model based on Accuracy:")
    print(f"Best parameters: {best_params_accuracy}")
    fit_model(best_model_accuracy, X_train_adasyn, y_train_adasyn, X_test, y_test)

def fit_model(model, X_train, y_train, X_test, y_test):
    # Function to fit the model and evaluate it on the test set
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = np.mean(predictions == y_test)  # Calculate accuracy
    print(f"Model Accuracy: {accuracy}")

# Example usage (ensure you have your dataset variables defined before calling this function)
grid_search_augmented(X_train_adasyn, y_train_adasyn, X_test, y_test)


Grid search for ExtraTreesClassifier with ADASYN:
Grid search for ExtraTreesClassifier with ADASYN:


KeyboardInterrupt: 