In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from imblearn.over_sampling import SMOTE

In [31]:
# Load your data
data = pd.read_csv('~/Projects/water-ml/datasets/sheet_1.csv')

In [42]:
target_columns = ['Scheme', 'Sample (reference)']

X = data.drop(target_columns, axis=1)
y = data['Scheme'].map({'Stable': 0, 'Failure': 1})

X.replace('ND', 0, inplace=True)

In [43]:
# Create a KFold object
kf = KFold(n_splits=10, random_state=42, shuffle=True)

# Use the object to split your data
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [44]:
# Handle class imbalance
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [45]:
# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier
nb_classifier.fit(X_train_smote, y_train_smote)

In [46]:
# Predict on test data
y_pred = nb_classifier.predict(X_test)

In [47]:
# Evaluate the model
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred)
pr_auc = auc(recall, precision)

# You can now print out the AUC for ROC and precision-recall curves
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")

ROC AUC: 1.0
Precision-Recall AUC: 1.0


In [51]:
new_data = pd.read_csv('~/Projects/water-ml/datasets/sheet_2_3.csv')

target_columns = ['Scheme', 'Sample', 'Location']

X = new_data.drop(target_columns, axis=1)
y_actual = new_data['Scheme'].map({'Stable': 0, 'Failure': 1})

X.replace('ND', 0, inplace=True)
X.fillna(0, inplace=True)

In [49]:
predictions = nb_classifier.predict(X)

In [50]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print("Accuracy:", accuracy_score(y_actual, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_actual, predictions))
print("Classification Report:\n", classification_report(y_actual, predictions))

Accuracy: 0.8571428571428571
Confusion Matrix:
 [[100  13]
 [ 10  38]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.90       113
           1       0.75      0.79      0.77        48

    accuracy                           0.86       161
   macro avg       0.83      0.84      0.83       161
weighted avg       0.86      0.86      0.86       161

