In [13]:
# %conda install imbalanced-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from imblearn.over_sampling import SMOTE

In [14]:
# Read the first data sheet (CSV) into a DataFrame; later cells derive the
# feature matrix and the 'Scheme' label from it.
training_csv = '/home/A02398138/Downloads/Data Sheet 1.csv'
data = pd.read_csv(training_csv)

In [15]:
# Columns that must not be used as model inputs: 'Scheme' is the label and
# 'Sample (reference)' is removed together with it.
target_columns = ['Scheme', 'Sample (reference)']

# Binary label: Stable -> 0, Failure -> 1. Any other 'Scheme' value maps to NaN.
y = data['Scheme'].map({'Stable': 0, 'Failure': 1})

# Build the feature matrix as one non-mutating chain so re-running the cell is
# idempotent:
#   * 'ND' placeholders become 0;
#   * missing values are zero-filled, matching the preprocessing applied to
#     the second data sheet before prediction;
#   * every column is converted to a numeric dtype. A column that contained
#     'ND' would otherwise stay object-typed, and any other non-numeric
#     entry now raises here instead of surfacing later inside the model.
X = (
    data.drop(columns=target_columns)
    .replace('ND', 0)
    .fillna(0)
    .apply(pd.to_numeric)
)

X

Unnamed: 0,Taxa A1,Taxa A2,Taxa A3,Taxa A4,Taxa A5,Taxa B1,Taxa B2,Taxa B3
0,0.0013,0.0,0.002,0.0012,0.0098,0.0465,0.1462,0.2775
1,0.0,0.0,0.0003,0.0002,0.0015,0.0195,0.0227,0.0443
2,0.0022,0.0007,0.0028,0.0052,0.0095,0.0465,0.1575,0.2267
3,0.0,0.0,0.0,0.0003,0.0017,0.0155,0.0337,0.271
4,0.0007,0.001,0.0007,0.0028,0.0053,0.0308,0.0163,0.1657
5,0.004,0.0033,0.001,0.0117,0.0025,0.0405,0.1125,0.405
6,0.0,0.0,0.0005,0.0023,0.0047,0.0285,0.0153,0.1327
7,0.0033,0.0153,0.001,0.0507,0.0057,0.0253,0.067,0.4583
8,0.0048,0.0095,0.0018,0.043,0.0055,0.0247,0.066,0.4753
9,0.0032,0.0087,0.0007,0.0523,0.0045,0.0413,0.0357,0.5587


In [16]:
# Reserve 30% of the rows as a test split; the fixed seed makes the partition
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [17]:
# Handle class imbalance by oversampling the training split only; the test
# split is left untouched. SMOTE generates its synthetic samples at random, so
# a fixed seed (the same one used for the train/test split) keeps the
# resampled set, and every metric computed downstream, reproducible.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [18]:
# Gaussian Naive Bayes with default settings, fitted on the SMOTE-resampled
# training split produced by the previous cell.
nb_classifier = GaussianNB()
nb_classifier.fit(X=X_train_smote, y=y_train_smote)

In [19]:
# Hard 0/1 class predictions for the held-out test split.
y_pred = nb_classifier.predict(X=X_test)

In [20]:
# Evaluate the model on the held-out test split.
# roc_curve and precision_recall_curve sweep a decision threshold over
# continuous scores, so they are given the predicted probability of class 1
# ('Failure') rather than the hard 0/1 labels in y_pred. Hard labels collapse
# each curve to a single operating point and make the AUC values misleading.
# Column 1 of predict_proba corresponds to label 1 because the labels are
# encoded as 0/1.
y_score = nb_classifier.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_score)
pr_auc = auc(recall, precision)

# Report the area under both curves.
print(f"ROC AUC: {roc_auc}")
print(f"Precision-Recall AUC: {pr_auc}")

ROC AUC: 1.0
Precision-Recall AUC: 1.0


In [28]:
# Load the second data sheet; the classifier fitted above is applied to it in
# the following cells.
new_data = pd.read_csv('/home/A02398138/Downloads/DataSheet2&3.csv')

# Non-feature columns of this sheet (label plus two columns that are dropped
# with it). Note that this rebinds target_columns from the earlier cell.
target_columns = ['Scheme', 'Sample', 'Location']

# Ground-truth label: Stable -> 0, Failure -> 1. Any other value maps to NaN.
y_actual = new_data['Scheme'].map({'Stable': 0, 'Failure': 1})

# NOTE: X is rebound here. From this cell on it holds the features of
# new_data, not those of the first data sheet.
# The chain is non-mutating so the cell can be re-run safely: 'ND' placeholders
# and missing values become 0, then every column is converted to a numeric
# dtype so the model receives numbers rather than object columns that mix
# strings and ints.
X = (
    new_data.drop(columns=target_columns)
    .replace('ND', 0)
    .fillna(0)
    .apply(pd.to_numeric)
)

X

Unnamed: 0,Taxa A1,Taxa A2,Taxa A3,Taxa A4,Taxa A5,Taxa B1,Taxa B2,Taxa B3
0,0,0,0,0,0,0,0,0
1,0.0121,0,0,0,0.0001,0,0,0.002
2,0.0003,0,0,0,0,0,0,0
3,0.0284,0,0,0,0,0,0,0
4,0.0002,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
156,0.0037,0.0163,0,0.0045,0.001,0.0035,0.0043,0.0023
157,0.0007,0.0167,0,0.0012,0.0006,0.0004,0.002,0.0015
158,0,0.0082,0,0.0049,0.0055,0.0261,0.0102,0.0006
159,0,0.0015,0,0.0006,0,0,0.0002,0.0005


In [29]:
# Classify every row of the feature matrix currently bound to X (the second
# data sheet's features, as rebuilt in the previous cell).
predictions = nb_classifier.predict(X=X)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the hard predictions against the known labels of the second data sheet.
holdout_accuracy = accuracy_score(y_actual, predictions)
holdout_confusion = confusion_matrix(y_actual, predictions)
holdout_report = classification_report(y_actual, predictions)

print("Accuracy:", holdout_accuracy)
print("Confusion Matrix:\n", holdout_confusion)
print("Classification Report:\n", holdout_report)

Accuracy: 0.8633540372670807
Confusion Matrix:
 [[102  11]
 [ 11  37]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       113
           1       0.77      0.77      0.77        48

    accuracy                           0.86       161
   macro avg       0.84      0.84      0.84       161
weighted avg       0.86      0.86      0.86       161

