In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [4]:
data = pd.read_csv('../data/water_potability_oversampled.csv').dropna()

X = data.drop('Potability', axis=1)
y = data['Potability']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [7]:
rf_classifier = RandomForestClassifier(random_state=42)

rf_classifier.fit(X_resampled, y_resampled)

rf_predictions = rf_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

print(f'Accuracy (Random Forest): {rf_accuracy:.2f}')
print(f'Precision (Random Forest): {rf_precision:.2f}')
print(f'Recall (Random Forest): {rf_recall:.2f}')
print(f'F1-score (Random Forest): {rf_f1:.2f}')

print('\nClassification Report (Random Forest):\n', classification_report(y_test, rf_predictions))

Accuracy (Random Forest): 0.72
Precision (Random Forest): 0.74
Recall (Random Forest): 0.71
F1-score (Random Forest): 0.73

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.70      0.74      0.72       232
           1       0.74      0.71      0.73       248

    accuracy                           0.72       480
   macro avg       0.72      0.72      0.72       480
weighted avg       0.72      0.72      0.72       480

