In [1]:
import numpy as np
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Load the Data

In [2]:
# Directory holding the pre-imputed train/test splits. Paths are assembled with
# os.path.join so the notebook also runs on systems where "\" is not a path
# separator (the hard-coded "data\..." form only resolves on Windows).
DATA_DIR = "data"

# the mean imputed data
df_train_mean = pd.read_csv(os.path.join(DATA_DIR, "training_mean_imputed.csv"))
df_test_mean = pd.read_csv(os.path.join(DATA_DIR, "test_mean_imputed.csv"))

# the iterative imputed data
df_train_iter = pd.read_csv(os.path.join(DATA_DIR, "training_iter_imputed.csv"))
df_test_iter = pd.read_csv(os.path.join(DATA_DIR, "test_iter_imputed.csv"))

In [3]:
# Separate each frame into its feature matrix (every column except the label)
# and its label vector (the "Potability" column).
target_col = "Potability"

# mean-imputed splits
X_train_mean, y_train_mean = df_train_mean.drop(columns=target_col), df_train_mean[target_col]
X_test_mean, y_test_mean = df_test_mean.drop(columns=target_col), df_test_mean[target_col]

# iteratively imputed splits
X_train_iter, y_train_iter = df_train_iter.drop(columns=target_col), df_train_iter[target_col]
X_test_iter, y_test_iter = df_test_iter.drop(columns=target_col), df_test_iter[target_col]

## Train Baseline Model

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [13]:
# Baseline: logistic regression with default settings, trained and evaluated on
# the mean-imputed data. Later models are compared against these scores.
from sklearn.linear_model import LogisticRegression

log_model_mean = LogisticRegression()
log_model_mean.fit(X_train_mean, y_train_mean)

preds_mean = log_model_mean.predict(X_test_mean)

# NOTE: in the recorded run this baseline scored F1 = 0.0, i.e. it did not get a
# single positive sample right, so accuracy on its own is misleading here.
# zero_division=0 states explicitly that an undefined precision / F1 counts as
# 0.0, which keeps the reported value but avoids the UndefinedMetricWarning.
print(f"Test Acc: {accuracy_score(y_test_mean, preds_mean)}")
print(f"Test F1-Score: {f1_score(y_test_mean, preds_mean, zero_division=0)}")
print(f"Test Precision: {precision_score(y_test_mean, preds_mean, zero_division=0)}")
print(f"Test Recall: {recall_score(y_test_mean, preds_mean)}")

Test Acc: 0.6265243902439024
Test F1-Score: 0.0
Test Precision: 0.0
Test Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Baseline: the same default logistic regression, this time trained and
# evaluated on the iteratively imputed data, for comparison with the
# mean-imputed baseline.
from sklearn.linear_model import LogisticRegression

log_model_iter = LogisticRegression()
log_model_iter.fit(X_train_iter, y_train_iter)

preds_iter = log_model_iter.predict(X_test_iter)

# NOTE: in the recorded run this baseline scored F1 = 0.0, i.e. it did not get a
# single positive sample right, so accuracy on its own is misleading here.
# zero_division=0 states explicitly that an undefined precision / F1 counts as
# 0.0, which keeps the reported value but avoids the UndefinedMetricWarning.
print(f"Test Acc: {accuracy_score(y_test_iter, preds_iter)}")
print(f"Test F1-Score: {f1_score(y_test_iter, preds_iter, zero_division=0)}")
print(f"Test Precision: {precision_score(y_test_iter, preds_iter, zero_division=0)}")
print(f"Test Recall: {recall_score(y_test_iter, preds_iter)}")

Test Acc: 0.6265243902439024
Test F1-Score: 0.0
Test Precision: 0.0
Test Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## Train Other Models

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [19]:
# Seed shared by every estimator that draws random numbers, so that re-running
# the notebook reproduces the same scores.
SEED = 42

# Candidate classifiers, keyed by the display name used in the training log and
# in the results table.
dict_models = {
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "SVM": SVC(),
    "Nearest Neighbor": KNeighborsClassifier(),
    # BaggingClassifier bags decision trees when no estimator is given, so the
    # explicit `base_estimator=DecisionTreeClassifier()` argument is dropped:
    # that keyword is deprecated/removed in newer scikit-learn releases, while
    # relying on the default works on old and new versions alike.
    "Bagging": BaggingClassifier(n_estimators=10, random_state=SEED),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Ada Boost": AdaBoostClassifier(random_state=SEED),
}

In [37]:
# Fit every candidate on the mean-imputed training split and score it on the
# matching test split. results_dict maps model name -> (f1, precision, recall).
results_dict = {}
for model_name in dict_models:
    model = dict_models[model_name]
    print(f"Start training {model_name}...")

    # fit() hands back the fitted estimator, so predict() can be chained on it
    preds = model.fit(X_train_mean, y_train_mean).predict(X_test_mean)
    f1, precision, recall = (
        metric(y_test_mean, preds)
        for metric in (f1_score, precision_score, recall_score)
    )

    print(f"Test F1-Score: {f1}")
    print(f"Test Precision: {precision}")
    print(f"Test Recall: {recall}")
    print("\n\n")

    results_dict[model_name] = (f1, precision, recall)

Start training Decision Tree...
Test F1-Score: 0.46280991735537186
Test Precision: 0.4686192468619247
Test Recall: 0.45714285714285713



Start training SVM...
Test F1-Score: 0.4482758620689655
Test Precision: 0.7572815533980582
Test Recall: 0.3183673469387755



Start training Nearest Neighbor...
Test F1-Score: 0.474040632054176
Test Precision: 0.5303030303030303
Test Recall: 0.42857142857142855



Start training Bagging...
Test F1-Score: 0.4607843137254902
Test Precision: 0.5766871165644172
Test Recall: 0.3836734693877551



Start training Random Forest...
Test F1-Score: 0.4684210526315789
Test Precision: 0.6592592592592592
Test Recall: 0.363265306122449



Start training Ada Boost...
Test F1-Score: 0.28488372093023256
Test Precision: 0.494949494949495
Test Recall: 0.2





In [38]:
# Tabulate the per-model scores (one row per model) and rank by F1, best first.
score_columns = ["F1-Score", "Precision", "Recall"]
df_results = pd.DataFrame.from_dict(results_dict, orient="index", columns=score_columns)
df_results = df_results.sort_values(by="F1-Score", ascending=False)
df_results

Unnamed: 0,F1-Score,Precision,Recall
Nearest Neighbor,0.474041,0.530303,0.428571
Random Forest,0.468421,0.659259,0.363265
Decision Tree,0.46281,0.468619,0.457143
Bagging,0.460784,0.576687,0.383673
SVM,0.448276,0.757282,0.318367
Ada Boost,0.284884,0.494949,0.2


For ease of interpretation, the following models are examined further using hyper-parameter optimization:
1. Decision Tree
2. Random Forest