In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno

In [None]:
# Load the water-potability dataset from a CSV in the working directory
# and preview its first rows.
df = pd.read_csv('water_potability.csv')
df.head()

In [None]:
# Descriptive statistics for the loaded frame.
df.describe()

In [None]:
# Column dtypes and non-null counts (a first look at where values are missing).
df.info()

In [None]:
#Dependent Variable Analysis

In [None]:
# Count the samples in each class. sort_index() pins the row order to 0, 1 so
# the display label is attached by class value, not by which class happens to
# be the more frequent one.
potability_counts = df["Potability"].value_counts().sort_index()

# Build the plotting frame with explicit column names. Wrapping value_counts()
# in pd.DataFrame yields a "Potability" column on pandas < 2.0 but a "count"
# column on pandas >= 2.0, so the column name must not be left implicit.
d = pd.DataFrame({
    "label": potability_counts.index.map({0: "Not Potable", 1: "Potable"}),
    "Potability": potability_counts.to_numpy(),
})

# Donut chart of the class balance.
fig = px.pie(d, values="Potability", names="label", hole=0.4, opacity=0.8,
             labels={"label": "Potability", "Potability": "Number of Samples"})
fig.update_layout(title=dict(text="Pie Chart of Potability Feature"))
fig.update_traces(textposition="outside", textinfo="percent+label")
fig.show()

In [None]:
# Correlation Between Features

In [None]:
# Pairwise correlation matrix of the columns of df.
df.corr()

In [None]:
# Clustered, annotated heatmap of the correlation matrix.
sns.clustermap(
    df.corr(),
    cmap="vlag",
    dendrogram_ratio=(0.1, 0.2),
    annot=True,
    linewidth=0.8,
    figsize=(9, 10),
)
plt.show()

In [None]:
# Distribution of Features

In [None]:
# Split the samples by class so each class gets its own density curve.
potable = df.query("Potability == 1")
non_potable = df.query("Potability == 0")

# One KDE panel per feature (the first nine columns), laid out on a 3x3 grid.
plt.figure(figsize=(15, 15))
for position, feature in enumerate(df.columns[:9], start=1):
    plt.subplot(3, 3, position)
    plt.title(feature)
    # Draw the non-potable curve first, then the potable one.
    for subset, legend_label in ((non_potable, "Non Potable"), (potable, "Potable")):
        sns.kdeplot(x=subset[feature], label=legend_label)
    plt.legend()
plt.tight_layout()

In [None]:
# Preprocessing: Missing Values

In [None]:
# missingno matrix view of df, used here to see which entries are missing.
msno.matrix(df)
plt.show()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Handle missing values by replacing them with the mean of their own column.
#
# The filled column is assigned back to df instead of calling
# df[col].fillna(..., inplace=True): that form is chained assignment through
# an inplace method, which recent pandas 2.x releases warn about and which no
# longer updates df once Copy-on-Write is enabled.
#
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the imputed values.
for column in ("ph", "Sulfate", "Trihalomethanes"):
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

In [None]:
# Preprocessing: Train-Test Split and Normalization

In [None]:
from sklearn import tree
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

In [None]:
# Label vector: the target column. Feature matrix: every other column.
y = df["Potability"].to_numpy()
X = df.drop(columns="Potability").to_numpy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

# Report the shape of each partition.
for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split_array.shape)

In [None]:
# Min-max normalization, computed per feature (column).
#
# The statistics are taken along axis 0 so every feature is mapped to [0, 1]
# on its own scale. np.max / np.min without an axis return one global
# min and max for the whole matrix, so every feature would be rescaled
# by the same overall range instead of by its own.
# Only the training rows define the min/max; the test rows are transformed
# with those same training statistics.
x_train_min = np.min(X_train, axis=0)
x_train_max = np.max(X_train, axis=0)

# A feature that is constant in the training set has zero range; divide by 1
# instead so it maps to 0 rather than producing NaN/inf.
feature_range = x_train_max - x_train_min
feature_range = np.where(feature_range == 0, 1, feature_range)

X_train = (X_train - x_train_min) / feature_range
X_test = (X_test - x_train_min) / feature_range

In [None]:
#Modelling Decision Tree and Random Forest Classifiers

In [None]:
# (name, estimator) pairs to fit and compare.
# Both estimators are seeded with the same value used for the train/test
# split, so a fresh "Restart & Run All" reproduces the same fitted models and
# scores instead of drifting from run to run.
models = [
    ("DTC", DecisionTreeClassifier(max_depth=3, random_state=3)),
    ("RF", RandomForestClassifier(random_state=3)),
]

In [None]:
# Fit every model on the training split, predict the test split, and keep
# the precision and the confusion matrix under the model's name.
finalResults, cmList = [], []
for label, estimator in models:
    predictions = estimator.fit(X_train, y_train).predict(X_test)
    finalResults.append((label, precision_score(y_test, predictions)))
    cmList.append((label, confusion_matrix(y_test, predictions)))
finalResults

In [None]:
# One heatmap per model's confusion matrix.
for name, matrix in cmList:
    plt.figure()
    # The cells are integer sample counts, so format them as integers
    # (".1f" would print e.g. "512.0").
    sns.heatmap(matrix, annot=True, linewidths=0.8, fmt="d")
    # confusion_matrix puts the true classes on rows and the predicted
    # classes on columns; label the axes so the plot can be read on its own.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(name)
    plt.show()

In [None]:
#Visualize Decision Tree

In [None]:
# Look the decision-tree estimator up by its name among the (name, estimator) pairs.
dt_clf = dict(models)["DTC"]
dt_clf

In [None]:
# Feature names for the plot: every column of df except the last one.
feature_labels = list(df.columns[:-1])

# Draw the decision tree with colour-filled nodes.
plt.figure(figsize=(25, 20))
tree.plot_tree(
    dt_clf,
    feature_names=feature_labels,
    class_names=["0", "1"],
    filled=True,
    precision=5,
)
plt.show()

In [None]:
#Random Forest, XGBoost, GBM Hyperparameter Tuning

In [None]:
# Search spaces for the randomized hyper-parameter search.
# Each entry maps a display name to the estimator to tune ("model") and the
# candidate values for its hyper-parameters ("params").
model_params = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=3),
        "params": {
            "n_estimators": [10, 50, 100],
            # "auto" was an alias of "sqrt" for classifiers and is rejected by
            # current scikit-learn releases; None (use all features) takes its
            # slot so the search still has three distinct options.
            "max_features": ["sqrt", "log2", None],
            "max_depth": list(range(1, 21, 3)),
        },
    },
    "XGBoost": {
        "model": XGBClassifier(random_state=3),
        "params": {
            "colsample_bytree": [0.6, 0.8, 1],
            "learning_rate": [0.1, 0.2, 0.01],
            "max_depth": list(range(1, 21, 3)),
            "n_estimators": [100, 200, 500],
        },
    },
    "GBM": {
        # Potability is a class label, so tune the gradient-boosting
        # *classifier*. A regressor would be scored with R^2, which is not
        # comparable with the scores of the two classifiers above.
        "model": GradientBoostingClassifier(random_state=3),
        "params": {
            "learning_rate": [0.001, 0.01, 0.1, 0.2],
            "max_depth": list(range(1, 21, 3)),
            "n_estimators": [200, 500, 1000, 2000],
            "subsample": [1.0, 0.5, 0.75],
        },
    },
}
model_params

In [None]:
# Randomized hyper-parameter search for every entry of model_params.
#
# Both the repeated stratified folds and the sampling of candidate settings
# are seeded (same value as the train/test split), so the fold assignment and
# the ten sampled candidates are identical on every run. Randomness inside the
# estimators themselves is controlled by the estimators in model_params.
#
# NOTE(review): the search is fitted on the full, unscaled X / y rather than on
# the training split, so the reported scores are cross-validation scores over
# all rows, including those held out as the test set — confirm this is intended.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=3)
scores = []
for model_name, params in model_params.items():
    rs = RandomizedSearchCV(
        params["model"],
        params["params"],
        cv=cv,
        n_iter=10,
        random_state=3,
    )
    rs.fit(X, y)
    # Keep the name, the best setting found, and its mean cross-validated score.
    scores.append([model_name, dict(rs.best_params_), rs.best_score_])
scores