In [547]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score



In [548]:
data  = pd.read_csv('data/water_potability.csv')

In [549]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [535]:
from sklearn.utils import shuffle
notpotable  = data[data['Potability']==0]
potable = data[data['Potability']==1]  

notpotable = notpotable.fillna(notpotable.mean())
potable  = potable.fillna(potable.mean())

data = pd.concat([notpotable ,potable])

data = shuffle(data)

In [536]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [537]:
!pip install imbalanced-learn



In [542]:
from sklearn.preprocessing import StandardScaler


y = data['Potability']
X = data.drop(columns=['Potability'], axis = 1)




X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=1)




In [543]:
from imblearn.over_sampling import SMOTE,RandomOverSampler,BorderlineSMOTE,ADASYN

sm = SMOTE(random_state=42,k_neighbors=5)
ros = RandomOverSampler(sampling_strategy= 0.9,random_state=40,shrinkage=0.1)
bsm = BorderlineSMOTE(random_state= 20,k_neighbors=2,m_neighbors=10)
ads = ADASYN(random_state= 43)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

y_train_resampled.value_counts()

0    1398
1    1258
Name: Potability, dtype: int64

In [544]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)




In [545]:
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=40,
                            random_state=10,
                            max_features=9,
                            min_samples_leaf = 2,
                            n_jobs=-1)



rf.fit(X_train_scaled, y_train_resampled)

rf_pred_train = rf.predict(X_train_scaled)
rf_pred_test = rf.predict(X_test_scaled)

print(accuracy_score(y_train_resampled, rf_pred_train), accuracy_score(y_test, rf_pred_test))

0.9962349397590361 0.7670396744659207


In [546]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
lr.fit(X_train_scaled, y_train_resampled)


lr_pred_train = lr.predict(X_train_scaled)
lr_pred_test = lr.predict(X_test_scaled)


print(accuracy_score(y_train_resampled, lr_pred_train), accuracy_score(y_test, lr_pred_test))

0.5542168674698795 0.5869786368260427
