In [1598]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score



In [1632]:
data  = pd.read_csv('data/water_potability.csv')
corr = data.corr()
corr["Potability"].sort_values(ascending=False)

Potability         1.000000
Solids             0.033743
Chloramines        0.023779
Trihalomethanes    0.007130
Turbidity          0.001581
ph                -0.003556
Conductivity      -0.008128
Hardness          -0.013837
Sulfate           -0.023577
Organic_carbon    -0.030001
Name: Potability, dtype: float64

In [1633]:
data.Potability.value_counts()



0    1998
1    1278
Name: Potability, dtype: int64

In [1634]:
from sklearn.utils import shuffle
notpotable  = data[data['Potability']==0]
potable = data[data['Potability']==1]  

notpotable = notpotable.fillna(notpotable.mean())
potable  = potable.fillna(potable.mean())

data = pd.concat([notpotable ,potable])

data = shuffle(data)

In [1635]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [1636]:
!pip install imbalanced-learn



In [1637]:
from sklearn.preprocessing import StandardScaler


y = data['Potability']
X = data.drop(columns=['Potability'], axis = 1)




X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y, random_state=1)

X

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
979,8.163076,183.161853,20335.676510,6.028559,333.084455,475.296471,18.248098,49.604842,3.287244
3045,5.622807,194.463239,16119.340190,6.907987,336.648207,408.459081,16.364724,76.631186,3.992957
2413,7.753936,220.098279,17169.584382,7.984039,344.121899,385.045003,16.848512,47.003322,4.833027
769,5.729303,162.857585,34573.678786,5.654856,415.287072,297.631365,13.990842,76.744677,3.556395
2096,11.568768,192.924448,39234.470184,8.860762,235.995461,463.435102,19.628177,81.496529,3.761288
...,...,...,...,...,...,...,...,...,...
410,7.085378,185.755728,27345.174288,8.932764,334.564290,313.878771,13.420126,56.974850,4.407566
1027,7.085378,235.417457,13927.002507,8.551447,334.564290,446.422357,14.134954,51.705215,3.662648
2972,7.948488,197.594542,23602.187832,7.098458,334.564290,394.767989,14.876694,78.974227,3.891124
3087,4.032061,200.067973,14720.544829,6.482679,334.564290,519.213370,16.600777,77.293940,4.814235


In [1638]:
from imblearn.over_sampling import SMOTE,RandomOverSampler,BorderlineSMOTE,ADASYN

sm = SMOTE(random_state=42)
ros = RandomOverSampler(sampling_strategy= 1,random_state=40,shrinkage=0.1)
bsm = BorderlineSMOTE(random_state= 20,k_neighbors=2,m_neighbors=10)
ads = ADASYN(random_state= 43)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

y_train_resampled.value_counts()

0    1398
1    1398
Name: Potability, dtype: int64

In [1639]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)



In [1643]:
rf = RandomForestClassifier(n_estimators=500,
                            max_depth=300,
                            random_state=10,
                            max_features=4,
                            min_samples_leaf=2,
                            n_jobs=-1)



rf.fit(X_train_scaled, y_train_resampled)

rf_pred_train = rf.predict(X_train_scaled)
rf_pred_test = rf.predict(X_test_scaled)

print(accuracy_score(y_train_resampled, rf_pred_train), accuracy_score(y_test, rf_pred_test))

1.0 0.7721261444557477


In [1641]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
lr.fit(X_train_scaled, y_train_resampled)


lr_pred_train = lr.predict(X_train_scaled)
lr_pred_test = lr.predict(X_test_scaled)


print(accuracy_score(y_train_resampled, lr_pred_train), accuracy_score(y_test, lr_pred_test))

0.540414878397711 0.5147507629704985


In [1642]:



X_s = scaler.transform(X)

rf_pred_X = rf.predict(X_s)
print(accuracy_score(y, rf_pred_X))

0.9316239316239316
