In [4838]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score



In [4839]:
data  = pd.read_csv('data/water_potability.csv')

corr = data.corr()




In [4840]:
data.Potability.value_counts()



0    1998
1    1278
Name: Potability, dtype: int64

In [4841]:
from sklearn.utils import shuffle
notpotable  = data[data['Potability']==0]
potable = data[data['Potability']==1]  

notpotable = notpotable.fillna(notpotable.mean())
potable  = potable.fillna(potable.mean())

data = pd.concat([notpotable ,potable])

data = shuffle(data)

corr["Potability"].sort_values(ascending=False)

Potability         1.000000
Solids             0.033743
Chloramines        0.023779
Trihalomethanes    0.007130
Turbidity          0.001581
ph                -0.003556
Conductivity      -0.008128
Hardness          -0.013837
Sulfate           -0.023577
Organic_carbon    -0.030001
Name: Potability, dtype: float64

In [4842]:
data.Potability.value_counts()

0    1998
1    1278
Name: Potability, dtype: int64

In [4843]:
from sklearn.preprocessing import StandardScaler


y = data['Potability']
X = data.drop(columns=['Potability','Turbidity'], axis = 1) ### Turbidity 낮은 상관계수 


## test_size 확인 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,stratify=y, random_state=1)

X

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes
2146,5.689554,202.364723,17493.650424,5.520356,385.197355,479.237852,21.692974,66.303555
2196,8.616824,228.762945,19126.298537,7.017578,327.894821,383.527023,14.759257,41.275927
2039,5.226895,127.117555,7907.330441,6.761929,335.461842,475.332357,12.725010,79.420013
1062,8.815358,208.331084,26780.691787,7.223962,329.119685,430.406894,7.151751,88.413296
1990,7.773758,251.462844,21688.616943,6.194910,395.088245,355.831683,14.324552,67.584311
...,...,...,...,...,...,...,...,...
216,7.086168,189.145378,4304.492483,3.615891,259.373322,400.904350,12.993012,63.902288
1017,8.551078,216.415932,28909.290284,6.936782,295.223707,510.895001,16.202371,79.859323
1018,6.013161,218.843256,21573.747571,9.295852,321.168313,444.276635,14.744347,62.443239
263,13.175402,47.432000,19237.949676,8.907020,375.147315,500.245952,12.083896,66.539684


In [4844]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [4845]:
from sklearn.metrics import accuracy_score, roc_auc_score

rf = RandomForestClassifier(n_estimators=300,
                            criterion = 'entropy',
                            max_depth=None,
                            random_state=10,
                            max_features=3,
                            min_samples_leaf=2,
                            n_jobs=-1
                          )



rf.fit(X_train_scaled, y_train)

rf_pred_train = rf.predict(X_train_scaled)
rf_pred_test = rf.predict(X_test_scaled)

proba_train = rf.predict_proba(X_train_scaled)
proba_test = rf.predict_proba(X_test_scaled)

print(accuracy_score(y_train, rf_pred_train), accuracy_score(y_test, rf_pred_test))
print(roc_auc_score(y_train, proba_train[:,1]), roc_auc_score(y_test, proba_test[:, 1]))

0.9982555604012211 0.8260427263479145
0.9999968030946044 0.8949281984334204


In [4846]:



X_s = scaler.transform(X)

rf_pred_X = rf.predict(X_s)
print(accuracy_score(y, rf_pred_X))

0.9465811965811965


In [4830]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
lr.fit(X_train_scaled, y_train)


lr_pred_train = lr.predict(X_train_scaled)
lr_pred_test = lr.predict(X_test_scaled)


print(accuracy_score(y_train, lr_pred_train), accuracy_score(y_test, lr_pred_test))

0.6096816397732229 0.6103763987792472
