# 데이터셋 가져오기

In [None]:
import pandas as pd

# Read the water-potability CSV into a DataFrame and preview its first rows.
csv_path = 'water_potability.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the loaded table (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Show how many rows belong to each value of the 'Potability' target column.
df['Potability'].value_counts()

# 결측치 처리

In [None]:
# Fill the gaps in 'ph', 'Sulfate' and 'Trihalomethanes' with the mean of the
# rows that share the same 'Potability' value (0 or 1).
# NOTE(review): the fill values depend on the target column and are computed
# over all rows, before the train/validation/test split further down. Confirm
# this label-dependent imputation is intended, because held-out rows end up
# with feature values derived from their own labels.
class_means = {}
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    for label in (0, 1):
        in_class = df['Potability'] == label
        class_means[column, label] = df.loc[in_class, column].mean()
        df.loc[in_class & df[column].isna(), column] = class_means[column, label]

# Expose each per-class mean as its own variable as well.
ph_mean_0, ph_mean_1 = class_means['ph', 0], class_means['ph', 1]
sulf_mean_0, sulf_mean_1 = class_means['Sulfate', 0], class_means['Sulfate', 1]
trih_mean_0, trih_mean_1 = class_means['Trihalomethanes', 0], class_means['Trihalomethanes', 1]

In [None]:
# Count the 'ph' values that are still missing after the fill.
df['ph'].isna().sum()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# 데이터 분리

In [None]:
# Separate the target column from the feature columns.
y = df['Potability']
X = df.drop(columns='Potability')

# First hold out 20% of all rows as the test set, then take 25% of the
# remaining 80% as the validation set: 60/20/20 overall. Both splits are
# stratified on the target so the class ratio is kept in every part.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, stratify=y_rest, random_state=0
)

In [None]:
# Shapes of the train, validation and test feature sets, in that order.
tuple(part.shape for part in (X_train, X_val, X_test))

# Feature Scaling

In [None]:
# Fit the scaler on the training rows only, then apply that same
# standardisation to the train, validation and test features.
sc = StandardScaler().fit(X_train)
X_train, X_val, X_test = (sc.transform(split) for split in (X_train, X_val, X_test))

# 최적 파라미터

## RandomizedSearchCV

In [None]:
# Search space for the random forest.
# An integer max_features is a number of input columns, so candidates larger
# than the column count of X_train are dropped (they are either rejected or
# add nothing over "use every column", depending on the scikit-learn version).
# If no candidate fits, fall back to using every column.
n_features = X_train.shape[1]
max_features_grid = [m for m in range(5, 31, 2) if m <= n_features] or [n_features]

params = {
    'n_estimators': range(100, 1001, 100),
    'max_depth': range(1, 11),
    'max_features': max_features_grid,
}

rfc = RandomForestClassifier(random_state=0)
# random_state fixes which 60 candidates are sampled, so that reruns of the
# search are reproducible like the split and the forest itself.
rs = RandomizedSearchCV(rfc,
                        params,
                        scoring='accuracy',
                        cv=4,
                        n_jobs=-1,
                        n_iter=60,
                        random_state=0)

In [None]:
# Run the randomized search on the training split (4-fold CV, accuracy scoring).
rs.fit(X_train, y_train)

In [None]:
# Cross-validated accuracy of the best candidate found by the search.
rs.best_score_

In [None]:
# Parameter combination with the best cross-validation score.
rs.best_params_

In [None]:
# Tabulate every sampled candidate, best-ranked first, and preview the top rows.
cv_table = pd.DataFrame(rs.cv_results_)
df_result = cv_table.sort_values(by='rank_test_score')
df_result.head()

In [None]:
# Keep the estimator the search selected as best, and display it.
best_model = rs.best_estimator_
best_model

In [None]:
# Accuracy of the selected model on the held-out test split.
test_pred = best_model.predict(X_test)
accuracy_score(y_test, test_pred)

In [None]:
# This helper is not defined or imported anywhere above, so it is defined here
# from the sklearn.metrics functions that are already imported.
def print_classification_metrics(y, pred, title=None):
    """Print the confusion matrix and the main classification scores.

    Args:
        y: True class labels.
        pred: Predicted class labels, aligned with ``y``.
        title: Optional heading printed above the metrics.
    """
    if title:
        print(title)
    print('-' * 45)
    print(confusion_matrix(y, pred))
    print('Accuracy :', accuracy_score(y, pred))
    print('Precision:', precision_score(y, pred))
    print('Recall   :', recall_score(y, pred))
    print('F1 score :', f1_score(y, pred))


print_classification_metrics(y_test, best_model.predict(X_test), title='Best Model Test')

In [None]:
# Feature importance scores of the selected forest, as a plain array.
fi_rs = best_model.feature_importances_
fi_rs

In [None]:
# Label each importance with its feature name. X is still the unscaled
# DataFrame, so its columns supply the names in the original column order.
fi_rs_s = pd.Series(data=fi_rs, index=X.columns)
fi_rs_s

In [None]:
# Horizontal bar chart of the importances, sorted in ascending order.
ranked_importances = fi_rs_s.sort_values()
ranked_importances.plot(kind='barh', figsize=(8, 6))