<a href="https://colab.research.google.com/github/oktaviacitra/classification/blob/main/WaterQuality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
from google.colab import drive

origin_path = "/content/drive"
drive.mount(origin_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import pandas as pd

folder_path = origin_path + "/MyDrive/Learning Journey/Water Quality"
dataset_path = folder_path + "/dataset.csv"
dataframe = pd.read_csv(dataset_path)
dataframe.head(3)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0


In [39]:
dataframe.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [40]:
values = {"ph": dataframe['ph'].mean(),
          "Sulfate": dataframe['Sulfate'].mean(),
          "Trihalomethanes": dataframe['Trihalomethanes'].mean()}
dataframe = dataframe.fillna(value=values)

In [41]:
dataframe.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [42]:
X = dataframe.iloc[:, 2:]
y = dataframe.iloc[:, -1]

X.shape, y.shape

((3276, 8), (3276,))

In [43]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

pipeline = Pipeline(
    [
        ('min/max scaler', MinMaxScaler()), 
        ('model', MLPClassifier(random_state=43))
    ]
)

In [44]:
from sklearn.model_selection import GridSearchCV

param_grid = {'model__alpha': [1e-5, 1e-4, 1e-3, 1e-2],
              'model__hidden_layer_sizes': [(15,10,5), (12,8,4), (10,5,3)],
              'model__max_iter': [50, 100, 150],
              'model__activation': ['tanh', 'relu'],
              'model__solver': ['sgd', 'adam'],
              'model__alpha': [0.0001, 0.05],
              'model__learning_rate': ['constant','adaptive']
              }

search = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=3
)

search.fit(X, y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('min/max scaler', MinMaxScaler()),
                                       ('model',
                                        MLPClassifier(random_state=43))]),
             n_jobs=-1,
             param_grid={'model__activation': ['tanh', 'relu'],
                         'model__alpha': [0.0001, 0.05],
                         'model__hidden_layer_sizes': [(15, 10, 5), (12, 8, 4),
                                                       (10, 5, 3)],
                         'model__learning_rate': ['constant', 'adaptive'],
                         'model__max_iter': [50, 100, 150],
                         'model__solver': ['sgd', 'adam']},
             scoring='neg_mean_squared_error', verbose=3)

In [45]:
search.best_params_

{'model__activation': 'tanh',
 'model__alpha': 0.0001,
 'model__hidden_layer_sizes': (15, 10, 5),
 'model__learning_rate': 'constant',
 'model__max_iter': 50,
 'model__solver': 'sgd'}

In [46]:
search.best_score_

0.0

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((2293, 8), (983, 8))