In [20]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Loading and preprocessing dataset

In [23]:
# load and seperate data
data = pd.read_csv("water_potability.csv") 
cols = data.columns
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [24]:
# fill missing values 
imp_mean = SimpleImputer()
transformed = imp_mean.fit_transform(data)
transformed = pd.DataFrame(transformed, columns = cols)
labels = transformed.Potability
selected_features = pd.concat([transformed.Solids, transformed.Turbidity], axis=1)
all_features = transformed.iloc[:,:-1]
transformed.iloc[:,:-1].head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135
1,3.71608,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075


# Building initial Neural Network architecture

In [25]:
norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(selected_features)

In [26]:
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

def genSSS(X, Y, folds=10, test_size=10):
    sss = sklearn.model_selection.StratifiedShuffleSplit(n_splits=folds, test_size=test_size)

    x_train_folds = []
    x_test_folds = []
    y_train_folds = []
    y_test_folds = []
    prev = { 'depth' : 0, 'accuracy' : 0, 'clf' : X }
    
    for train_index, test_index in sss.split(X, Y):
        x_train_folds.append(np.array(X.iloc[train_index]))
        y_train_folds.append(np.array(Y.iloc[train_index]))
        x_test_folds.append(np.array(X.iloc[test_index]))
        y_test_folds.append(np.array(Y.iloc[test_index]))
        
    return x_train_folds,y_train_folds, x_test_folds, y_test_folds


x_train_folds,y_train_folds, x_test_folds, y_test_folds = genSSS(selected_features, labels)
x_train_folds[1].shape

(3266, 2)

In [27]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
import keras

def create_NN(activations: list, input_dims, hidd_dims: list):
    assert(len(activations)==len(hidd_dims))

    model = Sequential()
    model.add(norm_layer) 
    model.add(Dense(input_dims,input_shape=(input_dims,), activation='sigmoid'))
    
    for act, h_d in zip(activations, hidd_dims):
        print(act,h_d)
        model.add(Dense(h_d, activation=act, kernel_initializer = tf.keras.initializers.GlorotNormal()))
        
    model.add(Dense(1, activation='tanh'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Instansiate model and train

In [28]:
model = create_NN(['sigmoid','tanh','sigmoid','sigmoid','tanh']
                  ,2
                  ,[2,4,8,4,2])
history = model.fit(
    (x_train_folds[1]),
    (y_train_folds[1]),
    batch_size=64,
    epochs=20,
    validation_data=((x_test_folds[1]),
                     (y_test_folds[1]))
)

sigmoid 2
tanh 4
sigmoid 8
sigmoid 4
tanh 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
from keras.utils.vis_utils import plot_model

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, 2)                5         
 n)                                                              
                                                                 
 dense (Dense)               (None, 2)                 6         
                                                                 
 dense_1 (Dense)             (None, 2)                 6         
                                                                 
 dense_2 (Dense)             (None, 4)                 12        
                                                                 
 dense_3 (Dense)             (None, 8)                 40        
                                                                 
 dense_4 (Dense)             (None, 4)                 36        
                                                        

## Test with actual sensor values

In [33]:
tds = 2.1
tss = 492
into_nn = np.array([tds, tss]).reshape(1,2)
pred = model.predict(into_nn)

In [34]:
pred[0,0]

0.3874087

## Trying to diagnose and combat issue

In [36]:
clean = (labels==1)
clean_pred_vals = np.empty(np.sum(clean))
tds_c = transformed[clean].Solids
tss_c = transformed[clean].Turbidity
into_nn_c = np.array([tds_c, tss_c]).reshape(1278,2)
pred_c = model.predict(into_nn_c,batch_size=1278)

In [40]:
# average value for clean water
np.average(pred_c)

0.38691813

In [42]:
nclean = (labels==0)
nclean_pred_vals = np.empty(np.sum(nclean))
tds_n = transformed[nclean].Solids
tss_n = transformed[nclean].Turbidity
into_nn_n = np.array([tds_n, tss_n]).reshape(1998,2)
pred_n = model.predict(into_nn_n,batch_size=1998)

In [43]:
# average value for non-clean water
np.average(pred_n)

0.38691705