In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [136]:
df = pd.read_csv('water_potability.csv')

## Checking for missing data

In [137]:
#checking for missing data
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

## Correcting missing data


In [138]:
#correcting all the missing data, with the mean() values of each column,inplace 
df['ph']=df['ph'].fillna(df['ph'].mean())
df['Sulfate']=df['Sulfate'].fillna(df['Sulfate'].mean())
df['Trihalomethanes']=df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean())
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [141]:
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.469956,32.879761,8768.570828,1.583085,36.142612,80.824064,3.308162,15.769881,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.277673,176.850538,15666.690297,6.127421,317.094638,365.734414,12.065801,56.647656,3.439711,0.0
50%,7.080795,196.967627,20927.833607,7.130299,333.775777,421.884968,14.218338,66.396293,3.955028,0.0
75%,7.87005,216.667456,27332.762127,8.114887,350.385756,481.792304,16.557652,76.666609,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [233]:
df.info

<bound method DataFrame.info of             ph    Hardness        Solids  Chloramines     Sulfate  \
0     7.080795  204.890455  20791.318981     7.300212  368.516441   
1     3.716080  129.422921  18630.057858     6.635246  333.775777   
2     8.099124  224.236259  19909.541732     9.275884  333.775777   
3     8.316766  214.373394  22018.417441     8.059332  356.886136   
4     9.092223  181.101509  17978.986339     6.546600  310.135738   
...        ...         ...           ...          ...         ...   
3271  4.668102  193.681735  47580.991603     7.166639  359.948574   
3272  7.808856  193.553212  17329.802160     8.061362  333.775777   
3273  9.419510  175.762646  33155.578218     7.350233  333.775777   
3274  5.126763  230.603758  11983.869376     6.303357  333.775777   
3275  7.874671  195.102299  17404.177061     7.509306  333.775777   

      Conductivity  Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       564.308654       10.379783        86.990970   2.963135

In [231]:
df.shape

(3276, 10)

## Creating a Class and its Predictors

In [232]:
# split dataset

x_water = df.iloc[:, :9].values
y_water = df.iloc[:,9].values
x_waterTrain,x_waterTest,y_waterTrain,y_waterTest = train_test_split(x_water,y_water,test_size=0.20,random_state=0)

In [166]:
print(x_water)

[[7.08079450e+00 2.04890455e+02 2.07913190e+04 ... 1.03797831e+01
  8.69909705e+01 2.96313538e+00]
 [3.71608008e+00 1.29422921e+02 1.86300579e+04 ... 1.51800131e+01
  5.63290763e+01 4.50065627e+00]
 [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 1.68686369e+01
  6.64200925e+01 3.05593375e+00]
 ...
 [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 1.10390697e+01
  6.98454003e+01 3.29887550e+00]
 [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 1.11689462e+01
  7.74882131e+01 4.70865847e+00]
 [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 1.61403676e+01
  7.86984463e+01 2.30914906e+00]]


In [167]:
print(y_water)

[0 0 0 ... 1 1 1]


In [165]:
scaler = StandardScaler()

X_train = scaler.fit_transform(x_waterTrain)
X_test = scaler.transform(x_waterTest)

***********************************

# KNearestNeighbours Classifier

In [199]:
# Define the model: Init K-NN
classifierKNN = KNeighborsClassifier(n_neighbors=18, p=2,metric='euclidean')

In [200]:
# Fit Model
classifierKNN.fit(X_train,y_waterTrain)

In [227]:
# Predict the test set results
y_pred = classifierKNN.predict(X_test)

### Confusion matrix

In [228]:
# Evaluate Model
cm = confusion_matrix(y_waterTest, y_pred)
print (cm)
print(accuracy_score(y_waterTest, y_pred))

[[375  37]
 [191  53]]
0.6524390243902439


In [205]:
# Print the Confusion Matrix with k =3 and slice it into four pieces

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_waterTest, y_pred)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[374  38]
 [215  29]]

True Positives(TP) =  374

True Negatives(TN) =  29

False Positives(FP) =  38

False Negatives(FN) =  215


In [208]:
from sklearn.metrics import classification_report

print(classification_report(y_waterTest, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.91      0.75       412
           1       0.43      0.12      0.19       244

    accuracy                           0.61       656
   macro avg       0.53      0.51      0.47       656
weighted avg       0.56      0.61      0.54       656



*******************************************

# Naive Bayes Classifier

In [225]:
#testing GaussianNB() method. Due to naive independence of predictors assumption
from sklearn.naive_bayes import GaussianNB


naive_water_quality = GaussianNB()

naive_water_quality.fit(X_train,y_waterTrain)


### Confusion matrix

In [224]:
naive_prediction = naive_water_quality.predict(X_test)
print( accuracy_score(y_waterTest,naive_prediction))

0.6173780487804879


In [226]:
print(classification_report(naive_prediction,y_waterTest))

              precision    recall  f1-score   support

           0       0.86      0.65      0.74       545
           1       0.21      0.47      0.29       111

    accuracy                           0.62       656
   macro avg       0.53      0.56      0.52       656
weighted avg       0.75      0.62      0.66       656

