In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [60]:
#1 - Pick a problem that interests you and find a dataset

# Load the water potability CSV (relative path) into a DataFrame
df = pd.read_csv(filepath_or_buffer='water_potability.csv')

# Display (rows, columns) as the cell output
df.shape

(3276, 10)

In [61]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [62]:
#2 - Describe the problem and how ML can help
# I chose this problem because many countries around the world have limited access to potable water; they could
# benefit from a model trained on several water quality metrics that makes it easy to tell whether water is potable.

In [63]:
#3 - Prepare the data and run EDA

In [64]:
#3.1 - Normalise every column label to lower case
df.columns = df.columns.map(str.lower)

# Preview the frame with its renamed columns
df.head(n=5)

Unnamed: 0,ph,hardness,solids,chloramines,sulfate,conductivity,organic_carbon,trihalomethanes,turbidity,potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [65]:
#3.2 - Count the missing values in each column
print("Number of NaNs per column:")
print(df.isnull().sum())

Number of NaNs per column:
ph                 491
hardness             0
solids               0
chloramines          0
sulfate            781
conductivity         0
organic_carbon       0
trihalomethanes    162
turbidity            0
potability           0
dtype: int64


In [66]:
#3.3 - Fill the NaNs with each column's median; filling with 0 would imply zero ph, sulfate or trihalomethanes
# NOTE(review): the medians are taken over every row of df, before the train/validation/test split is made
# further down, so held-out rows contribute to the fill values - confirm this is acceptable.

# Compute each median once so the printed value and the fill value are the same number
median_by_column = {
    'ph': df['ph'].median(),
    'sulfate': df['sulfate'].median(),
    'trihalomethanes': df['trihalomethanes'].median(),
}

# Report the medians
print("Median PH: ", median_by_column['ph'])
print("Median Sulfate: ", median_by_column['sulfate'])
print("Median Trihalomethanes: ", median_by_column['trihalomethanes'])

# Replace the NaNs column by column
for col, fill_value in median_by_column.items():
    df[col] = df[col].fillna(fill_value)

# Verify that no NaNs remain
print("--------")
print("Number of NaNs per column:")
print(df.isnull().sum())

Median PH:  7.036752103833548
Median Sulfate:  333.073545745888
Median Trihalomethanes:  66.62248509808484
--------
Number of NaNs per column:
ph                 0
hardness           0
solids             0
chloramines        0
sulfate            0
conductivity       0
organic_carbon     0
trihalomethanes    0
turbidity          0
potability         0
dtype: int64


In [67]:
# Preview the first five rows again to confirm the NaNs were replaced
df.head(n=5)

Unnamed: 0,ph,hardness,solids,chloramines,sulfate,conductivity,organic_carbon,trihalomethanes,turbidity,potability
0,7.036752,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,333.073546,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,333.073546,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [68]:
#3.4 - Shuffle the rows: they seem to be grouped in chunks of 0s and 1s, which could cause problems for the model
# frac=1 samples every row; random_state=42 makes the shuffle repeatable
df = df.sample(frac=1, random_state=42)
# Discard the old row labels so the index runs 0..n-1 again
df = df.reset_index(drop=True)
df.head(n=5)

Unnamed: 0,ph,hardness,solids,chloramines,sulfate,conductivity,organic_carbon,trihalomethanes,turbidity,potability
0,7.036752,183.521107,20461.25271,7.333212,333.119476,356.369022,20.179029,67.019903,4.886634,0
1,6.643159,188.913541,32873.820022,6.791509,333.848842,336.561501,14.70681,67.844849,4.562198,1
2,7.846058,224.058877,23264.109968,5.922367,300.40262,387.971336,13.406737,43.075186,2.487969,0
3,7.160467,183.08931,6743.346066,3.803036,277.599099,428.036344,9.799625,90.035374,3.884891,0
4,6.61535,179.240661,26392.863612,9.30916,333.073546,496.363562,12.786595,78.262369,4.453443,1


In [80]:
#3.5 - Split data
# Target distribution: 60% train / 20% validation / 20% test, using train_test_split with random_state=1
# Step 1 holds out 20% of all rows for test; step 2 takes 25% of the remaining 80% (20% of all rows) for validation
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

total_rows = len(df)
print("DataFrame Size: ", total_rows)

# Row counts of each partition, in the same order as the frames in the tuple
sizeFullTrainDataframe, sizeTrainDataframe, sizeValDataframe, sizeTestDataframe = map(
    len, (df_full_train, df_train, df_val, df_test)
)

print("Size of Full Train Dataframe: ", sizeFullTrainDataframe, " - Size of Train dataframe: ", sizeTrainDataframe, 
      " - Size of Validation Dataframe: ", sizeValDataframe, " - Size of Test dataframe: ", sizeTestDataframe)

# Sanity check: the three disjoint partitions should account for every row
rows_add_up = total_rows == sizeTrainDataframe + sizeValDataframe + sizeTestDataframe
print("Is the number of rows of dataframe equals to the sum of the Train, Validation and Test dataframe? ", 
      rows_add_up)

DataFrame Size:  3276
Size of Full Train Dataframe:  2620  - Size of Train dataframe:  1965  - Size of Validation Dataframe:  655  - Size of Test dataframe:  656
Is the number of rows of dataframe equals to the sum of the Train, Validation and Test dataframe?  True
