In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## #1 Importing dataset from Kaggle

In [25]:
file = '/Users/tylermeester/GitHub/Springboard/water_quality_capstone_project/data/water_potability.csv'
df = pd.read_csv(file)

***

## #2 Exploring the dataset

In [26]:
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


**All of the datatypes are float64 (floating point numbers) or int64 (whole integers) and do not need to be changed for future use.**

In [28]:
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


**Column descriptions from data source:**

**1. ph: pH of 1. water (0 to 14).**\
**2. Hardness: Capacity of water to precipitate soap in mg/L.**\
**3. Solids: Total dissolved solids in ppm.**\
**4. Chloramines: Amount of Chloramines in ppm.**\
**5. Sulfate: Amount of Sulfates dissolved in mg/L.**\
**6. Conductivity: Electrical conductivity of water in μS/cm.**\
**7. Organic_carbon: Amount of organic carbon in ppm.**\
**8. Trihalomethanes: Amount of Trihalomethanes in μg/L.**\
**9. Turbidity: Measure of light emiting property of water in NTU.**\
**10. Potability: Indicates if water is safe for human consumption. Potable -1 and Not potable -0**

_ppm: parts per million\
μg/L: microgram per litre\
mg/L: milligram per litre_

___

## #3 Identifying and replacing NaN values.



In [29]:
#Determining how many NaN values are in each column
df.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [30]:
#Checking if any features have a entry with a value of 0, which should be treated as a NaN value.
df[df['ph'] == 0]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3014,0.0,214.846144,49456.587108,7.897539,,583.448849,7.702328,77.712891,4.92884,0


___

In [31]:
#Changing the value of ph from 0 to the mean of ph values.
df.loc[3014, ['ph']] = df['ph'].mean()

In [32]:
#Checking to make sure the change was successful.
df.loc[3014]

ph                     7.080795
Hardness             214.846144
Solids             49456.587108
Chloramines            7.897539
Sulfate                     NaN
Conductivity         583.448849
Organic_carbon         7.702328
Trihalomethanes       77.712891
Turbidity              4.928840
Potability             0.000000
Name: 3014, dtype: float64

**ph, Sulfate, and Trihalomethanes have quite a few NaN values and ph has a value that is 0.
I will fill these NaN values with the means.**

In [33]:
#Replacing values with the means
df['ph'].fillna((df['ph'].mean()), inplace = True)
df['Sulfate'].fillna((df['Sulfate'].mean()), inplace = True)
df['Trihalomethanes'].fillna((df['Trihalomethanes'].mean()), inplace = True)

In [34]:
#Checking the NaN value count after making changes.
df.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [35]:
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.083337,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.464737,32.879761,8768.570828,1.583085,36.142612,80.824064,3.308162,15.769881,0.780382,0.487849
min,0.227499,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.279317,176.850538,15666.690297,6.127421,317.094638,365.734414,12.065801,56.647656,3.439711,0.0
50%,7.083337,196.967627,20927.833607,7.130299,333.775777,421.884968,14.218338,66.396293,3.955028,0.0
75%,7.87005,216.667456,27332.762127,8.114887,350.385756,481.792304,16.557652,76.666609,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


**All of the NaN values have been replaced and there is an equal count among all features.**

___

## #4 Saving the dataset for further analysis

In [39]:
df.to_csv('final_df.csv')