In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# later cells can read the dataset from, and write results to, that location.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

In [3]:
# Path of the raw water-potability CSV inside the mounted Drive folder.
file_path = '/content/drive/MyDrive/water_potability.csv'
# Load the CSV into a DataFrame; the following cells operate on `df`.
df = pd.read_csv(file_path)

In [4]:
# First look at the raw data: leading rows, schema summary, and the number of
# missing values in each column.
print("First few rows of the dataset:")
print(df.head())

print("\nDataset summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nCheck for missing values:")
print(df.isnull().sum())

First few rows of the dataset:
         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  

Dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 # 

In [5]:
# Tally the rows that exactly repeat an earlier row (the first occurrence of
# each row is not counted as a duplicate).
duplicates = df.duplicated(keep='first').sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Rebuild the frame only when there is something to drop; a zero count leaves
# `df` untouched and prints nothing further.
if duplicates:
    df = df.drop_duplicates(keep='first')
    print(f"{duplicates} duplicate rows removed.")



Number of duplicate rows: 0


In [6]:
# Fill missing feature values with the per-column mean.
#
# Only the feature columns go through the imputer. The 'Potability' label is
# deliberately excluded: mean-imputing it would cast the 0/1 label to float and
# would fabricate fractional class values if a label were ever missing.
imputer = SimpleImputer(strategy='mean')
feature_columns = [col for col in df.columns if col != 'Potability']

# fit_transform returns a plain array, so the frame is rebuilt with the feature
# names and a fresh 0..n-1 index (the same index the later cells build on).
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[feature_columns]),
    columns=feature_columns,
)

# Attach the label by position (not by index label) so it stays aligned with
# the fresh index even if rows were dropped from `df` earlier.
df_imputed['Potability'] = df['Potability'].to_numpy()

print("\nMissing values after imputation:")
print(df_imputed.isnull().sum())


Missing values after imputation:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64


In [7]:
# Standardise the feature columns; the final column is treated as the label
# and is carried over without scaling.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_imputed.iloc[:, :-1])

# Rebuild a labelled frame from the scaled array, then re-attach the label
# column from the imputed frame.
df_scaled = pd.DataFrame(
    scaled_features,
    columns=df_imputed.columns[:-1],
).assign(Potability=df_imputed['Potability'])

print("\nScaled data preview:", df_scaled.head(), sep="\n")


Scaled data preview:
             ph  Hardness    Solids  Chloramines       Sulfate  Conductivity  \
0 -6.043133e-16  0.259195 -0.139471     0.112415  9.613574e-01      1.708954   
1 -2.289339e+00 -2.036414 -0.385987    -0.307694  3.145987e-15      2.062575   
2  6.928678e-01  0.847665 -0.240047     1.360594  3.145987e-15     -0.094032   
3  8.409504e-01  0.547651  0.000493     0.592008  6.395190e-01     -0.778830   
4  1.368569e+00 -0.464429 -0.460249    -0.363698 -6.541765e-01     -0.343939   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       -1.180651         1.306149  -1.286298         0.0  
1        0.270597        -0.638480   0.684218         0.0  
2        0.781117         0.001509  -1.167365         0.0  
3        1.255134         2.152874   0.848412         0.0  
4       -0.824357        -2.181599   0.138786         0.0  


In [8]:
# Pairwise Pearson correlation between every pair of columns of the scaled
# frame (features and label alike).
correlation_matrix = df_scaled.corr(method='pearson')
print("\nCorrelationmatrix:", correlation_matrix, sep="\n")


Correlationmatrix:
                       ph  Hardness    Solids  Chloramines   Sulfate  \
ph               1.000000  0.075833 -0.081884    -0.031811  0.014403   
Hardness         0.075833  1.000000 -0.046899    -0.030054 -0.092766   
Solids          -0.081884 -0.046899  1.000000    -0.070148 -0.149840   
Chloramines     -0.031811 -0.030054 -0.070148     1.000000  0.023791   
Sulfate          0.014403 -0.092766 -0.149840     0.023791  1.000000   
Conductivity     0.017192 -0.023915  0.013831    -0.020486 -0.014059   
Organic_carbon   0.040061  0.003610  0.010242    -0.012653  0.026909   
Trihalomethanes  0.002994 -0.012690 -0.008875     0.016627 -0.025605   
Turbidity       -0.036222 -0.014449  0.019546     0.002363 -0.009790   
Potability      -0.003287 -0.013837  0.033743     0.023779 -0.020619   

                 Conductivity  Organic_carbon  Trihalomethanes  Turbidity  \
ph                   0.017192        0.040061         0.002994  -0.036222   
Hardness            -0.023915    

In [9]:
# Absolute z-score of every feature value (all columns except the last one).
z_scores = np.absolute(
    zscore(df_scaled.iloc[:, :-1], axis=0)
)

# A row is flagged as an outlier when any of its features lies more than three
# standard deviations from that feature's mean.
outliers = (z_scores > 3).any(axis=1)

# The flagged rows are only counted here; this cell does not remove them.
print(f"\nNumberof outliersdetected:{outliers.sum()}")


Numberof outliersdetected:148


In [None]:
# Destination file for the preprocessed dataset on the mounted Drive.
output_path = '/content/drive/MyDrive/water_potability_preprocessed.csv'
# Write `df_scaled` as-is; index=False keeps the row index out of the CSV so
# the file contains only the data columns.
df_scaled.to_csv(output_path, index=False)