In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## #1 Importing the dataset

In [17]:
#Location of the final dataframe
file = 'final_df.csv'

#Reading the CSV and discarding the unneeded 'Unnamed: 0' column in one step
#(presumably a row index that was written out when the file was saved - verify)
df = pd.read_csv(file).drop(columns = ['Unnamed: 0'])

In [23]:
#Displaying the loaded dataframe for a visual check
df

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.156857,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0.0
1,3.716080,129.422921,18630.057858,6.635246,336.094350,592.885359,15.180013,56.329076,4.500656,0.0
2,8.099124,224.236259,19909.541732,9.275884,330.449166,418.606213,16.868637,66.420093,3.055934,0.0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0.0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0.0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1.0
3272,7.808856,193.553212,17329.802160,8.061362,364.091541,392.449580,19.903225,64.327280,2.798243,1.0
3273,9.419510,175.762646,33155.578218,7.350233,327.357588,432.044783,11.039070,69.845400,3.298875,1.0
3274,5.126763,230.603758,11983.869376,6.303357,325.952434,402.883113,11.168946,77.488213,4.708658,1.0


**Because there are no categorical features, there is no need to create dummy features.**

___

## #2 Creating the train/test split and scaling the data for the model 

Since the target feature 'Potability' is a binary label (0 or 1), it does not need to be scaled. However, all of the other features do need to be scaled. I'm going to start with a Standard Scaler.

In [41]:
#Importing the StandardScaler and train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [36]:
#Separating the target feature from the other features.
#Both pieces are explicit copies of the data taken from df.
y = df['Potability'].copy()

X = df.drop(columns = ['Potability']).copy()

In [42]:
#Performing the train/test split with a test size of 25%.
#random_state pins the shuffle so that restarting the kernel (or re-running this
#cell) puts the same rows in each partition; without it the CSV files saved below
#and every downstream metric would change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .25, random_state = 42
)

In [48]:
#Creating the Standard Scaler with its default behaviour spelled out:
#centre each feature on its mean and scale it to unit variance
scaler = StandardScaler(with_mean = True, with_std = True)

In [49]:
#Fitting the scaler on the training features only, then scaling those same rows
X_train_scaled = scaler.fit(X_train).transform(X_train)

In [50]:
#Applying the already-fitted scaler to the test features (no re-fitting here)
X_test_scaled = scaler.transform(X = X_test)

___

## #3 Saving the train/test split for modeling

In [81]:
#Saving the X_train_scaled data.
#X_train_scaled is a bare NumPy array, so the feature names and the original row
#labels from X_train are re-attached before writing. This keeps the CSV header
#readable and keeps its row labels matching those written to y_train.csv.
pd.DataFrame(
    X_train_scaled, columns = X_train.columns, index = X_train.index
).to_csv('X_train_scaled.csv')

In [82]:
#Previewing the scaled training features
X_train_scaled

array([[-0.04065752,  0.73354149,  0.50636585, ..., -0.30373045,
         2.05920773, -0.49214303],
       [ 0.49393646,  1.91845957, -0.13513942, ..., -1.4110529 ,
         0.34864795,  0.72592857],
       [ 0.55224323,  0.45417305, -0.70103366, ...,  1.17709757,
         3.63000018, -1.42983011],
       ...,
       [-0.92752051,  1.03779728, -0.58171413, ..., -0.00971582,
        -0.73473724, -0.4369237 ],
       [ 1.5446067 ,  0.88046308,  0.41923997, ...,  0.78516069,
        -0.45004194,  0.80234474],
       [-1.02890056,  1.00153961,  1.4931661 , ...,  0.19529911,
        -1.21512387, -1.50066819]])

In [83]:
#Saving the X_test_scaled data.
#X_test_scaled is a bare NumPy array, so the feature names and the original row
#labels from X_test are re-attached before writing. This keeps the CSV header
#readable and keeps its row labels matching those written to y_test.csv.
pd.DataFrame(
    X_test_scaled, columns = X_test.columns, index = X_test.index
).to_csv('X_test_scaled.csv')

In [84]:
#Previewing the scaled test features
X_test_scaled

array([[ 0.52054247, -1.21420804,  0.68995852, ...,  1.36163155,
        -0.84743376,  1.21524378],
       [ 1.15694553,  2.26259551, -1.12503563, ...,  0.50499985,
         0.10866589,  2.0524329 ],
       [ 0.28294415,  0.62589991, -1.4266366 , ..., -0.92124857,
        -0.82793462, -0.28975052],
       ...,
       [ 0.78521969, -0.34700025, -0.05562609, ..., -0.15015417,
         0.35466531, -1.74709731],
       [ 0.4949676 , -0.36790893, -1.38750794, ..., -0.58763812,
         1.52116374,  0.93949046],
       [ 0.59693533, -0.28715363, -0.07547201, ..., -0.11301154,
         1.25333958,  1.45121498]])

In [85]:
#Writing the training labels to disk as a one-column table
y_train.to_frame().to_csv('y_train.csv')

In [86]:
#Previewing the training labels
y_train

2935    0.0
1757    0.0
2376    1.0
2281    0.0
3246    1.0
       ... 
2402    1.0
22      0.0
302     1.0
2894    0.0
2128    0.0
Name: Potability, Length: 2457, dtype: float64

In [87]:
#Writing the test labels to disk as a one-column table
y_test.to_frame().to_csv('y_test.csv')

In [88]:
#Previewing the test labels
y_test

776     1.0
2647    0.0
2037    1.0
1564    1.0
2780    1.0
       ... 
390     1.0
1726    0.0
3156    1.0
1402    0.0
682     1.0
Name: Potability, Length: 819, dtype: float64