# Feature Scaling and Selection 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the vehicles CSV; it is downloaded each time this cell runs.
VEHICLES_URL = 'https://raw.githubusercontent.com/loukjsmalbil/datasets_ws/master/vehicles.csv'
vehicles = pd.read_csv(VEHICLES_URL)

In [None]:
# Preview the first five rows of the raw vehicles table.
vehicles.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [None]:
# Keep only the float64 columns. Integer-typed columns are NOT selected by
# this filter, so they do not take part in the scaling below.
float_dtypes = ['float64']
cars_numeric = vehicles.select_dtypes(include=float_dtypes)

In [None]:
# Preview the float-only subset that will be scaled.
cars_numeric.head(n=5)

Unnamed: 0,Engine Displacement,Cylinders,Fuel Barrels/Year,CO2 Emission Grams/Mile
0,2.5,4.0,19.388824,522.764706
1,4.2,6.0,25.354615,683.615385
2,2.5,4.0,20.600625,555.4375
3,4.2,6.0,25.354615,683.615385
4,3.8,6.0,20.600625,555.4375


## Example of Standardisation of one Feature

Recall that standardisation is simply an application of the formula below, where $\mu$ is the mean of the feature and $\sigma$ is its standard deviation:

$$z_{i} = \frac{x_{i} - \mu}{\sigma}$$

In [None]:
# Z-score the engine displacement column by hand: centre it on its mean, then
# divide by its standard deviation. Note that pandas' `.std()` defaults to the
# sample standard deviation (ddof=1).
engine_disp = cars_numeric['Engine Displacement']
eng_disp_stand = (engine_disp - engine_disp.mean()) / engine_disp.std()
eng_disp_stand

0       -0.616813
1        0.633743
2       -0.616813
3        0.633743
4        0.339494
           ...   
35947   -1.720245
35948   -1.720245
35949   -1.720245
35950   -1.793807
35951   -1.793807
Name: Engine Displacement, Length: 35952, dtype: float64

## Standardisation for Numerical Columns using SKlearn



In [None]:
from sklearn import preprocessing

In [None]:
# Standardise every column of `cars_numeric` with scikit-learn.
# `fit_transform` hands back a bare NumPy array, so the original column labels
# and row index are re-attached explicitly; without them the resulting frame
# only has anonymous positional column names (0, 1, 2, ...).
scaler = preprocessing.StandardScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(cars_numeric),
    columns=cars_numeric.columns,
    index=cars_numeric.index,
)

In [None]:
# Preview the standardised values.
scaled_df.head(n=5)

Unnamed: 0,0,1,2,3
0,-0.616822,-1.005601,0.398406,0.398528
1,0.633752,0.133841,1.733866,1.749543
2,-0.616822,-1.005601,0.669671,0.672953
3,0.633752,0.133841,1.733866,1.749543
4,0.339499,0.133841,0.669671,0.672953


## Example of Normalisation of one Feature

Recall that min-max normalisation is simply an application of the formula below, which rescales the feature to the range $[0, 1]$:

$$x'_{i} = \frac{x_{i} - x_{min}}{x_{max} - x_{min}}$$

In [None]:
# Min-max scale the engine displacement column by hand: subtract the column
# minimum, then divide by the column's range (max - min).
engine_disp = cars_numeric['Engine Displacement']
disp_min = engine_disp.min()
disp_max = engine_disp.max()
eng_disp_norm = (engine_disp - disp_min) / (disp_max - disp_min)
eng_disp_norm

0        0.243590
1        0.461538
2        0.243590
3        0.461538
4        0.410256
           ...   
35947    0.051282
35948    0.051282
35949    0.051282
35950    0.038462
35951    0.038462
Name: Engine Displacement, Length: 35952, dtype: float64

## Normalisation for Numerical Columns using SKlearn


In [None]:
# Show the unscaled columns again before normalising them.
cars_numeric.head(n=5)

Unnamed: 0,Engine Displacement,Cylinders,Fuel Barrels/Year,CO2 Emission Grams/Mile
0,2.5,4.0,19.388824,522.764706
1,4.2,6.0,25.354615,683.615385
2,2.5,4.0,20.600625,555.4375
3,4.2,6.0,25.354615,683.615385
4,3.8,6.0,20.600625,555.4375


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max normalise every column of `cars_numeric` with scikit-learn, using
# the `MinMaxScaler` name imported just above.
# `fit_transform` hands back a bare NumPy array, so the original column labels
# and row index are re-attached explicitly; without them the resulting frame
# only has anonymous positional column names (0, 1, 2, ...).
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(cars_numeric),
    columns=cars_numeric.columns,
    index=cars_numeric.index,
)

In [None]:
# Preview the min-max normalised values.
scaled_df.head(n=5)

Unnamed: 0,0,1,2,3
0,0.24359,0.142857,0.411014,0.394107
1,0.461538,0.285714,0.537873,0.524607
2,0.24359,0.142857,0.436782,0.420615
3,0.461538,0.285714,0.537873,0.524607
4,0.410256,0.285714,0.436782,0.420615
