In [1]:
import pandas as pd

## Imputing missing values using Imputer

In [2]:
from sklearn.preprocessing import Imputer

In [10]:
import os
df = pd.read_csv('Data.csv')
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [11]:
# replace every occurrence of missing_values to one defined by strategy
# which can be mean, median, mode. Axis = 0 means column, 1 means rows

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


## Encoding categorical data  

In [7]:
# Label Encoder will replace every categorical variable with number. Useful for replacing yes by 1, no by 0.
# One Hot Encoder will create a separate column for every variable and give a value of 1 where the variable is present
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [15]:
lable_encoder = LabelEncoder()
temp = df.copy()
temp.iloc[:, 0] = lable_encoder.fit_transform(df.iloc[:, 0])
temp.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63777.777778,Yes


In [20]:
# you can pass an array of indices of categorical features
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, 0])

# you can achieve the same thing using get_dummies
pd.get_dummies(df.iloc[:, :-1])

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,0,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


## Train test split

In [30]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df[[x for x in df.columns if x != 'Purchased']], 
                                                    df['Purchased'], test_size=0.2)

In [31]:
x_train

Unnamed: 0,Country,Age,Salary
8,Germany,50.0,83000.0
4,Germany,40.0,63777.777778
0,France,44.0,72000.0
3,Spain,38.0,61000.0
2,Germany,30.0,54000.0
9,France,37.0,67000.0
7,France,48.0,79000.0
1,Spain,27.0,48000.0


In [32]:
x_test

Unnamed: 0,Country,Age,Salary
6,Spain,38.777778,52000.0
5,France,35.0,58000.0


## Feature Scaling

Because in Machine Learning models, features are mapped into n-dimensional space. So let's say there are two variables (x, y) which will be mapped in 2D co-ordinate system. If one variable, say y, is very huge and other, x, is very small, then the euclidean distance will be dominated by the bigger one and smaller one will be ignored. In this case we are losing valuable information, hence feature scaling is used to solve this problem. 

#### There are 2 ways to normalise. 
1. Normalization: 
$$X_{norm} = \frac{X - X_{min}}{X_{max} - X{min}}$$

2. Standardization:
$$X_{stand} = \frac{X - \mu}{\sigma}$$

Normalization scales the data in the range of [0, 1] This is useful when the parameters have to be on same positive scale. But in this case, the outliers are lost. 

Standardization scales the data to have a mean of 0, and standard deviation of 1

For most applications, Standardization is recommended

In [34]:
from sklearn.preprocessing import StandardScaler # performing Stardardization
scaler = StandardScaler()
print 'Before'
print x_train
x_train.iloc[:, 1:] = scaler.fit_transform(x_train.iloc[:, 1:])
x_test.iloc[:, 1:] = scaler.transform(x_test.iloc[:, 1:])   # note we have already fit it, so we only transform this time
print 'After'
print x_train

Before
   Country   Age        Salary
8  Germany  50.0  83000.000000
4  Germany  40.0  63777.777778
0   France  44.0  72000.000000
3    Spain  38.0  61000.000000
2  Germany  30.0  54000.000000
9   France  37.0  67000.000000
7   France  48.0  79000.000000
1    Spain  27.0  48000.000000
After
   Country       Age    Salary
8  Germany  1.421536  1.528422
4  Germany  0.099177 -0.196974
0   France  0.628121  0.541056
3    Spain -0.165295 -0.446309
2  Germany -1.223182 -1.074633
9   France -0.297531  0.092254
7   France  1.157064  1.169380
1    Spain -1.619890 -1.613195
