# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [75]:
df = pd.read_csv('Data.csv')

In [4]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
df.shape

(10, 4)

In [6]:
df.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [40]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [43]:
df.Country.value_counts()

France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [44]:
df.apply(pd.Series.value_counts)     #getting value_counts for all features at once

Unnamed: 0,Country,Age,Salary,Purchased
27.0,,1.0,,
30.0,,1.0,,
35.0,,1.0,,
37.0,,1.0,,
38.0,,1.0,,
40.0,,1.0,,
44.0,,1.0,,
48.0,,1.0,,
50.0,,1.0,,
48000.0,,,1.0,


In [7]:
X = df.iloc[:,:3].values   #values is put for getting independenet variables in matrix form 
print(X)                   #gives out numpy array
X_table = df.iloc[:,:3]    #table form 
print(X_table)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [8]:
y = df.iloc[:,3].values   #values is put for getting independenet variables in matrix form, gives out a numpy array
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [9]:
type(y)

numpy.ndarray

### Missing Data

Finding and replacing NaNs in the data


Most probably we don't delete the data point with NaN in it to as other columns for same data point may contain some important information 

In [10]:
df.isnull()    #df.dropna() is used to drop all the rows containing even a single NaN

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [11]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [12]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)



In [13]:
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])     #Nan in Age and Salary column is replaced with mean of ages and salaries of other data points

In [14]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [15]:
x = df[['Country', 'Age', 'Salary']]
x['Age']

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
6     NaN
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64

In [16]:
imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)   #used most_frequent number to fill the NaN
imputer = imputer.fit(x[['Age', 'Salary']])
x[['Age', 'Salary']] = imputer.transform(x[['Age', 'Salary']])
x                 #we can also use pandas "fillna" method

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,48000.0
5,France,35.0,58000.0
6,Spain,27.0,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [76]:
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)   #used most_frequent number to fill the NaN
imputer = imputer.fit(df[['Age', 'Salary']])
df[['Age', 'Salary']] = imputer.transform(df[['Age', 'Salary']])



### Categorical Data

Encoding categorical variables into numbers

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
labelencoder_x = LabelEncoder()
labelencoder_x.fit_transform(x['Country'])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [19]:
x['Country_Encoded'] = labelencoder_x.fit_transform(x['Country'])
x

Unnamed: 0,Country,Age,Salary,Country_Encoded
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,2
2,Germany,30.0,54000.0,1
3,Spain,38.0,61000.0,2
4,Germany,40.0,48000.0,1
5,France,35.0,58000.0,0
6,Spain,27.0,52000.0,2
7,France,48.0,79000.0,0
8,Germany,50.0,83000.0,1
9,France,37.0,67000.0,0


The above method is NOT good, as we are mathematically giving high weightage to one class and low weightage to another. We don't want to discriminate between the country classes  

In such case do OneHot encoding 

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
one_hot_encoder_x = OneHotEncoder(categorical_features= X[:,0],dtype=np.int8)

In [22]:
one_hot_encoder_x

OneHotEncoder(categorical_features=array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object),
              categories=None, drop=None, dtype=<class 'numpy.int8'>,
              handle_unknown='error', n_values=None, sparse=True)

In [96]:
#one_hot_encoder_x.fit_transform(x)      #figure out why isn't this working

For one hot encoding multiple series, we can Also use pd.get_dummies 

In [24]:
pd.get_dummies(x['Country'])     
x = x.join(pd.get_dummies(x['Country']))
x

Unnamed: 0,Country,Age,Salary,Country_Encoded,France,Germany,Spain
0,France,44.0,72000.0,0,1,0,0
1,Spain,27.0,48000.0,2,0,0,1
2,Germany,30.0,54000.0,1,0,1,0
3,Spain,38.0,61000.0,2,0,0,1
4,Germany,40.0,48000.0,1,0,1,0
5,France,35.0,58000.0,0,1,0,0
6,Spain,27.0,52000.0,2,0,0,1
7,France,48.0,79000.0,0,1,0,0
8,Germany,50.0,83000.0,1,0,1,0
9,France,37.0,67000.0,0,1,0,0


In [31]:
z = pd.get_dummies(x['Country'])    
# pd.concat([x_copy,z],axis=1)

Unnamed: 0,Country,Age,Salary,Country_Encoded,France,Germany,Spain,France.1,Germany.1,Spain.1
0,France,44.0,72000.0,0,1,0,0,1,0,0
1,Spain,27.0,48000.0,2,0,0,1,0,0,1
2,Germany,30.0,54000.0,1,0,1,0,0,1,0
3,Spain,38.0,61000.0,2,0,0,1,0,0,1
4,Germany,40.0,48000.0,1,0,1,0,0,1,0
5,France,35.0,58000.0,0,1,0,0,1,0,0
6,Spain,27.0,52000.0,2,0,0,1,0,0,1
7,France,48.0,79000.0,0,1,0,0,1,0,0
8,Germany,50.0,83000.0,1,0,1,0,0,1,0
9,France,37.0,67000.0,0,1,0,0,1,0,0


In [26]:
x_copy = x.copy()

In [33]:
x_copy

Unnamed: 0,Country,Age,Salary,Country_Encoded,France,Germany,Spain
0,France,44.0,72000.0,0,1,0,0
1,Spain,27.0,48000.0,2,0,0,1
2,Germany,30.0,54000.0,1,0,1,0
3,Spain,38.0,61000.0,2,0,0,1
4,Germany,40.0,48000.0,1,0,1,0
5,France,35.0,58000.0,0,1,0,0
6,Spain,27.0,52000.0,2,0,0,1
7,France,48.0,79000.0,0,1,0,0
8,Germany,50.0,83000.0,1,0,1,0
9,France,37.0,67000.0,0,1,0,0


In [35]:
pd.get_dummies(x['Country'],drop_first=True)    #get k-1 dummies out of k categorical levels by removing the first level

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,1,0
3,0,1
4,1,0
5,0,0
6,0,1
7,0,0
8,1,0
9,0,0


In [45]:
df    #for feature "Purchased" one hot encoding and label encoding would do the same thing as there are only two classes
labelencoder_x.fit_transform(df['Purchased'])

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [77]:
df = pd.concat([df, pd.get_dummies(df['Country']),pd.Series(labelencoder_x.fit_transform(df['Purchased']),name="Purchased_Encoded")], axis=1)
df

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain,Purchased.1
0,France,44.0,72000.0,No,1,0,0,0
1,Spain,27.0,48000.0,Yes,0,0,1,1
2,Germany,30.0,54000.0,No,0,1,0,0
3,Spain,38.0,61000.0,No,0,0,1,0
4,Germany,40.0,61000.0,Yes,0,1,0,1
5,France,35.0,58000.0,Yes,1,0,0,1
6,Spain,38.0,52000.0,No,0,0,1,0
7,France,48.0,79000.0,Yes,1,0,0,1
8,Germany,50.0,83000.0,No,0,1,0,0
9,France,37.0,67000.0,Yes,1,0,0,1


In [47]:
from sklearn.preprocessing import StandardScaler

In [50]:
sc_x = StandardScaler()
x_tre = sc_x.fit_transform(x[['Salary','Age']])
x_tre

array([[ 0.82020574,  0.8273403 ],
       [-1.18846138, -1.37028238],
       [-0.6862946 , -0.98246661],
       [-0.10043336,  0.05170877],
       [-1.18846138,  0.31025261],
       [-0.35151675, -0.336107  ],
       [-0.85368353, -1.37028238],
       [ 1.40606699,  1.34442799],
       [ 1.74084484,  1.60297184],
       [ 0.40173343, -0.07756315]])

### Train-Test Split

Here we make two set of the given data, one being our training set and other set for validation(testing on some part of the data). This is done to see how is our model working our own data, if the model isn't working with similar accuracies as the training data we modify hyperparameters and train the model again on training set. TTS is also done to avoid overfiting  

After finilizing the model, we test this model on a completely new test data set

In [60]:
from sklearn.model_selection import train_test_split as tts

In [83]:
X = df[['Age', 'Salary', 'France', 'Germany', 'Spain']]
y = df['Purchased_Encoded']

In [84]:
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.3, random_state=42)

In [86]:
X_train, X_test, y_train, y_test

(    Age   Salary  France  Germany  Spain
 0  44.0  72000.0       1        0      0
 7  48.0  79000.0       1        0      0
 2  30.0  54000.0       0        1      0
 9  37.0  67000.0       1        0      0
 4  40.0  61000.0       0        1      0
 3  38.0  61000.0       0        0      1
 6  38.0  52000.0       0        0      1,
     Age   Salary  France  Germany  Spain
 8  50.0  83000.0       0        1      0
 1  27.0  48000.0       0        0      1
 5  35.0  58000.0       1        0      0,
 0    0
 7    1
 2    0
 9    1
 4    1
 3    0
 6    0
 Name: Purchased_Encoded, dtype: int64,
 8    0
 1    1
 5    1
 Name: Purchased_Encoded, dtype: int64)

### Feature Scaling

When the features are not on same scale like in this example Age is in 50's scale whereas Salary is in 10000's scale. This is an issue in ML. 

Why is Feature Scalling done?
- To speed up the process of gradient descent
- Increase overall speed of training the model
- Keep all the data in same scale to avoid higher biases, no bias for one feature.
- Avoid domination of one feature on another (smaller numbers will be ignored/left behind with low weightage over large numbers by ML algorithms)

Various methods of Feature Scaling:
- Standardization
    - It is also called Z-score normalization. It calculates the z-score of each value and replaces the value with the calculated Z-score. The Z-score can be calculated by the following formula: [x' = (x-mean)/variance]  
    
    Where σ is the variance and x̄ is the mean. The features are then rescaled with x̄ =0 and σ=1
    
    Library used: StandardScalar
- Min-Max Scaling
    - It is also referred to as Normalization. The features are scaled between 0 and 1. The formula is given as: [x' = (x-min(x))/(max(x)-min(x))]
    
    Library used: MinMaxScaler
- Binarizing
    - It is used for binary thresholding of an array like matrix. 
    
    Library used: Binarizer
- Normalizer
    - It is used to rescale each sample.
    
    Library used: Normalizer
    
    

#### We don't need to apply feature scaling on "y" in case of classification problem


In [87]:
from sklearn.preprocessing import StandardScaler

In [91]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)     #used fit_transform bcoz, we fit and predict on the same data here
X_test = sc_X.transform(X_test)           #used transform only bcoz, already fitted in above statement, training and testing data should be fit on same scale

In [92]:
X_train     #dummie one-hot encoded features are also scalled to keep everything on same scale, but generally this the developers call and 
#depends on the context of the problem that whether to keep these dummie variables as it is or to scale them.

array([[ 0.8968186 ,  0.92684402,  1.15470054, -0.63245553, -0.63245553],
       [ 1.6577556 ,  1.70986742,  1.15470054, -0.63245553, -0.63245553],
       [-1.76646088, -1.08664471, -0.8660254 ,  1.58113883, -0.63245553],
       [-0.43482114,  0.36754159,  1.15470054, -0.63245553, -0.63245553],
       [ 0.13588161, -0.30362132, -0.8660254 ,  1.58113883, -0.63245553],
       [-0.24458689, -0.30362132, -0.8660254 , -0.63245553,  1.58113883],
       [-0.24458689, -1.31036568, -0.8660254 , -0.63245553,  1.58113883]])