# Hands-on Data Preprocessing

# 0. Importing Libraries

In [263]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Importing The Dataset

In [264]:
# 1. create a variable to store the dataset
data_set = pd.read_csv('Data.csv')

# 2. Create 2 new Entities(1 for the Metrics of feature(usually the first column) and 2nd for the dependent variable vector(usually the last column))
# The Features(country, Age, Salary): is the column with which we're going to predict the dependent variable(Purchase)
#    : mean all row, all column/feature except -1(-1 mean except index of the last column).
X = data_set.iloc[:, :-1].values     # iloc: stand for loc index, will take the index we want to extract form dataset.
                                     # iloc: without upper and lower band ie [:] mean we're interested in all the range/row/column
                                     # values mean we  are taking the value of the specify columns/features.

# The dependent Variable(i.e Purchase): is what we want to predict and is most f the time th last column.
#        all the row, -1 mean last column
Y = data_set.iloc[:, -1 ].values     # ioc: used to collect/extract the index of the row and column that we want.

# Our future ML-Model expect exactly X and Y in their input.

In [265]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [266]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# 2. Handling Missing Data
Missing Data may cause some error when training the ML Model, therefor must be handle. <br>
- They are several ways to handle missing Data <br>
  - 1. By deleting it, especially when we have many data in our data points in our data set<br>
  - 2. Replacing the missing value by the average of all the values of the column in which the data is missing.

In [267]:
# 1. Replacing the missing value by the average of all the values of the column in which the data is missing.
# We are going to use sklearn to handle missing data, to do this we have to import the SimpleImputer class.
    # then create an object of SimpleImputer class which will allow us to store the mean/average
from sklearn.impute import SimpleImputer      # the SimpleImputer  class belong to the module impute
                                              # dot is used to access the module

# Creating an instance of the class i.e create an object to store the average
                        # first argument is to specify which value we want to replace.
                                                 # 2nd Argument is to specify with what we want to replace the missing value, here wih the mean
imputer = SimpleImputer(missing_values = np.nan, strategy='mean' )

# 2. Connecting the imputer to the features using the fit() method
# apply the imputer.fit()) only on numerical column.
imputer.fit(x[:, 1:3])  # Here we connect/fit imputer to the missing data from column 1 to (2 + 1)=3, +1 since the upperbound is excluded in python
                        # fit() will look for all the missing value in the numerical salary and age column
# fit() will also calculate the average
# transform() method will then replace the missing salary through the average and then return the new updated version of the matrix of feature.
    # the new updated version is then store in the matrix of feature.
X[:, 1:3] =imputer.transform(x[:, 1:3])

In [268]:
# Printing the matrix of feature to check if the update was successful.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# 3. Encoding Categorical data.
As we can see the data set contains one column with categories, i.e the first column
- Idea:
  - Here we perform One-Hot encoding
  - A popular method used to preprocess the data containing categorical variable.
    - One-Hot encoding consist of turn the country column/category into 3 separate 3 (i.e France, Spain, Germany)<br>
      because the are 3 (i.e France, Spain, Germany) different classes in the country category.
    - One-Hot encoding will create binary vector for each of the class/country.
      - France will for instance have the vector 100, spain 010, and Germany 001. So that the will not exist a numeric other <br>
        between  the two country.
        - i.e the country column will be replace by 3 new column containing their respective vector.

In [269]:
# 1. We need 2 classes i.e column-transform class from compose sklearn library and One-Hot encoder from the same sklearn library.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# 2. Create an object of the column-Transform class.
                # We have to give two arguments. 1. Transformer and 2. remainder
                # 1. Transformer: here we specify what kind of transformation and encoding we want to do and on which index of the column.
                # 2.remainder: here we specify the column which, we don't want to apply some transformation i.e. Age and Salary
                # must be done in a pair of square bracket and parenthesis.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0] )], # encoder is the type of transformation,  OneHotEncoder type of encoder we want, apply on column 0
                       remainder= 'passthrough')     # passthrough mean we don't want to apply One-hot encoder to the remainder column(Age and Salary)

# Now we have to connect our ct object to our matrix of feature X
# x = ct.fit_transform(x)     # with fit_transform() it's going to fit into our x and transform our matrix X and then the result will be store in x
                              # fit_transform() does not return X as a numpy array, since X will expected later during training as a numpy array ...
                              # .... we have to convert X now as a numpy array
X = np.array(ct.fit_transform(X))


In [270]:
# Checking the result by printing X
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


As we can see France has been encoded as 100, 
France  as 100
spain   as 001
Germany as 010

That is each country has been assigned a unique ID, that is our country has been turn into a numerical value.

# 4. Encoding the Independent Variables
In order to do that we are going to use another class call **label encoder**, <br>
which will convert the trend into zero and 0ne respectively.

In [271]:
# 1. we are going to use the label encoder class from the sklearn library called preprocessing.
from sklearn.preprocessing import LabelEncoder

# 2. we then create an object of the class
le = LabelEncoder()    # we don't need to input any thing in the parenthesis because its ..
                       # just one single vector that will be need to encode i.e. either zero or one

Y = le.fit_transform(Y)    # will fit/connect to Y and then convert/transform Y into either null or zero, the updated result 
                           # ... will be return to Y

# Checking our result.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# 4. Splitting
- Each time we want to train a machine learning model, we have to split our data into
  - Training set: used to train the ML model on existing observation in order to understand the correlation inside data set.
    - When the ML-model is train so well on the training set, so well that it doesn't perform well on  new observation.
  - Test set: used to evaluate the ML-modeL on new observation, in order to check that the is not over fitting on new observation.

In [272]:
# 1. sklearn contain a module which itself contain the train, test and split method.
from sklearn.model_selection import train_test_split


# 2. This method will create 4 separate set, because we will actually create a pair of matrix of feature and dependent variable for...
# ... the training set and another pair of matrix of feature an independent variable for the test set.
# i.e. X_train which is the matrix and feature of the training set
#      X_test which the matrix and feature of the test set.
#      Y_train which is the dependent variable of the training set
#      Y_test  which is the dependent variable of the test set.
X_train, X_test , Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 1 )  #  expect data of feature, independent variable as parameter, split size...
                                                                                              #  80% of the data in the training and 20% in the test
                                                                                              #  random_state = 1 for teaching purpose, so ..
                                                                                              # ... that we all have the same random factor/split.

# Observing the result


In [273]:

print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [274]:

print(X_test)


[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [275]:

print(Y_train)


[0 1 0 0 1 1 0 1]


In [276]:

print(Y_test)

[0 1]


# 5. Feature scaling
**Note**: Feature scaling should always be applied after splitting the data into training and test set.<br> 
- Feature scaling: consist of putting our variables all on the same scale.
- Feature scaling: consist of scaling our variable/feature to make sure they all take the value on the same scale, so as <br>
  to prevent one feature/value to dominant the order.
- We are not suppose to work with the train set for the training.
- Feature scale is a technique to get the mean and standard deviation of our features.
- If feature Scaling is done before the splitting then it will actually get mean and standard deviation of all the values<br>
  including the one in the test set and this will cause some information leakage, which we are not suppose to get, only on <br>
  on new data and new observation.
  - Answer: is to prevent information leakage from the test set, which we are not suppose to have until the training is done.<br>

**Note**: Feature scaling is not applicable for all ML-Model.
- Two most common techniques are 
  - Normalization: only work for specific cases.
  - Standardization: it work all the time and only apply after the split on both test set and train set separately


- Feature scaling will be applied to both X_train and X_test separately and scaler will be fitted to only X_train


In [277]:
# 1. import the class standard scaler, which will perform standardization on both metric of feature of the training and test sets.
from sklearn.preprocessing import StandardScaler

# 2. Create an object of the class
sc = StandardScaler()                     # does not take parameter

# 3. fit/connect the standardization(sc) to the training set and only on the numeric i.e salary and Age
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])     #  3: mean from 3 and above
                                                      # fit will calculate the mean and the standard deviation of each feature
                                                      # transform: will apply the calculated std and mean to the metric.

#  4. Applying the same scaler of the training set on to the test set, so that we can get the same transformation
# only apply the transform method on the test set, because the feature of test set need to scale by the same scaler that was..
# ... use on the training set.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [278]:
print(X_train)         # Our age and Variable was transformed so that they take new value in the range -3 and +3 

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [279]:
print(X_test)         # the age and salary were also scale so that they take value between -3 and +3

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]




# #################### Done ################## #

# ===> Building ML Model