# Data Preprocessing Tools

## Importing the libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [4]:
# Load the raw data; 'Data.csv' is looked up relative to the current working directory.
dataset = pd.read_csv('Data.csv')

# Split the dataset into:
#   - X: the matrix of features (independent variables), i.e. the columns used to make the prediction
#   - y: the dependent variable vector, i.e. what we want to predict (here, whether the customer bought a product)
# iloc selects by integer position: the first part is for rows, the second part is for columns.
# A bare colon (no lower or upper bound) means "everything" along that axis, so all rows are kept.
# A Python slice excludes its upper bound: 2:9 takes indices 2 through 8.
# -1 is the index of the last column, so :-1 takes every column from index 0 up to,
# but not including, the last one.
# to_numpy() is used rather than .values because it is the accessor the pandas
# documentation recommends and it always returns a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()

# No colon in the column part means a single index rather than a range: only the last column (-1).
y = dataset.iloc[:, -1].to_numpy()

In [8]:
# Inspect the feature matrix; nan marks the missing values handled in the next section
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [0]:
# Inspect the dependent variable vector (the 'Yes'/'No' labels)
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [10]:
# With a large dataset you can often just ignore or delete the rows that have missing data:
# it won't make a difference because plenty of data remains.
# With a small dataset, fill the gaps instead, as done below.

# SimpleImputer (from sklearn's impute module) replaces missing entries with a statistic
# computed separately for each column.
from sklearn.impute import SimpleImputer

# missing_values: which entries count as missing and should be replaced (here NaN).
# strategy: what to replace them with (here the column mean; 'median' and 'most_frequent'
# are other options).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 are passed in (the slice 1:3 excludes index 3): column 0 holds
# strings, and an average cannot be calculated for string values.
# fit_transform does both steps in one call: 'fit' computes the mean of each column from
# the values that are present, and 'transform' returns a copy of those columns with every
# missing entry replaced by that mean. The result is written back into the same columns of X.
# NOTE(review): the means are computed from all rows, including the ones that later end up
# in the test set. To keep the test set fully unseen, the imputer could instead be fitted on
# the training rows only, as is done for feature scaling further down.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [11]:
# Check the result: the former nan entries now hold the mean of their column
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [13]:
# Encoding categorical data means converting text categories into numbers,
# so that they can be fed to a model.

# One-hot encoding is the technique used here.
# The country column has three categories (France, Germany and Spain). Numbering them
# 0, 1, 2 would suggest an order between the countries that does not exist, so each row
# instead gets a vector of three 0/1 values (one new column per country) with a single 1
# marking its country.
# ColumnTransformer picks the column(s) to transform; OneHotEncoder does the encoding.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ct is an instance of ColumnTransformer, created with two arguments:
#   - transformers: a list of (name, transformer, columns) tuples. Here the name is the
#     label 'encoder', the transformer is a OneHotEncoder instance, and [0] is the index
#     of the column to encode.
#   - remainder: what to do with the columns that are not listed. 'passthrough' keeps
#     them unchanged instead of dropping them.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

# fit_transform learns the categories and applies the encoding in a single call.
X = ct.fit_transform(X)
# Store the result as a NumPy array, the form used for model training later on.
X = np.array(X)

# In the output the three encoded columns come first:
# France is 1 0 0, Germany is 0 1 0 and Spain is 0 0 1.

In [14]:
# Check the result: the country column is replaced by three 0/1 columns at the front of X
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [16]:
# The dependent variable has only two categories ('Yes' and 'No'),
# so plain integer codes (0 and 1) are enough here.
from sklearn.preprocessing import LabelEncoder

# LabelEncoder needs no arguments: it works on a single vector of labels.
le = LabelEncoder()

# First learn which labels exist, then replace every label with its integer code.
# Unlike X, the result is not wrapped in np.array: that is not needed for the dependent variable.
le.fit(y)
y = le.transform(y)

In [17]:
# Check the result: 'No' is encoded as 0 and 'Yes' as 1
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [43]:
# The training set is the part of the data the model learns from.
# The test set is the part held back to evaluate the trained model.
# The split produces four pieces (two pairs): X_train and X_test for the independent
# variables, y_train and y_test for the dependent variable.
# X_train and y_train are the training input; X_test and y_test are used for
# testing/prediction/inference.

# All of the above is done by the 'train_test_split' function from sklearn's
# 'model_selection' module.
from sklearn.model_selection import train_test_split

# Arguments used here:
#   - X, y: the data to split (independent and dependent variables)
#   - test_size=0.2: 20% of the rows go to the test set, so 80% go to the training set;
#     which rows go where is chosen randomly
#   - random_state=1 (optional): fixes the seed of the shuffling so that every
#     execution produces the same split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [44]:
# Training features: 8 of the 10 rows (80%)
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [45]:
# Test features: the remaining 2 rows (20%)
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [46]:
# Training labels, in the same row order as X_train
print(y_train)

[0 1 0 0 1 1 0 1]


In [47]:
# Test labels, in the same row order as X_test
print(y_test)

[0 1]


## Feature Scaling

In [0]:
# Feature scaling means normalizing/standardizing the range of the independent variables (features).
# The purpose is to make the ML model see all features on the same scale, so that a feature
# with large values does not dominate the others and cause the model to neglect them.
# N.B.: feature scaling is done after splitting the dataset into training and test sets.
# The test set is supposed to be brand-new data that played no part in training, so the
# scaling parameters must be computed from the training set only.
# Scaling before the split would leak information about the test rows into training.
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training set only.
# Columns 0-2 are the one-hot country columns and are left as they are; only the columns
# from index 3 onwards are scaled.
sc = StandardScaler().fit(X_train[:, 3:])

# Apply the same learned scaling to both sets; the test set is never used for fitting.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [0]:
# Scaled training features: the one-hot columns are unchanged, the last two columns are standardized
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [0]:
# Scaled test features, transformed with the scaler that was fitted on the training set
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
