# Data Preprocessing
### Steps to be followed are:
### i) Importing the libraries
### ii) Importing the dataset
### iii) Handling the missing values
### iv) Encoding the categorical data (both dependent and independent features)
### v) Splitting the dataset into training set and testing set
### vi) Feature Scaling

## Importing the libraries

In [1]:
import numpy as np  # Helps us in working with arrays
import matplotlib.pyplot as plt # Allows us to plot some charts
import pandas as pd # Helps us to import the dataset and also create matrix of features and dependent variable

## Importing the dataset

In [2]:
df = pd.read_csv(r'C:\Users\nisho\Documents\nishoak docs\Studies\Machine Learning\Coding_Part\02_Data_Preprocessing\02_Data.csv')

In [3]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [5]:
# X for the matrix features
X = df.iloc[ : , :-1].values  #[:, :-1] at start [:,] as we r selecting all the rows. [, :-1] --> All columns except last column
print(X) # iloc function helps us to take the indexes of the columns we want to extract from the dataset.   

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
y = df.iloc[ : , -1].values # y for the dependent variable vector
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [7]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer  # We r importing SimpleImputer class from sklearn.impute module.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])  # Applying this imputer object on the matrix of features
imputer.transform(X[:, 1:3]) # Calling the transformer object so that the missing values will be replaced by the mean of the respective feature
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [9]:
print(X) # We can see that there are no missing values present  

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer ( missing_values=np.nan , strategy = 'median')
imputer.fit(X[ : , 1:3 ])
X[ : , 1:3 ] = imputer.transform(X[ : , 1:3 ])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=, remainder=)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# transformers must contain 3 things i) What we want to apply ii) What kind of encoding we want to apply iii) The columns on which we want to apply
# remainders = 'passthrough' so that it will even print the columns on which we haven't done any encoding
ct.fit_transform(X)
X = np.array(ct.fit_transform(X))

In [12]:
print(X) # We can see that 3 new columns will be added

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [14]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [15]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split( X , y , test_size = 0.2 , random_state = 1)

In [16]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [17]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [18]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [19]:
print(y_test)

[0 1]


## Feature Scaling - Must be done after splitting the dataset
### i) Standardisation, X = ( X- mean(X) ) / standard deviation  --> Works all the time
### ii) Normalisation, X = ( X- min(X) ) / ( max(X) - min(X) )

In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler() # We should not apply feature scaling on the dummy variables
X_train[ : , 3: ] = sc.fit_transform(X_train[ : , 3: ])
X_test[ : , 3: ] = sc.fit(X_test[ : , 3: ])

In [21]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [22]:
print(X_test)

[[0.0 1.0 0.0 StandardScaler() StandardScaler()]
 [1.0 0.0 0.0 StandardScaler() StandardScaler()]]
