# ML 101

This notebook walks through common methods for dataset pre-processing, cleaning, and normalization.

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Loading a sample dataset

Let us consider a toy dataset with only four columns (three features and one target, `Purchased`):
1. Country (String)
2. Age (Int)
3. Salary (Int)
4. Purchased (Yes/No)

In [8]:
# Read the toy dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../datasets/data_prep.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Nigeria,18.0,15000.0,No


In [9]:
# Preview the last five rows to check how the file ends.
df.tail(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
24,France,37.0,23000.0,Yes
25,Germany,45.0,50000.0,No
26,France,37.0,67000.0,Yes
27,Nigeria,30.0,30000.0,Yes
28,Nigeria,29.0,15000.0,No


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded data.
# A `count` lower than the number of rows signals missing values in that column.
df.describe()

Unnamed: 0,Age,Salary
count,27.0,28.0
mean,36.925926,53642.857143
std,8.757089,19216.532785
min,18.0,15000.0
25%,30.0,44750.0
50%,37.0,53000.0
75%,44.0,67000.0
max,50.0,83000.0


The dataset may contain duplicated rows caused by errors during data acquisition.

In [17]:
# Remove fully duplicated rows, printing the duplicates before and after
# so the effect of the clean-up is visible.
duplicate_rows = df.loc[df.duplicated(keep='first')]
print(f'{duplicate_rows}')
# Keep the first occurrence of every row and discard the repeats.
df = df.drop_duplicates(keep='first')
# Second check: this frame should now be empty.
duplicate_rows = df.loc[df.duplicated(keep='first')]
print(f'{duplicate_rows}')
# Statistics of the de-duplicated data.
df.describe()

Empty DataFrame
Columns: [Country, Age, Salary, Purchased]
Index: []
Empty DataFrame
Columns: [Country, Age, Salary, Purchased]
Index: []


Unnamed: 0,Age,Salary
count,23.0,24.0
mean,36.782609,51541.666667
std,8.852101,19352.517344
min,18.0,15000.0
25%,30.0,43750.0
50%,37.0,51000.0
75%,44.0,62500.0
max,50.0,83000.0


Another common issue is the presence of missing values.

In [22]:
# Count the missing entries in each column.
df.isna().sum()

Country      1
Age          2
Salary       1
Purchased    1
dtype: int64

In [10]:
# Dropping categorical data rows with missing values.
# `Country` and `Purchased` are categorical, so rows missing either one are
# removed instead of being filled in.
# `dataset` is built from the de-duplicated `df` (rather than mutating a name
# that no earlier cell defines) so this cell runs on a fresh kernel and can be
# re-run without side effects on `df`.
dataset = df.dropna(how='any', subset=['Country', 'Purchased'])

In [None]:
# Show the frame after rows lacking a Country or Purchased value were dropped.
print(dataset)

In [None]:
# Summary statistics of `dataset` after the rows with missing categorical values were removed.
dataset.describe()

In [11]:
# Separate the independent variables (features) from the dependent variable (target).
# X holds Country, Age and Salary as a NumPy array; y holds the Purchased labels.
X = dataset.loc[:, ['Country', 'Age', 'Salary']].values
y = dataset.loc[:, 'Purchased'].values

In [12]:
# Inspect the feature matrix; missing Age/Salary values have not been imputed yet.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Nigeria' 18.0 15000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]
 ['Nigeria' 50.0 60000.0]
 ['France' 22.0 30000.0]
 ['Nigeria' 35.0 43000.0]
 ['Spain' 34.0 44000.0]
 ['Spain' 33.0 48000.0]
 ['Nigeria' 29.0 77000.0]
 ['Spain' nan 57000.0]
 ['France' 44.0 48000.0]
 ['France' 37.0 23000.0]
 ['Germany' 45.0 50000.0]
 ['Nigeria' 30.0 30000.0]
 ['Nigeria' 29.0 15000.0]]


In [13]:
# Inspect the target labels (still the original strings at this point).
print(y)

['No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No']


In [14]:
# Preparation for replacing the missing values in the age and salary columns with the mean.
# Import the SimpleImputer class from the sklearn library
# (the replacement itself happens in the next cell).
from sklearn.impute import SimpleImputer
# Tip: run help(SimpleImputer) to see every available parameter.
# Columns 1 and 2 of X are Age and Salary; show them before imputation.
print(X[:, 1:3])

[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [18.0 15000.0]
 [40.0 nan]
 [35.0 58000.0]
 [nan 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]
 [50.0 60000.0]
 [22.0 30000.0]
 [35.0 43000.0]
 [34.0 44000.0]
 [33.0 48000.0]
 [29.0 77000.0]
 [nan 57000.0]
 [44.0 48000.0]
 [37.0 23000.0]
 [45.0 50000.0]
 [30.0 30000.0]
 [29.0 15000.0]]


In [15]:
# Fill the missing Age/Salary entries (columns 1 and 2 of X) with the column mean.
# NOTE(review): the means are learned from every row, i.e. before the
# train/test split performed later in this notebook. Confirm this is acceptable.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [16]:
# Show the Age and Salary columns again; the former NaN entries now hold the column means.
print(X[:, 1:3])

[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [18.0 15000.0]
 [40.0 50636.36363636364]
 [35.0 58000.0]
 [35.95238095238095 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]
 [50.0 60000.0]
 [22.0 30000.0]
 [35.0 43000.0]
 [34.0 44000.0]
 [33.0 48000.0]
 [29.0 77000.0]
 [35.95238095238095 57000.0]
 [44.0 48000.0]
 [37.0 23000.0]
 [45.0 50000.0]
 [30.0 30000.0]
 [29.0 15000.0]]


In [17]:
# Handling Categorical Data
# One-hot encode the Country column (index 0) and pass the remaining columns
# through unchanged; the encoded columns come first in the result.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('enconder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
ct.fit(X)
X = np.array(ct.transform(X))

In [18]:
# Inspect the encoded matrix: four one-hot country columns followed by Age and Salary.
print(X)

[[1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 0.0 18.0 15000.0]
 [0.0 1.0 0.0 0.0 40.0 50636.36363636364]
 [1.0 0.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 0.0 1.0 35.95238095238095 52000.0]
 [1.0 0.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 0.0 50.0 60000.0]
 [1.0 0.0 0.0 0.0 22.0 30000.0]
 [0.0 0.0 1.0 0.0 35.0 43000.0]
 [0.0 0.0 0.0 1.0 34.0 44000.0]
 [0.0 0.0 0.0 1.0 33.0 48000.0]
 [0.0 0.0 1.0 0.0 29.0 77000.0]
 [0.0 0.0 0.0 1.0 35.95238095238095 57000.0]
 [1.0 0.0 0.0 0.0 44.0 48000.0]
 [1.0 0.0 0.0 0.0 37.0 23000.0]
 [0.0 1.0 0.0 0.0 45.0 50000.0]
 [0.0 0.0 1.0 0.0 30.0 30000.0]
 [0.0 0.0 1.0 0.0 29.0 15000.0]]


In [19]:
# Show the target labels once more before encoding them (y is unchanged since it was created).
print(y)

['No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No']


In [20]:
# Convert the string labels of the target into integer class ids.
from sklearn.preprocessing import LabelEncoder
# Learn the label vocabulary first, then map every label to its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [21]:
# Inspect the encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0]


In [22]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [23]:
# Inspect the training features (not yet scaled).
print(X_train)

[[0.0 0.0 1.0 0.0 29.0 77000.0]
 [0.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 0.0 22.0 30000.0]
 [0.0 1.0 0.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 0.0 18.0 15000.0]
 [1.0 0.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 0.0 40.0 50636.36363636364]
 [0.0 0.0 0.0 1.0 35.95238095238095 57000.0]
 [0.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 0.0 1.0 35.95238095238095 52000.0]
 [1.0 0.0 0.0 0.0 37.0 23000.0]
 [1.0 0.0 0.0 0.0 44.0 48000.0]
 [0.0 0.0 1.0 0.0 50.0 60000.0]
 [0.0 0.0 1.0 0.0 29.0 15000.0]
 [0.0 0.0 1.0 0.0 35.0 43000.0]
 [0.0 0.0 0.0 1.0 33.0 48000.0]
 [1.0 0.0 0.0 0.0 48.0 79000.0]]


In [24]:
# Inspect the held-out test features (not yet scaled).
print(X_test)

[[0.0 1.0 0.0 0.0 45.0 50000.0]
 [1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 0.0 30.0 30000.0]
 [1.0 0.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 0.0 1.0 34.0 44000.0]]


In [25]:
# Inspect the training labels.
print(y_train)

[1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1]


In [26]:
# Inspect the test labels.
print(y_test)

[0 0 1 1 1]


In [27]:
# Feature Scaling
# Standardise the numeric columns (index 4 onwards: Age and Salary); the
# one-hot country columns in positions 0-3 are left untouched.
from sklearn.preprocessing import StandardScaler
# Learn mean and standard deviation from the training rows only ...
sc = StandardScaler().fit(X_train[:, 4:])
# ... then overwrite the training columns with their standardised values.
X_train[:, 4:] = sc.transform(X_train[:, 4:])

In [28]:
# Inspect the training features after scaling the last two columns.
print(X_train)

[[0.0 0.0 1.0 0.0 -0.746521345589145 1.351588046193331]
 [0.0 0.0 0.0 1.0 0.28796566772827725 0.5327257531467691]
 [1.0 0.0 0.0 0.0 -1.55112235594714 -1.0538199396309444]
 [0.0 1.0 0.0 0.0 1.6672816854848402 1.6586614060857916]
 [0.0 0.0 1.0 0.0 -2.0108943618659945 -1.8215033393620963]
 [1.0 0.0 0.0 0.0 0.1730226662485637 0.8397991130392297]
 [0.0 1.0 0.0 0.0 0.5178516706877044 0.0023263133325189464]
 [0.0 0.0 0.0 1.0 0.05260618850791099 0.3280101798851286]
 [0.0 0.0 0.0 1.0 -0.9764073485485721 -0.13259985995356244]
 [0.0 1.0 0.0 0.0 -0.6315783441094314 0.17447349993889827]
 [0.0 0.0 0.0 1.0 0.05260618850791099 0.07211571330807803]
 [1.0 0.0 0.0 0.0 0.1730226662485637 -1.4120721928388154]
 [1.0 0.0 0.0 0.0 0.9776236766065588 -0.13259985995356244]
 [0.0 0.0 1.0 0.0 1.6672816854848402 0.481546859831359]
 [0.0 0.0 1.0 0.0 -0.746521345589145 -1.8215033393620963]
 [0.0 0.0 1.0 0.0 -0.056863336710863466 -0.388494326530613]
 [0.0 0.0 0.0 1.0 -0.28674933967029065 -0.13259985995356244]
 [1.0 0.

In [29]:
# The test features are still on their original scale at this point.
print(X_test)

[[0.0 1.0 0.0 0.0 45.0 50000.0]
 [1.0 0.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 0.0 30.0 30000.0]
 [1.0 0.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 0.0 1.0 34.0 44000.0]]


In [30]:
# Scale the test columns with the scaler already fitted on the training set
# (transform only, no re-fitting), so both sets share the same mean and std.
# This overwrites X_test in place: running the cell twice scales the data twice.
X_test[:, 4:] = sc.transform(X_test[:, 4:])

In [31]:
# Inspect the test features after scaling the last two columns.
print(X_test)

[[0.0 1.0 0.0 0.0 1.0925666780862724 -0.030242073322742206]
 [1.0 0.0 0.0 0.0 0.9776236766065588 1.0956935796162803]
 [0.0 0.0 1.0 0.0 -0.6315783441094314 -1.0538199396309444]
 [1.0 0.0 0.0 0.0 -0.056863336710863466 0.37918907320053874]
 [0.0 0.0 0.0 1.0 -0.17180633819057706 -0.3373154332152029]]
