# Encoding Categorical Data | Ordinal Encoding | Label Encoding

In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [127]:
# Load the raw customer records from the CSV file in the working directory.
df = pd.read_csv("customer.csv")

In [128]:
df.sample(10)

Unnamed: 0,age,gender,review,education,purchased
23,96,Female,Good,School,No
25,57,Female,Good,School,No
26,53,Female,Poor,PG,No
17,22,Female,Poor,UG,Yes
30,73,Male,Average,UG,No
5,31,Female,Average,School,Yes
2,70,Female,Good,PG,No
27,69,Female,Poor,PG,No
31,22,Female,Poor,School,Yes
10,98,Female,Good,UG,Yes


In [129]:
# gender, review, education and purchased are the categorical columns, and
# each kind calls for a different encoder:
#   review, education -> ordinal categorical variables -> OrdinalEncoder
#   gender            -> nominal categorical variable  -> OneHotEncoder
#   purchased         -> nominal target labels         -> LabelEncoder
# scikit-learn's ColumnTransformer could apply all of these in one step, but
# for now we work only with review, education and purchased.
#
# The columns are selected by name rather than by position (df.iloc[:, 2:])
# so that re-running this cell is harmless instead of dropping two more
# columns each time.
df = df[['review', 'education', 'purchased']]

In [130]:
df

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


In [168]:
from sklearn.model_selection import train_test_split

# Features are the two ordinal columns (as a NumPy array); the target is the
# `purchased` column. Columns are picked by name so the split does not depend
# on column order.
# random_state pins the shuffle so every run produces the same 80/20 split.
# Outputs recorded from an earlier, unseeded run will not match a fresh run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[['review', 'education']].values,
    df['purchased'],
    test_size=0.2,
    random_state=42,
)

In [169]:
X_train

array([['Average', 'School'],
       ['Good', 'PG'],
       ['Average', 'PG'],
       ['Average', 'PG'],
       ['Good', 'UG'],
       ['Poor', 'UG'],
       ['Average', 'UG'],
       ['Poor', 'PG'],
       ['Good', 'PG'],
       ['Poor', 'UG'],
       ['Poor', 'UG'],
       ['Good', 'UG'],
       ['Poor', 'PG'],
       ['Poor', 'UG'],
       ['Poor', 'PG'],
       ['Poor', 'PG'],
       ['Average', 'UG'],
       ['Poor', 'School'],
       ['Good', 'PG'],
       ['Poor', 'PG'],
       ['Good', 'UG'],
       ['Average', 'UG'],
       ['Good', 'PG'],
       ['Good', 'PG'],
       ['Good', 'School'],
       ['Poor', 'School'],
       ['Good', 'UG'],
       ['Poor', 'School'],
       ['Good', 'UG'],
       ['Good', 'School'],
       ['Good', 'School'],
       ['Good', 'School'],
       ['Average', 'School'],
       ['Poor', 'PG'],
       ['Poor', 'School'],
       ['Poor', 'PG'],
       ['Average', 'School'],
       ['Poor', 'PG'],
       ['Average', 'PG'],
       ['Good', 'PG']], dtype=object)

In [170]:
Y_train

20    Yes
33    Yes
24    Yes
37    Yes
11    Yes
17    Yes
32    Yes
43     No
47    Yes
1      No
16    Yes
9     Yes
19    Yes
15     No
39     No
45    Yes
29    Yes
28     No
2      No
26     No
10    Yes
8      No
3      No
42    Yes
38     No
7     Yes
36    Yes
12     No
49     No
18     No
6      No
25     No
34     No
14    Yes
35    Yes
27     No
13     No
46     No
21     No
41    Yes
Name: purchased, dtype: object

In [171]:
X_test

array([['Average', 'School'],
       ['Average', 'School'],
       ['Average', 'UG'],
       ['Poor', 'School'],
       ['Good', 'School'],
       ['Good', 'School'],
       ['Average', 'UG'],
       ['Good', 'UG'],
       ['Average', 'UG'],
       ['Poor', 'PG']], dtype=object)

In [172]:
Y_test

5     Yes
0      No
4      No
31    Yes
40     No
23     No
30     No
48    Yes
44     No
22    Yes
Name: purchased, dtype: object

In [173]:
from sklearn.preprocessing import OrdinalEncoder

In [174]:
# Ordinal encoder with the category order spelled out explicitly: one list
# per feature column, in column order, each listed from lowest to highest.
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],  # review
        ['School', 'UG', 'PG'],       # education
    ]
)

In [175]:
# Fit the encoder on the training features only; the test set is not used here.
oe.fit(X_train)

In [176]:
# Replace the training features with their ordinal codes.
# NOTE: this overwrites X_train, so re-running the cell would feed the
# already-encoded numbers back into the encoder.
X_train = oe.transform(X_train)

In [177]:
# Encode the test features with the encoder that was fitted on X_train.
# NOTE: this overwrites X_test in place, like the X_train cell.
X_test = oe.transform(X_test)

In [73]:
print(X_train)

[[1. 0.]
 [2. 2.]
 [1. 2.]
 [1. 2.]
 [2. 1.]
 [0. 1.]
 [1. 1.]
 [0. 2.]
 [2. 2.]
 [0. 1.]
 [0. 1.]
 [2. 1.]
 [0. 2.]
 [0. 1.]
 [0. 2.]
 [0. 2.]
 [1. 1.]
 [0. 0.]
 [2. 2.]
 [0. 2.]
 [2. 1.]
 [1. 1.]
 [2. 2.]
 [2. 2.]
 [2. 0.]
 [0. 0.]
 [2. 1.]
 [0. 0.]
 [2. 1.]
 [2. 0.]
 [2. 0.]
 [2. 0.]
 [1. 0.]
 [0. 2.]
 [0. 0.]
 [0. 2.]
 [1. 0.]
 [0. 2.]
 [1. 2.]
 [2. 2.]]


In [178]:
X_test

array([[1., 0.],
       [1., 0.],
       [1., 1.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [1., 1.],
       [2., 1.],
       [1., 1.],
       [0., 2.]])

In [179]:
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [180]:
from sklearn.preprocessing import LabelEncoder

In [181]:
# Encoder for the target labels (the `purchased` column).
le = LabelEncoder()

In [183]:
# Learn the set of label classes from the training targets only.
le.fit(Y_train)

In [184]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [185]:
# Convert the string targets to integer codes using the classes learned from
# Y_train. Both variables are overwritten in place, so re-running this cell
# would pass already-encoded integers back to the encoder.
Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

In [186]:
Y_train

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1])