# Encoding Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset; the relative path is resolved against the working directory.
df = pd.read_csv('customer.csv')

In [3]:
# Peek at five random rows. The fixed seed keeps this preview identical on
# every run, so the displayed rows survive a Restart & Run All.
df.sample(5, random_state=0)

Unnamed: 0,age,gender,review,education,purchased
46,64,Female,Poor,PG,No
37,94,Male,Average,PG,Yes
20,57,Female,Average,School,Yes
10,98,Female,Good,UG,Yes
36,34,Female,Good,UG,Yes


In [4]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        50 non-null     int64 
 1   gender     50 non-null     object
 2   review     50 non-null     object
 3   education  50 non-null     object
 4   purchased  50 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [5]:
# Keep only the two categorical features (review, education) and the target
# (purchased). Selecting by name instead of by position keeps this cell
# idempotent: re-running it returns the same three columns rather than
# slicing two more columns off the already-reduced frame.
df = df[['review', 'education', 'purchased']]

In [6]:
# Display the reduced frame to confirm which columns remain.
df

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
# The feature matrix is selected by column name and must NOT include
# 'purchased': passing the whole df as X would leak the target into the
# features.
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'education']],
    df['purchased'],
    test_size=0.20,
    random_state=0,
)

In [8]:
# Inspect the training features (still categorical strings at this point).
X_train

Unnamed: 0,review,education
33,Good,PG
35,Poor,School
26,Poor,PG
34,Average,School
18,Good,School
7,Poor,School
14,Poor,PG
45,Poor,PG
48,Good,UG
29,Average,UG


In [9]:
# Inspect the training labels (still 'Yes'/'No' strings at this point).
y_train

33    Yes
35    Yes
26     No
34     No
18     No
7     Yes
14    Yes
45    Yes
48    Yes
29    Yes
15     No
30     No
32    Yes
16    Yes
42    Yes
20    Yes
43     No
8      No
13     No
25     No
5     Yes
17    Yes
40     No
49     No
1      No
12     No
37    Yes
24    Yes
6      No
23     No
36    Yes
21     No
19    Yes
9     Yes
39     No
46     No
3      No
0      No
47    Yes
44     No
Name: purchased, dtype: object

In [10]:

from sklearn.preprocessing import OrdinalEncoder

In [11]:
# List each feature's categories explicitly, lowest rank first, so the integer
# codes follow that order. A bare OrdinalEncoder() would instead determine the
# categories from the data it is fitted on.
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],  # review
        ['School', 'UG', 'PG'],       # education
    ]
)

In [91]:
# Display the encoder's repr to check its configuration.
oe

In [12]:
# Fit the encoder on the training features only; the test features are only transformed.
oe.fit(X_train)

In [13]:
# Encode both splits with the encoder that was fitted on X_train.
# transform() returns plain NumPy arrays, so column names and row labels are dropped.
X_train_name = oe.transform(X_train)
X_test_name = oe.transform(X_test)

In [14]:
# Encoded training features: column 0 is review, column 1 is education.
X_train_name

array([[2., 2.],
       [0., 0.],
       [0., 2.],
       [1., 0.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [0., 2.],
       [2., 1.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [1., 1.],
       [0., 1.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [1., 1.],
       [1., 0.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [2., 0.],
       [2., 1.],
       [0., 1.],
       [0., 0.],
       [1., 2.],
       [1., 2.],
       [2., 0.],
       [2., 0.],
       [2., 1.],
       [1., 2.],
       [0., 2.],
       [2., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [2., 2.],
       [1., 1.]])

In [15]:
# Encoded test features: column 0 is review, column 1 is education.
X_test_name

array([[0., 0.],
       [2., 1.],
       [2., 1.],
       [2., 2.],
       [2., 2.],
       [0., 2.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [1., 1.]])

In [16]:
# Category order used for each feature; a category's position in its array is its integer code.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [17]:
# Re-display the original training features to compare against the encoded values.
X_train

Unnamed: 0,review,education
33,Good,PG
35,Poor,School
26,Poor,PG
34,Average,School
18,Good,School
7,Poor,School
14,Poor,PG
45,Poor,PG
48,Good,UG
29,Average,UG


In [18]:
#Task
X_train_name = pd.DataFrame(X_train_name, columns=['review','education'])
X_test_name= pd.DataFrame(X_test_name, columns=['review','education'])

In [19]:
# Encoded training features as a labelled DataFrame.
X_train_name

Unnamed: 0,review,education
0,2.0,2.0
1,0.0,0.0
2,0.0,2.0
3,1.0,0.0
4,2.0,0.0
5,0.0,0.0
6,0.0,2.0
7,0.0,2.0
8,2.0,1.0
9,1.0,1.0


In [20]:
# Original test features, for comparison with the encoded version.
X_test

Unnamed: 0,review,education
28,Poor,School
11,Good,UG
10,Good,UG
41,Good,PG
2,Good,PG
27,Poor,PG
38,Good,School
31,Poor,School
22,Poor,PG
4,Average,UG


In [21]:
# Encoded test features as a labelled DataFrame.
X_test_name

Unnamed: 0,review,education
0,0.0,0.0
1,2.0,1.0
2,2.0,1.0
3,2.0,2.0
4,2.0,2.0
5,0.0,2.0
6,2.0,0.0
7,0.0,0.0
8,0.0,2.0
9,1.0,1.0


In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Encoder for the target column; it is fitted on y_train further down.
le = LabelEncoder()

In [24]:
# Learn the label classes from the training target only.
le.fit(y_train)

In [25]:
# Learned classes; a class's position in this array is its integer code.
le.classes_

array(['No', 'Yes'], dtype=object)

In [26]:
# Replace the string labels with their integer codes (0 = 'No', 1 = 'Yes').
# NOTE(review): this rebinds y_train / y_test, so the cell is not re-runnable:
# a second run would pass integers to an encoder fitted on 'No'/'Yes', which
# fails on the unseen labels. Re-run from the train/test split cell to
# regenerate the string labels first.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [27]:
# Encoded training labels (0 = 'No', 1 = 'Yes').
y_train

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0])

In [28]:
#Task 
y_train_name = pd.DataFrame(y_train,columns=['class'])
y_test_name= pd.DataFrame(y_test,columns=['class'])

In [65]:
# Encoded training labels as a DataFrame.
y_train_name

Unnamed: 0,class
0,1
1,1
2,0
3,0
4,0
5,1
6,1
7,1
8,1
9,1


In [30]:
# Encoded test labels as a DataFrame.
y_test_name

Unnamed: 0,class
0,0
1,1
2,1
3,1
4,0
5,0
6,0
7,1
8,1
9,0
