## One Hot Encoding - variables with many categories

In [22]:
import pandas as pd
import numpy as np

# Load only the six feature columns X1..X6 used in this walkthrough.
data = pd.read_csv(
    "../../../data/mercedes-benz/train.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [23]:
# Cardinality check: number of distinct labels held by each column
for col, values in data.items():
    n_labels = len(values.unique())
    print(f"{col} : {n_labels} labels")

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [24]:
# Sum of distinct labels over all columns; compare with the number of
# columns pd.get_dummies(data) produces in the next cell.
labels_per_column = [len(data[name].unique()) for name in data.columns]
np.sum(labels_per_column)

123

In [25]:
# Plain one-hot encoding of every column: one dummy column per label.
dummies_all = pd.get_dummies(data)
dummies_all.shape

(4209, 123)

In [26]:
# Same encoding, but dropping the first level of each encoded column.
dummies_drop_first = pd.get_dummies(data, drop_first=True)
dummies_drop_first.shape

(4209, 117)

* This technique makes one binary variable for each of the 10 most frequent labels only; rows holding any other label get 0 in all of those variables.

In [27]:
# Inspect the 20 most frequent categories of the X2 column
# (the 10 most frequent ones are the labels encoded further down)
x2_counts = data["X2"].value_counts().sort_values(ascending=False)
x2_counts.head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [28]:
# Labels of the 10 most frequent X2 categories, as a plain Python list
x2_by_frequency = data["X2"].value_counts().sort_values(ascending=False)
top_10 = x2_by_frequency.head(10).index.tolist()
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [29]:
# Add one 0/1 indicator column per frequent label, named after the label
# itself. Rows whose X2 value is not in top_10 get 0 in every indicator.
for label in top_10:
    is_label = data["X2"].eq(label)
    data[label] = np.where(is_label, 1, 0)

# Show the source column next to its indicator columns.
data.loc[:, ["X2", *top_10]].head(20)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [30]:
def one_hot_top_x(df, variable, top_x_labels):
    """Add 0/1 indicator columns to ``df`` for selected labels of one column.

    For every label in ``top_x_labels`` a new column named
    ``"<variable>_<label>"`` is written into ``df``. It holds 1 where
    ``df[variable]`` equals the label and 0 elsewhere, so rows whose value is
    not among the given labels end up with 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    variable : str
        Name of the column in ``df`` whose labels are encoded.
    top_x_labels : iterable
        Labels to create indicator columns for (e.g. the most frequent ones).

    Returns
    -------
    None
        The result is the mutation of ``df``.

    Raises
    ------
    KeyError
        If ``variable`` is not a column of ``df``.
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in, not a notebook-level
        # global, so the function works for any DataFrame.
        # The f-string also lets non-string labels (e.g. integers) be used.
        df[f"{variable}_{label}"] = np.where(df[variable] == label, 1, 0)

# Reload the six raw columns, which discards the un-prefixed indicator
# columns added to the previous frame, then add the prefixed X2 indicators.
data = pd.read_csv(
    "../../../data/mercedes-benz/train.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)

one_hot_top_x(df=data, variable="X2", top_x_labels=top_10)

data.head(15)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
5,b,e,c,d,g,h,0,0,0,0,0,0,0,0,0,1
6,r,e,f,d,f,h,0,0,0,0,0,0,0,0,0,1
7,l,as,f,d,f,j,1,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,1,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,0,0,0,0,0,0


In [45]:
# Rank the X1 labels by frequency using the frame loaded earlier and keep
# the ten most common ones. Note this happens before the reload below.
x1_by_frequency = data["X1"].value_counts().sort_values(ascending=False)
top_10 = x1_by_frequency.head(10).index.to_list()

# Reload the raw columns (X6 is not read this time).
data = pd.read_csv(
    "../../../data/mercedes-benz/train.csv",
    usecols=["X1", "X2", "X3", "X4", "X5"],
)

# One prefixed 0/1 indicator column per frequent X1 label.
for label in top_10:
    is_label = data["X1"].eq(label)
    data["X1_" + label] = np.where(is_label, 1, 0)

data.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,0,0,0,0,1,0,0,0,0,0
5,b,e,c,d,g,0,0,1,0,0,0,0,0,0,0
6,r,e,f,d,f,0,0,0,0,0,1,0,0,0,0
7,l,as,f,d,f,0,0,0,1,0,0,0,0,0,0
8,s,as,e,d,f,0,1,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,0,0,1,0,0,0,0,0,0,0


1. Nominal Encoding
    * One hot Encoding
    * One hot Encoding with many categories (dummies for the most frequent labels only)
    * Mean Encoding 
2. Ordinal Encoding: 
    * Label Encoding
    * Target guided Ordinal Encoding