### One Hot Encoding - variables with many categories

Data source: https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data

In [1]:
import pandas as pd
import numpy as np

In [20]:
# Load only the six categorical columns used in this walkthrough.
categorical_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
df = pd.read_csv('mercedesbenz.csv', usecols=categorical_columns)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [21]:
# Number of distinct labels per column (dropna=False matches len(unique())).
label_counts = df.nunique(dropna=False)
for col, n_labels in label_counts.items():
    print(col, 'has', n_labels, ' unique labels')

X1 has 27  unique labels
X2 has 44  unique labels
X3 has 7  unique labels
X4 has 4  unique labels
X5 has 29  unique labels
X6 has 12  unique labels


In [22]:
# (rows, columns) of the loaded frame, before any dummy columns are added.
df.shape

(4209, 6)

In [23]:
# One-hot encode every column (dropping the first level of each) and
# look at how wide the resulting frame becomes.
dummies_all = pd.get_dummies(df, drop_first=True)
dummies_all.shape

(4209, 117)

With just 6 categorical features, one-hot encoding already produces 117 features, but are all of these features useful? More features increase the computational cost of training and can also cause overfitting.
What can we do instead?

In the winning solution of the KDD 2009 cup, "Winning the KDD Cup Orange Challenge with Ensemble Selection" (http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf):

The team suggested taking the 10 most frequent labels of a variable and converting only those into dummy variables using one-hot encoding.

In [28]:
# Ten most frequent X2 labels with their row counts.
df['X2'].value_counts().iloc[:10]

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [29]:
# Keep just the label names of the ten most frequent X2 values.
top_10 = df['X2'].value_counts().head(10).index.tolist()
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [33]:
# Add one 0/1 column per frequent label: 1 where X2 equals that label.
for label in top_10:
    df[label] = (df['X2'] == label).astype(int)
df[['X2', *top_10]].head(10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [34]:
# function to create the dummy variables for the most frequent labels
def one_hot_encoding_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column to ``df`` per label in ``top_x_labels``.

    For every label a column named ``<variable>_<label>`` is written into
    ``df``. It holds 1 on the rows where ``df[variable]`` equals the label
    and 0 on all other rows, so a row whose value is not among
    ``top_x_labels`` gets 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place; nothing is returned.
    variable : hashable
        Name of the column in ``df`` to encode.
    top_x_labels : iterable
        Labels for which indicator columns are created.
    """
    for label in top_x_labels:
        # str() keeps the naming working for non-string labels / column
        # names (e.g. integer-coded categories), where `+` would raise
        # TypeError. For string inputs the resulting name is unchanged.
        indicator_name = str(variable) + '_' + str(label)
        df[indicator_name] = np.where(df[variable] == label, 1, 0)

In [38]:
# Re-read the same six columns into a separate frame for the helper demo.
raw_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('mercedesbenz.csv', usecols=raw_columns)

# The helper writes the X2_<label> indicator columns into `data` itself.
one_hot_encoding_top_x(data, 'X2', top_10)
data.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
5,b,e,c,d,g,h,0,0,0,0,0,0,0,0,0,1
6,r,e,f,d,f,h,0,0,0,0,0,0,0,0,0,1
7,l,as,f,d,f,j,1,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,1,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,0,0,0,0,0,0
