# One Hot Encoding

In [2]:
import pandas as pd
import numpy as np

In [16]:
# Load only the six categorical feature columns X1..X6 from the CSV file
data = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [51]:
# Show every column when a DataFrame is displayed (no column truncation)

pd.options.display.max_columns = None

In [52]:
# Preview call: its result is not shown because it is not the cell's last expression
data.head()
# Last expression of the cell, so the (rows, columns) tuple is what gets displayed.
# NOTE(review): the recorded output shows 26 columns although the frame is loaded
# with 6 -- presumably this cell was run after indicator columns had been added;
# confirm with Restart Kernel -> Run All.
data.shape

(4209, 26)

In [20]:
# Count the distinct labels held by each column

for col_name in data.columns:
    n_labels = len(data[col_name].unique())
    print(col_name, ':', n_labels, 'labels')

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [22]:
# How many columns would plain One Hot Encoding (every label) produce?
# With drop_first=True the original 6 columns grow to 117 columns.
(
    pd.get_dummies(data, drop_first=True)
    .shape
)

(4209, 117)

In [42]:
## KDD 2009 cup: One Hot Encoding restricted to the 10 most frequent labels

## Look at the 10 most frequent labels of column X1 (27 labels in total).
## This expression is not the last one in the cell, so nothing is displayed.
data['X1'].value_counts(ascending=False).head(10)

# Collect those 10 labels in a list (value_counts sorts by frequency, descending)
top_10 = list(data['X1'].value_counts().index[:10])

# Add one 0/1 indicator column per label.
# NOTE(review): these columns are named by the bare label (no "X1_" prefix),
# unlike the "<column>_<label>" names produced by one_hot_top10.
for label in top_10:
    data[label] = np.where(data['X1'] == label, 1, 0)

data[['X1'] + top_10].head(10)

Unnamed: 0,X1,aa,s,b,l,v,r,i,a,c,o
0,v,0,0,0,0,1,0,0,0,0,0
1,t,0,0,0,0,0,0,0,0,0,0
2,w,0,0,0,0,0,0,0,0,0,0
3,t,0,0,0,0,0,0,0,0,0,0
4,v,0,0,0,0,1,0,0,0,0,0
5,b,0,0,1,0,0,0,0,0,0,0
6,r,0,0,0,0,0,1,0,0,0,0
7,l,0,0,0,1,0,0,0,0,0,0
8,s,0,1,0,0,0,0,0,0,0,0
9,b,0,0,1,0,0,0,0,0,0,0


In [44]:
## Now do this for all 6 columns

def one_hot_top10(df, column, top_10):
    """Add a 0/1 indicator column to ``df`` for each label in ``top_10``.

    For every ``label`` a new column named ``"<column>_<label>"`` is written
    into ``df``; it holds 1 where ``df[column] == label`` and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is read from and modified in place.
    column : str
        Name of the column in ``df`` whose labels are encoded.
    top_10 : iterable
        Labels to encode (the callers pass the 10 most frequent ones).

    Returns
    -------
    None
        The frame is mutated in place; nothing is returned.
    """
    # Read from the frame that was passed in, not from a notebook-level global,
    # so the helper works on any DataFrame.
    source = df[column]
    for label in top_10:
        # str() keeps the column-name construction valid for non-string labels
        df[column + "_" + str(label)] = np.where(source == label, 1, 0)
    
    
# Now find the top 10 labels of each column and call the function

In [49]:
# X2 comes next, because X1 has already been encoded directly

top_10 = list(data['X2'].value_counts().index[:10])

one_hot_top10(data, 'X2', top_10)

In [53]:
# Display the frame to inspect the indicator columns added so far
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [57]:
# X3: encode its most frequent labels (at most 10)

top_10 = list(data['X3'].value_counts().index[:10])

one_hot_top10(data, 'X3', top_10)
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [60]:
# X4: encode its most frequent labels (at most 10)

top_10 = list(data['X4'].value_counts().index[:10])

one_hot_top10(data, 'X4', top_10)
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_b,X4_c
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0


In [61]:
## X5: encode its 10 most frequent labels

top_10 = list(data['X5'].value_counts().index[:10])

one_hot_top10(data, 'X5', top_10)
data


Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_b,X4_c,X5_w,X5_v,X5_q,X5_r,X5_s,X5_d,X5_n,X5_p,X5_m,X5_i
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [62]:
## X6: encode its 10 most frequent labels

top_10 = list(data['X6'].value_counts().index[:10])

one_hot_top10(data, 'X6', top_10)
data

Unnamed: 0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e,X3_c,X3_f,X3_a,X3_d,X3_g,X3_e,X3_b,X4_d,X4_a,X4_b,X4_c,X5_w,X5_v,X5_q,X5_r,X5_s,X5_d,X5_n,X5_p,X5_m,X5_i,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,s,as,c,d,aa,d,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4205,o,t,d,d,aa,h,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4206,v,r,a,d,aa,g,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4207,r,e,f,d,aa,l,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
# Summary: all 6 categorical features (X1 to X6) have now been One Hot Encoded using only their 10 most frequent labels.