# One-Hot Encoding for Variables with Many Categories

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the six categorical feature columns (X1-X6) from the Mercedez Benz CSV file
data = pd.read_csv('Mercedez Benz.csv',usecols = ['X1','X2','X3','X4','X5','X6'])

In [3]:
# Preview the first rows of the loaded features
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# Report the number of distinct categories (labels) found in each feature
for i in data.columns:
    print(i, ':', data[i].nunique(dropna=False), 'Labels')

X1 : 27 Labels
X2 : 44 Labels
X3 : 7 Labels
X4 : 4 Labels
X5 : 29 Labels
X6 : 12 Labels


In [5]:
# Shape of the data if every category were one-hot encoded: get_dummies creates
# one dummy column per category (drop_first=True drops the first level of each feature)
pd.get_dummies(data, drop_first = True).shape

(4209, 117)

In [6]:
# Show the 20 most frequent categories of the X2 column together with their counts
data['X2'].value_counts().head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [7]:
# Collect the 10 most frequent X2 categories into a plain Python list
top_10 = data['X2'].value_counts().index[:10].tolist()
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [8]:
# Add one indicator column per top-10 category: 1 where X2 equals it, 0 otherwise
for label in top_10:
    data[label] = np.where(data['X2'].eq(label), 1, 0)
data[['X2', *top_10]].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [9]:
def one_hot_top_10(data, column, top_x_labels):
    """One-hot encode only the given labels of one categorical column.

    For every label in ``top_x_labels`` a new column named
    ``<column>_<label>`` is added to ``data``. It holds 1 in the rows where
    ``data[column]`` equals that label and 0 everywhere else, so rows whose
    category is not listed get 0 in all of the new columns.

    Args:
        data: DataFrame to encode. It is modified in place.
        column: Name of the categorical column to encode.
        top_x_labels: Iterable of category values to create indicator
            columns for (typically the most frequent ones).

    Returns:
        None. The indicator columns are added to ``data`` directly.
    """
    for label in top_x_labels:
        # Build the name by formatting rather than '+' concatenation so that
        # labels which are not strings (e.g. integer codes) do not raise
        # TypeError; for string labels the resulting name is unchanged.
        new_column = '{}_{}'.format(column, label)
        data[new_column] = np.where(data[column] == label, 1, 0)
    
    
    
# Reload the data so the columns added to the previous frame are discarded
data = pd.read_csv(
    'Mercedez Benz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode the X2 column using the top-10 labels collected earlier
one_hot_top_10(data=data, column='X2', top_x_labels=top_10)

# Show the first five rows
data.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [10]:
# Reload the data, then one-hot encode the 10 most frequent categories
# of each of the 6 columns

cols = ['X1','X2','X3','X4','X5','X6']
data = pd.read_csv('Mercedez Benz.csv', usecols=cols)
for columns in cols:
    # Labels of the (up to) 10 most frequent categories in this column
    top_10 = data[columns].value_counts().index[:10].tolist()
    one_hot_top_10(data, columns, top_10)

In [11]:
# Preview the first rows, which now include the added indicator columns
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
