### One Hot Encoding - variables with many categories

#https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data

In [1]:
import pandas as pd
import numpy as np

In [20]:
df = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [21]:
for col in df.columns:
    print(col, 'has', len(df[col].unique()), ' unique labels')

X1 has 27  unique labels
X2 has 44  unique labels
X3 has 7  unique labels
X4 has 4  unique labels
X5 has 29  unique labels
X6 has 12  unique labels


In [22]:
df.shape

(4209, 6)

In [23]:
pd.get_dummies(df, drop_first=True).shape

(4209, 117)

With just 6 categorical features we are getting 117 features with the help of one hot encoding but are these all features useful? More number of features can increase of cost function and can cause in overfitting as well.
What can we do instead?

http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf In the winning solution of the KDD 2009 cup: "Winning the KDD Cup Orange Challenge with Ensemble

The Team suggested using 10 most frequent labels convert them into dummy variables using onehotencoding

In [28]:
df.X2.value_counts().head(10)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [29]:
top_10 = [i for i in df.X2.value_counts().head(10).index]
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [33]:
for label in top_10:
    df[label] = np.where(df['X2']==label, 1, 0)
df[['X2']+top_10].head(10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [34]:
# function to create the dummy variables for the most frequent labels
def one_hot_encoding_top_x(df, variable, top_x_labels):    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(df[variable]==label, 1, 0)

In [38]:
data = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

one_hot_encoding_top_x(data, 'X2', top_10)
data.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0
5,b,e,c,d,g,h,0,0,0,0,0,0,0,0,0,1
6,r,e,f,d,f,h,0,0,0,0,0,0,0,0,0,1
7,l,as,f,d,f,j,1,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,1,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,0,0,0,0,0,0,0,0


**pandas.get_dummies-** Used when we need to need all categories of that feature/column

In [3]:
import pandas as pd
df_car=pd.read_csv('carprices.csv')
df_car

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [5]:
dummies = pd.get_dummies(df_car['Car Model'], drop_first=True)
dummies

Unnamed: 0,BMW X5,Mercedez Benz C class
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,1


In [6]:
merged = pd.concat([df_car,dummies],axis='columns')

In [7]:
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,1,0
1,BMW X5,35000,34000,3,1,0
2,BMW X5,57000,26100,5,1,0
3,BMW X5,22500,40000,2,1,0
4,BMW X5,46000,31500,4,1,0
5,Audi A5,59000,29400,5,0,0
6,Audi A5,52000,32000,5,0,0
7,Audi A5,72000,19300,6,0,0
8,Audi A5,91000,12000,8,0,0
9,Mercedez Benz C class,67000,22000,6,0,1


In [9]:
merged.drop('Car Model', axis='columns', inplace=True)
merged

Unnamed: 0,Mileage,Sell Price($),Age(yrs),BMW X5,Mercedez Benz C class
0,69000,18000,6,1,0
1,35000,34000,3,1,0
2,57000,26100,5,1,0
3,22500,40000,2,1,0
4,46000,31500,4,1,0
5,59000,29400,5,0,0
6,52000,32000,5,0,0
7,72000,19300,6,0,0
8,91000,12000,8,0,0
9,67000,22000,6,0,1


In [10]:
X=merged.drop('Sell Price($)',axis='columns')
X.head()

Unnamed: 0,Mileage,Age(yrs),BMW X5,Mercedez Benz C class
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0


In [11]:
y=merged['Sell Price($)']
y.head()

0    18000
1    34000
2    26100
3    40000
4    31500
Name: Sell Price($), dtype: int64

In [12]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [13]:
model.fit(X,y)

LinearRegression()

In [14]:
model.score(X,y)

0.9417050937281083

In [16]:
#Price of mercedez benz that is 4 yr old with mileage 45000model.predict([[45000,4,0,0]])
model.predict([[45000,4,0,0]])

array([34537.77647335])