### One Hot Encoding - variables with many categories

#### 1. Importing libraries and loading the data

In [1]:
import numpy as np
import pandas as pd

# Read only the two categorical columns X1 and X2 from the CSV.
df = pd.read_csv('mercedesbenz.csv', usecols=['X1','X2'])
# Preview the last five rows.
df.tail()

Unnamed: 0,X1,X2
4204,s,as
4205,o,t
4206,v,r
4207,r,e
4208,r,ae


In [2]:
# Print the distinct labels of every column, one array per column.
for column_name in df.columns:
    distinct_labels = df[column_name].unique()
    print(distinct_labels)

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']


In [3]:
# Number of distinct labels in X1 and X2, in that order.
tuple(len(df[column_name].unique()) for column_name in ('X1', 'X2'))

(27, 44)

In [4]:
# Shape after one-hot encoding both columns with the first level of each dropped.
pd.get_dummies(df, drop_first=True).shape

(4209, 69)

In [5]:
# Ten most frequent X2 labels with their row counts, highest count first.
df['X2'].value_counts().sort_values(ascending=False).head(10)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [6]:
# Names of the ten most frequent X2 labels, most common first.
top_10_labels = (
    df['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [7]:
def one_hot_encoding_top_x(df,variable, top_x_labels):
    """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is written to
    ``df``; it holds 1 where ``df[variable]`` equals the label and 0 elsewhere.
    Rows whose value is not among ``top_x_labels`` therefore get 0 in every
    indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    variable : hashable
        Name of the column in ``df`` to encode.
    top_x_labels : iterable
        Category values to create indicator columns for.

    Returns
    -------
    None
        The new columns are added to ``df`` directly.
    """
    for label in top_x_labels:
        # str() keeps the column-name construction working when the column
        # name or the category values are not strings (e.g. integer codes).
        indicator_name = str(variable) + '_' + str(label)
        df[indicator_name] = np.where(df[variable] == label, 1, 0)

In [8]:
# Add one 0/1 indicator column per top-10 X2 label; df is modified in place.
one_hot_encoding_top_x(df,'X2',top_10_labels)
# Rows whose X2 label is outside the top 10 (e.g. 'at', 'av') get 0 in every indicator.
df.head()

Unnamed: 0,X1,X2,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,1,0,0,0
3,t,n,0,0,0,0,0,0,1,0,0,0
4,v,n,0,0,0,0,0,0,1,0,0,0


# Count or Frequency Encoding

In [9]:
# Reload the two raw columns, discarding the indicator columns added earlier.
df = pd.read_csv('mercedesbenz.csv', usecols=['X1','X2'])

In [10]:
# Sanity check: only the two raw columns are present again.
df.shape

(4209, 2)

In [11]:
# Baseline for comparison: full one-hot encoding creates one column per distinct label.
pd.get_dummies(df).shape

(4209, 71)

In [12]:
# Row count of every X2 label as a plain {label: count} dictionary.
df['X2'].value_counts().to_dict()

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'i': 25,
 'k': 25,
 'b': 21,
 'ao': 20,
 'ag': 19,
 'z': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'ap': 11,
 'y': 11,
 'x': 10,
 'aw': 8,
 'at': 6,
 'h': 6,
 'al': 5,
 'q': 5,
 'an': 5,
 'av': 4,
 'ah': 4,
 'p': 4,
 'au': 3,
 'j': 1,
 'aa': 1,
 'o': 1,
 'af': 1,
 'ar': 1,
 'c': 1,
 'l': 1,
 'am': 1}

In [13]:
# Keep the {label: count} lookup so X2 can be replaced by its frequencies.
df_frequency_map = df['X2'].value_counts().to_dict()

In [14]:
# X2 before encoding, for comparison with the mapped result.
df.head(10)

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n
5,b,e
6,r,e
7,l,as
8,s,as
9,b,aq


In [15]:
# Replace every X2 label with the number of rows that carry that label.
# This overwrites X2, so reload df before running the cell a second time.
label_counts = df['X2'].map(df_frequency_map)
df['X2'] = label_counts
df.head(10)

Unnamed: 0,X1,X2
0,v,6
1,t,4
2,w,137
3,t,137
4,v,137
5,b,81
6,r,81
7,l,1659
8,s,1659
9,b,63


# Ordinal Encoding

In [16]:
import pandas as pd
import datetime

In [17]:
# Build a 20-row frame of timestamps: the current moment, then one day earlier per row.
# The values depend on when the cell is run.
df_base = datetime.datetime.today()
df_date_list = [
    df_base - datetime.timedelta(days=days_back)
    for days_back in range(20)
]
df = pd.DataFrame(df_date_list, columns=['Day'])
df

Unnamed: 0,Day
0,2021-05-27 15:27:11.204521
1,2021-05-26 15:27:11.204521
2,2021-05-25 15:27:11.204521
3,2021-05-24 15:27:11.204521
4,2021-05-23 15:27:11.204521
5,2021-05-22 15:27:11.204521
6,2021-05-21 15:27:11.204521
7,2021-05-20 15:27:11.204521
8,2021-05-19 15:27:11.204521
9,2021-05-18 15:27:11.204521


In [18]:
# Inspect the 20 day offsets (0 through 19 days) produced by this comprehension.
[datetime.timedelta(days=x) for x in range(0,20)]

[datetime.timedelta(0),
 datetime.timedelta(1),
 datetime.timedelta(2),
 datetime.timedelta(3),
 datetime.timedelta(4),
 datetime.timedelta(5),
 datetime.timedelta(6),
 datetime.timedelta(7),
 datetime.timedelta(8),
 datetime.timedelta(9),
 datetime.timedelta(10),
 datetime.timedelta(11),
 datetime.timedelta(12),
 datetime.timedelta(13),
 datetime.timedelta(14),
 datetime.timedelta(15),
 datetime.timedelta(16),
 datetime.timedelta(17),
 datetime.timedelta(18),
 datetime.timedelta(19)]

In [19]:
# Derive the weekday name of each timestamp.
# day_name() returns English names by default, whereas strftime("%A") follows
# the process locale; the English-keyed weekday lookup applied to this column
# only matches English names.
df['day_of_week'] = df['Day'].dt.day_name()
df.head()

Unnamed: 0,Day,day_of_week
0,2021-05-27 15:27:11.204521,Thursday
1,2021-05-26 15:27:11.204521,Wednesday
2,2021-05-25 15:27:11.204521,Tuesday
3,2021-05-24 15:27:11.204521,Monday
4,2021-05-23 15:27:11.204521,Sunday


In [20]:
# Ordinal encoding: weekday names map to 1 (Monday) through 7 (Sunday).
weekday_map = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    range(1, 8),
))
day_names = df['day_of_week']
df['day_ordinal'] = day_names.map(weekday_map)
df.head(10)

Unnamed: 0,Day,day_of_week,day_ordinal
0,2021-05-27 15:27:11.204521,Thursday,4
1,2021-05-26 15:27:11.204521,Wednesday,3
2,2021-05-25 15:27:11.204521,Tuesday,2
3,2021-05-24 15:27:11.204521,Monday,1
4,2021-05-23 15:27:11.204521,Sunday,7
5,2021-05-22 15:27:11.204521,Saturday,6
6,2021-05-21 15:27:11.204521,Friday,5
7,2021-05-20 15:27:11.204521,Thursday,4
8,2021-05-19 15:27:11.204521,Wednesday,3
9,2021-05-18 15:27:11.204521,Tuesday,2
