## One Hot Encoding

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load only the 'Sex' column from titanic_train.csv
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Sex'],
)

In [4]:
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [5]:
# One-hot encoding: one indicator column per category of each
# string/categorical column in df (here only 'Sex').
# dtype=int pins the indicators to 0/1; pandas >= 2.0 would otherwise
# return True/False booleans by default.
pd.get_dummies(df, dtype=int)

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [7]:
# drop_first=True keeps k-1 indicator columns for k categories: the first
# level is removed because it is implied when all other indicators are 0.
# dtype=int pins the indicators to 0/1; pandas >= 2.0 would otherwise
# return True/False booleans by default.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [9]:
# Load only the 'Embarked' column from titanic_train.csv
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Embarked'],
)

In [11]:
# List the distinct values of the 'Embarked' column.
# unique() keeps NaN as a value, so missing entries show up here too.
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [15]:
# Drop every row that contains a NaN (df holds only 'Embarked' here).
# Reassign instead of using inplace=True so the step is an explicit
# "new frame from old frame" transformation.
df = df.dropna()

In [16]:
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [19]:
# One-hot encode 'Embarked', dropping the first level (k-1 indicator
# columns for k categories).
# dtype=int pins the indicators to 0/1; pandas >= 2.0 would otherwise
# return True/False booleans by default.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


## One hot encoding with many categories in a feature

In [20]:
# Load columns X0..X6 from 'Mercedez Benz.csv'
df = pd.read_csv(
    'Mercedez Benz.csv',
    usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [21]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [56]:
# Count how many times each category occurs in column 'X6'
# (value_counts() lists the most frequent category first)
df['X6'].value_counts()

g    1042
j    1039
d     625
i     488
l     478
a     206
h     190
k      43
c      38
b      28
f      20
e      12
Name: X6, dtype: int64

In [59]:
# Print, for every column, how many distinct values it holds
# (NaN, if present, is counted as one value)
for column_name in df.columns:
    print(df[column_name].nunique(dropna=False))

47
27
44
7
4
29
12


In [65]:
# Show the 10 most frequent categories of 'X1'.
# value_counts() already returns the counts sorted in descending order,
# so no extra sort_values() is needed before taking the head.
df['X1'].value_counts().head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [71]:
# Labels of the 10 most frequent 'X1' categories, most frequent first.
# value_counts() is already sorted descending, so head(10) is the top 10.
lst_10 = df['X1'].value_counts().head(10).index.tolist()

In [72]:
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [75]:
# One-hot encode only the top-10 categories of 'X1': each category gets a
# column named after it, holding 1 where 'X1' equals that category, else 0.
# Rows whose 'X1' is outside the top 10 end up with 0 in every new column.
for category in lst_10:
    df[category] = (df['X1'] == category).astype(int)

In [79]:
# Add the source column so it can be displayed next to its indicators.
# Guarded so that re-running this cell does not append 'X1' a second time,
# which would duplicate the column when selecting df[lst_10].
if 'X1' not in lst_10:
    lst_10.append('X1')

In [82]:
df[lst_10].head()

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v


## Ordinal Number Encoding

In [3]:
import datetime

In [4]:
# Current local date and time (naive datetime, no timezone attached)
today_date = datetime.datetime.now()

In [5]:
today_date

datetime.datetime(2021, 2, 15, 22, 57, 34, 189800)

In [6]:
# Previous day's date: subtract a one-day timedelta from today_date
today_date - datetime.timedelta(days=1)

datetime.datetime(2021, 2, 14, 22, 57, 34, 189800)

In [7]:
# Build 15 timestamps: today_date itself followed by the 14 preceding days
days = [
    today_date - datetime.timedelta(days=offset)
    for offset in range(15)
]

In [8]:
# Wrap the list of timestamps in a single-column DataFrame named "Day"
data = pd.DataFrame(days, columns=["Day"])

In [9]:
data.head()

Unnamed: 0,Day
0,2021-02-15 22:57:34.189800
1,2021-02-14 22:57:34.189800
2,2021-02-13 22:57:34.189800
3,2021-02-12 22:57:34.189800
4,2021-02-11 22:57:34.189800


In [10]:
# Derive the weekday name (e.g. 'Monday') of every timestamp in 'Day'
# through the .dt datetime accessor and store it in a new column
data["day_name"] = data['Day'].dt.day_name()

In [11]:
data.head()

Unnamed: 0,Day,day_name
0,2021-02-15 22:57:34.189800,Monday
1,2021-02-14 22:57:34.189800,Sunday
2,2021-02-13 22:57:34.189800,Saturday
3,2021-02-12 22:57:34.189800,Friday
4,2021-02-11 22:57:34.189800,Thursday


In [12]:
# Ordinal Number Encoding
dictionary = {'Monday':1,
             'Tuesday':2,
             'Wednesday':3,
             'Thursday':4,
             'Friday':5,
             'Saturday':6,
             'Sunday':7,
             }

In [13]:
 dictionary

{'Monday': 1,
 'Tuesday': 2,
 'Wednesday': 3,
 'Thursday': 4,
 'Friday': 5,
 'Saturday': 6,
 'Sunday': 7}

In [19]:
# Encode 'day_name' by looking each weekday name up in `dictionary`;
# a name that is not a key of the mapping would become NaN
data["weekday_ordinal"] = data["day_name"].map(dictionary)

In [20]:
data

Unnamed: 0,Day,day_name,weekday_ordinal
0,2021-02-15 22:57:34.189800,Monday,1
1,2021-02-14 22:57:34.189800,Sunday,7
2,2021-02-13 22:57:34.189800,Saturday,6
3,2021-02-12 22:57:34.189800,Friday,5
4,2021-02-11 22:57:34.189800,Thursday,4
5,2021-02-10 22:57:34.189800,Wednesday,3
6,2021-02-09 22:57:34.189800,Tuesday,2
7,2021-02-08 22:57:34.189800,Monday,1
8,2021-02-07 22:57:34.189800,Sunday,7
9,2021-02-06 22:57:34.189800,Saturday,6


## Count/ Frequency Encoding

In [16]:
# Reload columns X0..X6 from 'Mercedez Benz.csv' for this section
df = pd.read_csv(
    'Mercedez Benz.csv',
    usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [17]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [21]:
# Report the number of distinct values per column
# (NaN, if present, is counted as one value)
for feature in df.columns:
    n_labels = df[feature].nunique(dropna=False)
    print(feature, " :", n_labels, " labels")

X0  : 47  labels
X1  : 27  labels
X2  : 44  labels
X3  : 7  labels
X4  : 4  labels
X5  : 29  labels
X6  : 12  labels


In [25]:
# Build a {category: occurrence count} dictionary for column 'X0'.
# NOTE(review): a later cell overwrites df['X0'] with these counts, so this
# cell must run on the freshly loaded frame; re-running it afterwards would
# count the counts instead of the original categories.
column_map = df['X0'].value_counts().to_dict()

In [26]:
# Count / frequency encoding: replace every 'X0' category with the number of
# rows in which it occurs (looked up in column_map).
# NOTE(review): this overwrites 'X0' in place, so the cell is not idempotent.
# Running it a second time looks the counts themselves up in column_map,
# and values that are not keys of the mapping become NaN.
df['X0'] = df['X0'].map(column_map)

In [28]:
df.head(20)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,11,v,at,a,d,u,j
1,11,t,av,e,d,y,l
2,175,w,n,c,d,x,j
3,175,t,n,f,d,x,l
4,175,v,n,f,d,h,d
5,306,b,e,c,d,g,h
6,67,r,e,f,d,f,h
7,269,l,as,f,d,f,j
8,182,s,as,e,d,f,i
9,181,b,aq,c,d,f,a


## Target Guided Ordinal Encoding

In [69]:
# Load the 'Cabin' feature and the 'Survived' target from titanic_train.csv
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Cabin', 'Survived'],
)

In [70]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [71]:
# Handle missing values: label missing cabins with the string "Missing".
# The result is assigned back rather than calling fillna(inplace=True) on
# the df['Cabin'] selection: under pandas Copy-on-Write (the default from
# pandas 3.0) that in-place call acts on a temporary object and leaves df
# unchanged, so the NaNs would survive into the next step.
df['Cabin'] = df['Cabin'].fillna("Missing")

In [72]:
# Cast 'Cabin' to str and keep only the first character of each value
# (e.g. 'C85' -> 'C', 'Missing' -> 'M')
df['Cabin'] = df['Cabin'].astype(str).str.get(0)

In [73]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [74]:
# List the distinct cabin letters left after the truncation
df['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [75]:
# Mean of 'Survived' within each 'Cabin' category; these means are what
# the categories are ranked by in the following cells
df.groupby('Cabin')['Survived'].mean()

Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299854
T    0.000000
Name: Survived, dtype: float64

In [76]:
# Cabin categories ordered from lowest to highest mean of 'Survived'
df.groupby('Cabin')['Survived'].mean().sort_values().index

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [77]:
# Keep the cabin categories, sorted by ascending mean of 'Survived',
# as the ordering used for the ordinal labels
ordinal_labels = (
    df.groupby('Cabin')['Survived']
    .mean()
    .sort_values()
    .index
)
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [78]:
# Map each cabin category to its rank in ordinal_labels, starting at 0
# (0 = lowest mean of 'Survived', highest rank = highest mean)
ordinal_labels_2 = dict(zip(ordinal_labels, range(len(ordinal_labels))))

In [79]:
ordinal_labels_2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [80]:
# Target guided ordinal encoding: look each 'Cabin' category up in
# ordinal_labels_2 and store its rank in a new column
df["Cabin_Ordinal_Labels"] = df["Cabin"].map(ordinal_labels_2)

In [81]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_Labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


## Mean Encoding

In [87]:
# Mean encoding: build {cabin category: mean of 'Survived' in that category}.
# NOTE(review): the means are computed from every row currently in df; if the
# encoded column feeds a model, presumably they should come from the training
# split only - confirm.
mean_ordinal = (
    df.groupby('Cabin')['Survived']
    .mean()
    .to_dict()
)

In [88]:
# Replace each 'Cabin' category with its mean 'Survived' value from mean_ordinal
df['Mean_Ordinal_Cabin'] = df['Cabin'].map(mean_ordinal)

In [89]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_Ordinal_Labels,Mean_Ordinal_Cabin
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


## Probability Ratio Encoding

In [2]:
# Reload the 'Cabin' feature and 'Survived' target for this section
df = pd.read_csv(
    "titanic_train.csv",
    usecols=['Cabin', 'Survived'],
)

In [3]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [4]:
# Handle missing values: replace NaN cabins with the string "Missing"
# (assigned back to the column, so df is updated explicitly)
df["Cabin"] = df["Cabin"].fillna("Missing")

In [5]:
df.head(20)

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing
5,0,Missing
6,0,E46
7,0,Missing
8,1,Missing
9,1,Missing


In [6]:
# Reduce every cabin value to its first character
# (e.g. 'C85' -> 'C', 'Missing' -> 'M')
df["Cabin"] = df["Cabin"].astype(str).str.get(0)

In [7]:
df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [8]:
df["Cabin"].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [9]:
# Mean of 'Survived' per 'Cabin' category. 'Survived' is shown as 0/1 in the
# preview above, so this mean is the fraction of survivors in each category.
prob_df = df.groupby("Cabin")["Survived"].mean()

In [10]:
# Convert the per-category means (a Series) into a DataFrame so that
# further columns can be added next to 'Survived'
prob_df = pd.DataFrame(prob_df)

In [11]:
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [15]:
# make a died column form survived column
prob_df["Died"] = 1-prob_df["Survived"]

In [16]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [19]:
# Probability ratio per category: Survived / Died.
# NOTE(review): a category whose 'Died' value is 0 (everyone survived)
# divides by zero and yields inf - no guard is applied here.
prob_df["Probability_Ratio"] = prob_df["Survived"].div(prob_df["Died"])

In [20]:
prob_df.head()

Unnamed: 0_level_0,Survived,Died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [24]:
# Convert the ratios into a {cabin category: probability ratio} dictionary
# so they can be looked up with Series.map
prob_encoded = prob_df["Probability_Ratio"].to_dict()

In [28]:
# Apply the encoding: replace each 'Cabin' category with its probability
# ratio from prob_encoded and store the result in a new column
df["Cabin_Encoded"] = df["Cabin"].map(prob_encoded)

In [29]:
df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_Encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
