## Handling Categorical Features

In [1]:
import pandas as pd

In [2]:
# Load only the 'Sex' column of the Titanic training data.
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Sex'],
)

In [3]:
# Preview the first five rows of the loaded 'Sex' column.
df.head(5)

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


### One Hot Encoding

In [4]:
# One-hot encode df: one indicator column is created per category of 'Sex'.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to
# boolean (True/False) dummies, which would not match the 0/1 table displayed
# for this cell.
pd.get_dummies(df, dtype=int).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [5]:
# One-hot encode df but drop the first category's column: with k categories
# only k-1 indicator columns are needed, the dropped one is implied when all
# remaining indicators are 0.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to
# boolean (True/False) dummies, which would not match the 0/1 table displayed
# for this cell.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
# Load only the 'Embarked' column of the Titanic training data.
df1 = pd.read_csv(
    'titanic_train.csv',
    usecols=['Embarked'],
)

In [7]:
# One-hot encode df1: one indicator column is created per category of 'Embarked'.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to
# boolean (True/False) dummies, which would not match the 0/1 table displayed
# for this cell.
pd.get_dummies(df1, dtype=int).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [8]:
# Load the categorical columns X0..X6 of the Mercedes-Benz data set.
df2 = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

In [9]:
# Preview the first five rows of the X0..X6 columns.
df2.head(5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


### Ordinal Number Encoding

#### Label Encoding

In [10]:
# Load the days data; index_col=None keeps every CSV column as a data column
# and uses the default integer row index.
df3 = pd.read_csv(
    'days.csv',
    index_col=None,
)

In [11]:
# Preview the first five rows: each weekday name sits next to its ordinal number.
df3.head(5)

Unnamed: 0,day,day_of_week,day_ordinal
0,2021-02-09 21:10:32.129658,Tuesday,2
1,2021-02-08 21:10:32.129658,Monday,1
2,2021-02-07 21:10:32.129658,Sunday,7
3,2021-02-06 21:10:32.129658,Saturday,6
4,2021-02-05 21:10:32.129658,Friday,5


#### Count or Frequency Encoding

In [12]:
# Load the categorical columns of the adult data set.
df4 = pd.read_csv(
    'adult_data.csv',
    usecols=[
        'Employment',
        'Degree',
        'Status',
        'Designation',
        'Family',
        'Race',
        'Sex',
        'Country',
    ],
)

In [13]:
# Preview the first five rows of the categorical columns.
df4.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,Family,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [14]:
# Print the number of distinct labels in each column to judge its cardinality
# (high-cardinality columns are poor candidates for one-hot encoding).
for feature in df4.columns:
    print(feature, ':', len(df4[feature].unique()), 'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
Family : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [15]:
df4['Country_ordinal'] = df4['Country'].map(df4['Country'].value_counts().to_dict())

In [16]:
# Preview the first five rows, now including the count-encoded country column.
df4.head(5)

Unnamed: 0,Employment,Degree,Status,Designation,Family,Race,Sex,Country,Country_ordinal
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,95


#### Target Guided Ordinal Encoding
1. Order the labels according to the mean of the target for each label
2. Replace each label with its rank in that ordering, so a higher rank means a higher probability of the target being 1

In [17]:
# Load the categorical feature 'Cabin' together with the target 'Survived'.
df5 = pd.read_csv(
    'titanic_train.csv',
    usecols=['Cabin', 'Survived'],
)

In [18]:
# Replace missing cabin values with the placeholder 'Missing'.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: that form is chained assignment, which raises a
# FutureWarning on pandas 2.x and, under Copy-on-Write (the default from
# pandas 3.0), only modifies a temporary and leaves df5 unchanged.
df5['Cabin'] = df5['Cabin'].fillna('Missing')

In [19]:
# Reduce every cabin value to its first character, so the many individual
# cabin strings collapse into a handful of single-letter categories.
df5['Cabin'] = df5.Cabin.astype(str).str[0]

In [20]:
# Preview the first five rows after reducing 'Cabin' to a single letter.
df5.head(5)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [21]:
# List the distinct cabin letters in order of first appearance.
df5['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [22]:
# Cabin letters ordered from the lowest to the highest mean of 'Survived'.
(
    df5.groupby(['Cabin'])['Survived']
    .mean()
    .sort_values()
    .index
)

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [23]:
# Keep the cabin letters, sorted ascending by mean survival, for ranking.
ordinal_labels = (
    df5.groupby(['Cabin'])['Survived']
    .mean()
    .sort_values()
    .index
)
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [24]:
# Map each cabin letter to its 0-based position in ordinal_labels, so a larger
# number means a higher mean survival rate.
ordinal_labels_rank = {
    label: rank for rank, label in enumerate(ordinal_labels)
}
ordinal_labels_rank

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [25]:
df5['Cabin_ordinal_labels'] = df5.Cabin.map(ordinal_labels_rank)

In [26]:
# Preview the first five rows, now including the rank-encoded cabin column.
df5.head(5)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


#### Mean Encoding
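1. Replaces each label with the mean of the target for that label, so it captures target information within the label
2. Can lead to overfitting, because the encoding is computed from the target itself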

In [27]:
# Mean encoding: replace each cabin letter by the mean of 'Survived' for that letter.
df6 = pd.read_csv('titanic_train.csv', usecols=['Cabin', 'Survived'])
# Assign the filled column back instead of using fillna(..., inplace=True) on
# the column selection: that chained in-place form raises a FutureWarning on
# pandas 2.x and is a silent no-op under Copy-on-Write (default from pandas 3.0).
df6['Cabin'] = df6['Cabin'].fillna('Missing')
# Keep only the first character of each cabin value.
df6['Cabin'] = df6['Cabin'].astype(str).str[0]
# The column name is kept as in the original notebook; the values stored are
# per-letter target means, not ordinal ranks.
df6['Cabin_ordinal_labels'] = df6.Cabin.map(df6.groupby(['Cabin'])['Survived'].mean().to_dict())

In [28]:
# Preview the first five rows with the mean-encoded cabin column.
df6.head(5)

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,0.299854
1,1,C,0.59322
2,1,M,0.299854
3,1,C,0.59322
4,0,M,0.299854


#### Probability Ratio Encoding

In [29]:
# Load the categorical feature 'Cabin' together with the target 'Survived'.
df7 = pd.read_csv(
    'titanic_train.csv',
    usecols=['Cabin', 'Survived'],
)

In [31]:
# Preview the first five raw rows; missing cabins are still empty here.
df7.head(5)

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [32]:
# Replace missing cabin values with the placeholder 'Missing'.
# Only 'Cabin' is filled: a frame-wide fillna would also write the string
# 'Missing' into the numeric 'Survived' target if it ever had a gap, which
# would break the per-cabin mean computed later. Assigning the result (rather
# than inplace=True) also keeps the cell consistent with the other Cabin
# preparation cells in this notebook.
df7['Cabin'] = df7['Cabin'].fillna('Missing')

In [35]:
# Reduce every cabin value to its first character (e.g. 'C85' becomes 'C').
df7['Cabin'] = df7.Cabin.astype(str).str[0]

In [36]:
# Preview the first five rows after reducing 'Cabin' to a single letter.
df7.head(5)

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [40]:
# Mean of 'Survived' per cabin letter, held as a one-column DataFrame so that
# further columns can be added to it.
probability_df = df7.groupby(['Cabin'])['Survived'].mean().to_frame()

In [41]:
# Display the mean of 'Survived' for each cabin letter.
probability_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [42]:
# 'Died' is the complement of the per-cabin survival rate.
probability_df['Died'] = 1 - probability_df.Survived

In [43]:
# Display the table again, now with the 'Died' column next to 'Survived'.
probability_df

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [44]:
# Ratio of the survival rate to the death rate for each cabin letter.
# NOTE(review): a cabin letter with Died == 0 would yield inf here; none occurs
# in the table shown for this data, but confirm before reusing on other data.
probability_df['Probabilty_ratio'] = (
    probability_df['Survived'] / probability_df['Died']
)

In [45]:
# Display the table again, now with the survived/died ratio per cabin letter.
probability_df

Unnamed: 0_level_0,Survived,Died,Probabilty_ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [53]:
df7['Cabin_encoded'] = df7.Cabin.map(probability_df['Probabilty_ratio'].to_dict())

In [54]:
# Preview the first five rows with the ratio-encoded cabin column.
df7.head(5)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
