### Handle Categorical Features
- One Hot Encoding

In [1]:
import pandas as pd

In [3]:
# Read only the 'sex' column; it is the categorical feature used in the
# one-hot encoding demo.
df = pd.read_csv('titanic.csv', usecols=['sex'])


In [4]:
# Preview the raw string labels ('male' / 'female') before encoding.
df.head()

Unnamed: 0,sex
0,male
1,female
2,female
3,female
4,male


In [6]:
# One-hot encode 'sex'. drop_first=True drops the first level ('female'), so a
# single 'sex_male' indicator remains and no redundant column is created.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return booleans (True/False).
pd.get_dummies(df, drop_first=True, dtype=int)

Unnamed: 0,sex_male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [13]:
# Reload the data with only 'embarked', a categorical feature with more than
# two levels.
df = pd.read_csv('titanic.csv', usecols=['embarked'])

In [14]:
# List the distinct ports; NaN appears because some rows have no value.
df['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [15]:
# Drop the rows whose 'embarked' value is missing. Reassigning the result
# (instead of inplace=True) keeps the cell explicit about which frame the
# following cells use.
df = df.dropna()

In [19]:
# Count the distinct ports left after the missing rows were dropped.
len(df['embarked'].unique())

3

In [18]:
# One-hot encode 'embarked' into k-1 indicator columns. The first level ('C')
# is dropped and becomes the baseline: a row with 0 in both columns is 'C'.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return booleans (True/False).
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### Count or Frequency Encoding

In [20]:
# Load the UCI Adult data. header=None means the columns are labelled by
# position (0..14).
# NOTE(review): string fields keep the space that follows each comma in the
# file (e.g. ' State-gov'); skipinitialspace=True would strip it.
train_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    index_col=None,
)
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [24]:
# Distinct labels of column 1 (renamed 'Employment' further down). Every label
# carries a leading space, and ' ?' presumably marks unknown values.
train_set[1].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [25]:
# Positional labels of the string-valued columns to keep from the raw frame.
columns = [1, 3, 5, 6, 7, 8, 9, 13]

In [26]:
# Keep only the selected columns. .copy() makes train_set an independent frame,
# so assigning to one of its columns later does not raise
# SettingWithCopyWarning.
train_set = train_set[columns].copy()

In [27]:
# Replace the positional labels with readable names (same order as `columns`).
train_set.columns = [
    'Employment', 'Degree', 'Status', 'Designation',
    'family_job', 'Race', 'Sex', 'Country',
]

In [28]:
# Confirm the subset and the new column names.
train_set.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [30]:
# Cardinality check: print how many distinct labels each column holds.
for col in train_set.columns:
    n_labels = len(train_set[col].unique())
    print(col, ':', n_labels, 'labels')

Employment : 9 labels
Degree : 16 labels
Status : 7 labels
Designation : 15 labels
family_job : 6 labels
Race : 5 labels
Sex : 2 labels
Country : 42 labels


In [35]:
# Build a {label: count} lookup for 'Country'. value_counts() sorts by count,
# so the dict runs from the most to the least frequent label.
country_map = (
    train_set['Country']
    .value_counts()
    .to_dict()
)

In [39]:
# Inspect the lookup; the keys keep the leading space of the raw labels.
country_map

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Honduras': 13,
 ' Hungary': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [36]:
# Replace each country label with its frequency. assign() builds a new frame
# instead of writing into train_set, which avoids SettingWithCopyWarning when
# train_set is a column subset of the raw frame.
# NOTE: not idempotent -- re-running this cell maps the counts through
# country_map again and yields NaN, because the counts are not keys of the dict.
train_set = train_set.assign(Country=train_set['Country'].map(country_map))

In [38]:
# Inspect the result: 'Country' now holds the frequency of the original label
# (e.g. 29170 for ' United-States', 95 for ' Cuba').
train_set.head(50)

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,95
5,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,29170
6,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,81
7,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170
8,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,29170
9,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,29170


### Advantages
- Easy To Use
- Does not increase the feature space (one categorical column stays one numeric column)
### Disadvantages
- Labels with the same frequency get the same encoded value, so the model can no longer tell them apart (e.g. France and Greece both become 29)

### Target Guided Ordinal Encoding
1. Order the labels by the mean of the target within each label (here, the survival rate)
2. Replace each label with its rank in that ordering (0 for the label with the lowest mean)

In [42]:
import pandas as pd

# Reload only the target ('survived') and the categorical feature ('deck').
df = pd.read_csv('titanic.csv', usecols=['deck', 'survived'])
df.head()

Unnamed: 0,survived,deck
0,0,
1,1,C
2,1,
3,1,C
4,0,


In [43]:
# Give missing decks their own 'Missing' label so they form a category.
# The result is assigned back because fillna(inplace=True) on df['deck'] is
# chained assignment: it warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['deck'] = df['deck'].fillna('Missing')

In [44]:
# Check the fill: rows that had no deck now read 'Missing'.
df.head(20)

Unnamed: 0,survived,deck
0,0,Missing
1,1,C
2,1,Missing
3,1,C
4,0,Missing
5,0,Missing
6,0,E
7,0,Missing
8,1,Missing
9,1,Missing


In [45]:
# Shorten every label to its first character, so 'Missing' becomes 'M'.
deck_as_text = df['deck'].astype(str)
df['deck'] = deck_as_text.str[0]

In [49]:
# Confirm the labels are now single characters ('Missing' shows as 'M').
df.head()

Unnamed: 0,survived,deck
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [50]:
# Distinct deck labels after shortening.
df['deck'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [52]:
# Mean of the 0/1 target per deck label, i.e. the survival rate of each label.
df.groupby('deck')['survived'].mean()

deck
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
M    0.299419
Name: survived, dtype: float64

In [55]:
# Same survival rates, ordered from the lowest to the highest.
(
    df.groupby('deck')['survived']
    .mean()
    .sort_values()
)

deck
M    0.299419
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: survived, dtype: float64

In [64]:
# Deck labels ordered by ascending survival rate; a label's position in this
# index becomes its ordinal code.
ordinal_labels = (
    df.groupby('deck')['survived']
    .mean()
    .sort_values()
    .index
)

In [65]:
# Deck labels from the lowest to the highest mean survival.
ordinal_labels


Index(['M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='deck')

In [60]:
# Survival rates in the same ascending order as ordinal_labels.
ordinal_labels_values = (
    df.groupby('deck')['survived']
    .mean()
    .sort_values()
    .values
)

In [61]:
# Display only; none of the later cells shown here read this array.
ordinal_labels_values

array([0.2994186 , 0.46666667, 0.5       , 0.59322034, 0.61538462,
       0.74468085, 0.75      , 0.75757576])

In [70]:
# Label -> rank lookup: the label with the lowest survival rate gets 0.
dict_ord = {label: rank for rank, label in enumerate(ordinal_labels)}

In [71]:
# 'M' (lowest survival rate) maps to 0 and 'D' (highest) maps to 7.
dict_ord

{'M': 0, 'A': 1, 'G': 2, 'C': 3, 'F': 4, 'B': 5, 'E': 6, 'D': 7}

In [72]:
# Add the target-guided ordinal code next to the original label.
df['Cabin_ordinal_labels'] = df['deck'].map(dict_ord)

In [76]:
# Inspect the new ordinal column alongside the labels it was derived from.
df.head(100)

Unnamed: 0,survived,deck,Cabin_ordinal_labels
0,0,M,0
1,1,C,3
2,1,M,0
3,1,C,3
4,0,M,0
...,...,...,...
95,0,M,0
96,0,A,1
97,1,D,7
98,1,M,0


### Mean Encoding

In [77]:
# Mean encoding lookup: each deck label -> mean of the target for that label.
mean_ordinal = (
    df.groupby('deck')['survived']
    .mean()
    .to_dict()
)

In [78]:
# Inspect the lookup.
# NOTE(review): the cells shown here never map it onto df['deck'].
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29941860465116277}

### Probability Ratio Encoding
- Probability of survival for each label of the categorical feature (groupby on `deck`)
- Probability of not surviving (1 - probability of survival)
- Ratio = pr(survived) / pr(not survived)
- Build a dictionary that maps each label to its probability ratio
- Replace the categorical feature with the mapped ratio

In [120]:
import pandas as pd

# Start again from the raw file so this section does not depend on the columns
# added in the previous one.
df = pd.read_csv('titanic.csv', usecols=['deck', 'survived'])
df.head()

Unnamed: 0,survived,deck
0,0,
1,1,C
2,1,
3,1,C
4,0,


In [121]:
# Give missing decks their own 'Missing' label so they form a category.
# The result is assigned back because fillna(inplace=True) on df['deck'] is
# chained assignment: it warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['deck'] = df['deck'].fillna('Missing')

In [122]:
# At this point the filler label is still the full string 'Missing'.
df['deck'].unique()

array(['Missing', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [123]:
# Survival rate per deck label: the mean of the 0/1 target within each label.
prob_df = df.groupby('deck')['survived'].mean()

In [124]:
# Series indexed by deck label; the values are the survival rates.
prob_df

deck
A          0.466667
B          0.744681
C          0.593220
D          0.757576
E          0.750000
F          0.615385
G          0.500000
Missing    0.299419
Name: survived, dtype: float64

In [125]:
# Promote the Series to a one-column DataFrame so more columns can be added.
prob_df=pd.DataFrame(prob_df)

In [126]:
# The single column keeps the Series name, 'survived'.
prob_df.head()

Unnamed: 0_level_0,survived
deck,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75


In [127]:
# Complement of the survival rate: the share of each label that did not survive.
prob_df['Not live'] = 1 - prob_df['survived']

In [128]:
# 'survived' and 'Not live' sum to 1 in every row.
prob_df.head()

Unnamed: 0_level_0,survived,Not live
deck,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [129]:
# Ratio P(survived) / P(not survived) for each deck label.
# NOTE(review): a label whose survival rate is exactly 1.0 would divide by zero
# and give inf; no such label appears in the rates shown above.
survived_rate = prob_df['survived']
not_survived_rate = prob_df['Not live']
prob_df['Probability_ratio'] = survived_rate / not_survived_rate

In [130]:
# A ratio above 1 means the label's survival rate is above 0.5.
prob_df.head()

Unnamed: 0_level_0,survived,Not live,Probability_ratio
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0


In [131]:
# Label -> probability-ratio lookup used to encode the feature.
ratio_by_deck = prob_df['Probability_ratio']
probability_encoded = ratio_by_deck.to_dict()

In [132]:
# The keys are the full labels, including 'Missing'.
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'Missing': 0.4273858921161825}

In [133]:
# Encode each row with the ratio of its deck label. This has to run while
# 'deck' still holds the full labels, because the lookup is keyed by them
# (e.g. 'Missing').
df['Cabin_encoded'] = df['deck'].map(probability_encoded)

In [137]:
# Shorten every label to its first character ('Missing' -> 'M').
# NOTE: re-running the mapping cell after this one would give NaN for the 'M'
# rows, since probability_encoded has the key 'Missing', not 'M'.
deck_as_text = df['deck'].astype(str)
df['deck'] = deck_as_text.str[0]

In [139]:
# 'deck' shows the shortened labels; 'Cabin_encoded' holds the ratio that was
# mapped from the full labels.
df.head(20)

Unnamed: 0,survived,deck,Cabin_encoded
0,0,M,0.427386
1,1,C,1.458333
2,1,M,0.427386
3,1,C,1.458333
4,0,M,0.427386
5,0,M,0.427386
6,0,E,3.0
7,0,M,0.427386
8,1,M,0.427386
9,1,M,0.427386
