# Categorical Variable Encoding

### Types of Categorical Variable Encoding
    1) One Hot Encoding
    2) Label Encoding
    3) Ordinal Encoding
    4) Helmert Encoding
    5) Binary Encoding
    6) Frequency Encoding
    7) Mean Encoding
    8) Weight of Evidence Encoding
    9) Probability Ratio Encoding
    10) Hashing Encoding
    11) Backward Difference Encoding
    12) Leave One Out Encoding
    13) James-Stein Encoding
    14) M-estimator Encoding
    15) Thermometer Encoder

##### Source:
https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02

In [26]:
## Import packages
import pandas as pd
import numpy as np

In [27]:
## Lift pandas' display limits so every row and column of the small demo frames is shown
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [28]:
## Toy dataset used by every encoder below: two categorical features and a binary target
column_names = ['Temperature', 'Color', 'Target']
records = [
    ('Hot', 'Red', 1),
    ('Cold', 'Yellow', 1),
    ('Very Hot', 'Blue', 1),
    ('Warm', 'Blue', 0),
    ('Hot', 'Red', 1),
    ('Warm', 'Yellow', 0),
    ('Warm', 'Red', 1),
    ('Hot', 'Yellow', 0),
    ('Hot', 'Yellow', 1),
    ('Cold', 'Yellow', 1),
]

## Transpose the row records into the column -> list-of-values mapping
data = {name: list(values) for name, values in zip(column_names, zip(*records))}
df = pd.DataFrame(data, columns=column_names)
df

Unnamed: 0,Temperature,Color,Target
0,Hot,Red,1
1,Cold,Yellow,1
2,Very Hot,Blue,1
3,Warm,Blue,0
4,Hot,Red,1
5,Warm,Yellow,0
6,Warm,Red,1
7,Hot,Yellow,0
8,Hot,Yellow,1
9,Cold,Yellow,1


#### 1. ONE-HOT Encoding

In [29]:
## One-hot encoding with pandas' get_dummies():
## 'Temperature' is replaced by one 'Temp_<category>' indicator column per category,
## while 'Color' and 'Target' are carried over unchanged.
df_dumm = pd.get_dummies(
    df,
    columns=['Temperature'],
    prefix=['Temp'],
)
df_dumm

Unnamed: 0,Color,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm
0,Red,1,0,1,0,0
1,Yellow,1,1,0,0,0
2,Blue,1,0,0,1,0
3,Blue,0,0,0,0,1
4,Red,1,0,1,0,0
5,Yellow,0,0,0,0,1
6,Red,1,0,0,0,1
7,Yellow,0,0,1,0,0
8,Yellow,1,0,1,0,0
9,Yellow,1,1,0,0,0


In [30]:
## One-hot encoding with scikit-learn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

ohc = OneHotEncoder()

## The column is reshaped to (n_samples, 1) before fitting; .toarray() turns the
## encoder output into a dense array so it can be wrapped in a DataFrame.
temperature_column = df.Temperature.values.reshape(-1, 1)
ohe = ohc.fit_transform(temperature_column).toarray()
# print(ohe)

## Label each indicator column after the category it flags (ohc.categories_[0]
## holds the categories learned for the single input column).
indicator_names = ["Temp_" + str(category) for category in ohc.categories_[0]]
indicator_frame = pd.DataFrame(ohe, columns=indicator_names)

## Put the indicators next to the original columns for comparison
df_ohe = pd.concat([df, indicator_frame], axis=1)
df_ohe

Unnamed: 0,Temperature,Color,Target,Temp_Cold,Temp_Hot,Temp_Very Hot,Temp_Warm
0,Hot,Red,1,0.0,1.0,0.0,0.0
1,Cold,Yellow,1,1.0,0.0,0.0,0.0
2,Very Hot,Blue,1,0.0,0.0,1.0,0.0
3,Warm,Blue,0,0.0,0.0,0.0,1.0
4,Hot,Red,1,0.0,1.0,0.0,0.0
5,Warm,Yellow,0,0.0,0.0,0.0,1.0
6,Warm,Red,1,0.0,0.0,0.0,1.0
7,Hot,Yellow,0,0.0,1.0,0.0,0.0
8,Hot,Yellow,1,0.0,1.0,0.0,0.0
9,Cold,Yellow,1,1.0,0.0,0.0,0.0


#### 2. Label Encoding

In [31]:
## Label encoding with scikit-learn: each distinct Temperature value gets an integer code
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Temperature_label_encoded'] = label_encoder.fit_transform(df['Temperature'])
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded
0,Hot,Red,1,1
1,Cold,Yellow,1,0
2,Very Hot,Blue,1,2
3,Warm,Blue,0,3
4,Hot,Red,1,1
5,Warm,Yellow,0,3
6,Warm,Red,1,3
7,Hot,Yellow,0,1
8,Hot,Yellow,1,1
9,Cold,Yellow,1,0


In [32]:
## Label encoding with pandas: factorize()
## pd.factorize returns (codes, uniques); codes are integers assigned in order of
## first appearance, so they differ from the LabelEncoder codes above.
## The codes are already a 1-D array aligned with the rows, so they are assigned
## directly -- no reshape to a 2-D (n, 1) array is needed for a single column.
temperature_codes, _temperature_uniques = pd.factorize(df['Temperature'])
df.loc[:, 'Temp_Factorize_encode'] = temperature_codes
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode
0,Hot,Red,1,1,0
1,Cold,Yellow,1,0,1
2,Very Hot,Blue,1,2,2
3,Warm,Blue,0,3,3
4,Hot,Red,1,1,0
5,Warm,Yellow,0,3,3
6,Warm,Red,1,3,3
7,Hot,Yellow,0,1,0
8,Hot,Yellow,1,1,0
9,Cold,Yellow,1,0,1


#### 3. Ordinal Encoding

In [33]:
## Ordinal encoding: the categories are ranked by hand from coldest (1) to hottest (4)
## and each Temperature value is replaced by its rank.
temperature_order = ['Cold', 'Warm', 'Hot', 'Very Hot']
temp_dict = {level: rank for rank, level in enumerate(temperature_order, start=1)}
df['Temp_Ordinal_encode'] = df['Temperature'].map(temp_dict)
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode
0,Hot,Red,1,1,0,3
1,Cold,Yellow,1,0,1,1
2,Very Hot,Blue,1,2,2,4
3,Warm,Blue,0,3,3,2
4,Hot,Red,1,1,0,3
5,Warm,Yellow,0,3,3,2
6,Warm,Red,1,3,3,2
7,Hot,Yellow,0,1,0,3
8,Hot,Yellow,1,1,0,3
9,Cold,Yellow,1,0,1,1


#### 4. Helmert Encoding

In [34]:
## Helmert (contrast) coding of 'Temperature' with the category_encoders package
import category_encoders as ce

## drop_invariant=True asks the encoder to drop columns with zero variance
cat_encoder = ce.HelmertEncoder(cols=['Temperature'], drop_invariant=True)
helmert_columns = cat_encoder.fit_transform(df['Temperature'])

## Show the contrast columns side by side with the original frame
df_ceh = pd.concat([df, helmert_columns], axis=1)
df_ceh

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temperature_0,Temperature_1,Temperature_2
0,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0
1,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0
2,Very Hot,Blue,1,2,2,4,0.0,2.0,-1.0
3,Warm,Blue,0,3,3,2,0.0,0.0,3.0
4,Hot,Red,1,1,0,3,-1.0,-1.0,-1.0
5,Warm,Yellow,0,3,3,2,0.0,0.0,3.0
6,Warm,Red,1,3,3,2,0.0,0.0,3.0
7,Hot,Yellow,0,1,0,3,-1.0,-1.0,-1.0
8,Hot,Yellow,1,1,0,3,-1.0,-1.0,-1.0
9,Cold,Yellow,1,0,1,1,1.0,-1.0,-1.0


#### 5. Binary Encoding

In [35]:
## Binary encoding of 'Temperature' with the category_encoders package
import category_encoders as ce

bin_encoder = ce.BinaryEncoder(cols=['Temperature'])
binary_columns = bin_encoder.fit_transform(df['Temperature'])

## Show the binary digit columns side by side with the original frame
df_be = pd.concat([df, binary_columns], axis=1)
df_be

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temperature_0,Temperature_1,Temperature_2
0,Hot,Red,1,1,0,3,0,0,1
1,Cold,Yellow,1,0,1,1,0,1,0
2,Very Hot,Blue,1,2,2,4,0,1,1
3,Warm,Blue,0,3,3,2,1,0,0
4,Hot,Red,1,1,0,3,0,0,1
5,Warm,Yellow,0,3,3,2,1,0,0
6,Warm,Red,1,3,3,2,1,0,0
7,Hot,Yellow,0,1,0,3,0,0,1
8,Hot,Yellow,1,1,0,3,0,0,1
9,Cold,Yellow,1,0,1,1,0,1,0


#### 6. Frequency Encoding

In [36]:
## Frequency encoding with pandas:
## each category is replaced by the fraction of rows that carry it.
rows_per_category = df.groupby('Temperature').size()
fe = rows_per_category / len(df)
df.loc[:, 'Temp_Freq_Encoder'] = df['Temperature'].map(fe)
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temp_Freq_Encoder
0,Hot,Red,1,1,0,3,0.4
1,Cold,Yellow,1,0,1,1,0.2
2,Very Hot,Blue,1,2,2,4,0.1
3,Warm,Blue,0,3,3,2,0.3
4,Hot,Red,1,1,0,3,0.4
5,Warm,Yellow,0,3,3,2,0.3
6,Warm,Red,1,3,3,2,0.3
7,Hot,Yellow,0,1,0,3,0.4
8,Hot,Yellow,1,1,0,3,0.4
9,Cold,Yellow,1,0,1,1,0.2


#### 7. Mean Encoding

In [37]:
## Mean (target) encoding:
## each category is replaced by the mean of Target over the rows in that category.
mean_encoder = df['Target'].groupby(df['Temperature']).mean()
print(mean_encoder)
df.loc[:, 'Temp_mean_enc'] = df['Temperature'].map(mean_encoder)
df

Temperature
Cold        1.000000
Hot         0.750000
Very Hot    1.000000
Warm        0.333333
Name: Target, dtype: float64


Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temp_Freq_Encoder,Temp_mean_enc
0,Hot,Red,1,1,0,3,0.4,0.75
1,Cold,Yellow,1,0,1,1,0.2,1.0
2,Very Hot,Blue,1,2,2,4,0.1,1.0
3,Warm,Blue,0,3,3,2,0.3,0.333333
4,Hot,Red,1,1,0,3,0.4,0.75
5,Warm,Yellow,0,3,3,2,0.3,0.333333
6,Warm,Red,1,3,3,2,0.3,0.333333
7,Hot,Yellow,0,1,0,3,0.4,0.75
8,Hot,Yellow,1,1,0,3,0.4,0.75
9,Cold,Yellow,1,0,1,1,0.2,1.0


In [38]:
## Smoothed mean encoding
## Each category mean is blended with the global mean:
##     smooth = (count * category_mean + weight * global_mean) / (count + weight)
## so a larger `weight` pulls every category closer to the global mean.

## Global mean of the target
mean = df['Target'].mean()

## Per-category row count and target mean
agg = df.groupby('Temperature')['Target'].agg(['count', 'mean'])
counts, means = agg['count'], agg['mean']

## Strength of the pull toward the global mean
weight = 100

## Blend the category means with the global mean
weighted_sum = counts * means + weight * mean
smooth = weighted_sum / (counts + weight)

## Replace each Temperature value by its smoothed mean
print(smooth)
df.loc[:, 'Temp_smean_enc'] = df['Temperature'].map(smooth)
df


Temperature
Cold        0.705882
Hot         0.701923
Very Hot    0.702970
Warm        0.689320
dtype: float64


Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temp_Freq_Encoder,Temp_mean_enc,Temp_smean_enc
0,Hot,Red,1,1,0,3,0.4,0.75,0.701923
1,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882
2,Very Hot,Blue,1,2,2,4,0.1,1.0,0.70297
3,Warm,Blue,0,3,3,2,0.3,0.333333,0.68932
4,Hot,Red,1,1,0,3,0.4,0.75,0.701923
5,Warm,Yellow,0,3,3,2,0.3,0.333333,0.68932
6,Warm,Red,1,3,3,2,0.3,0.333333,0.68932
7,Hot,Yellow,0,1,0,3,0.4,0.75,0.701923
8,Hot,Yellow,1,1,0,3,0.4,0.75,0.701923
9,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882


#### 8. Weight of Evidence Encoding

In [39]:
## Weight-of-Evidence table, one row per Temperature category.
## What is computed here is WoE = ln(Good / Bad), where Good is the share of
## Target == 1 within the category and Bad = 1 - Good.

## Probability of Target = 1 (i.e. "Good") for each category
woe_df = df.groupby('Temperature')['Target'].mean()
woe_df = pd.DataFrame(woe_df)

## Rename the column to 'Good' so the names match the formula
woe_df = woe_df.rename(columns={'Target': 'Good'})

## Bad probability is the complement of the Good probability
woe_df['Bad'] = 1 - woe_df.Good

## Replace exact zeros by a small value on BOTH sides of the ratio:
##   Bad == 0  would divide by zero,
##   Good == 0 would give log(0) = -inf (with a RuntimeWarning).
eps = 0.000001
woe_df['Bad'] = np.where(woe_df['Bad'] == 0, eps, woe_df['Bad'])
woe_df['Good'] = np.where(woe_df['Good'] == 0, eps, woe_df['Good'])

## Compute the WoE
woe_df['WoE'] = np.log(woe_df.Good / woe_df.Bad)
woe_df

Unnamed: 0_level_0,Good,Bad,WoE
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cold,1.0,1e-06,13.815511
Hot,0.75,0.25,1.098612
Very Hot,1.0,1e-06,13.815511
Warm,0.333333,0.666667,-0.693147


In [40]:
## Replace each Temperature value by the WoE computed for its category
woe_by_category = woe_df['WoE']
df.loc[:, 'WoE_Encode'] = df['Temperature'].map(woe_by_category)
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temp_Freq_Encoder,Temp_mean_enc,Temp_smean_enc,WoE_Encode
0,Hot,Red,1,1,0,3,0.4,0.75,0.701923,1.098612
1,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882,13.815511
2,Very Hot,Blue,1,2,2,4,0.1,1.0,0.70297,13.815511
3,Warm,Blue,0,3,3,2,0.3,0.333333,0.68932,-0.693147
4,Hot,Red,1,1,0,3,0.4,0.75,0.701923,1.098612
5,Warm,Yellow,0,3,3,2,0.3,0.333333,0.68932,-0.693147
6,Warm,Red,1,3,3,2,0.3,0.333333,0.68932,-0.693147
7,Hot,Yellow,0,1,0,3,0.4,0.75,0.701923,1.098612
8,Hot,Yellow,1,1,0,3,0.4,0.75,0.701923,1.098612
9,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882,13.815511


### 9. Probability Ratio Encoding

In [43]:
## Probability-ratio table, one row per Temperature category: PR = Good / Bad

## Good = probability of Target = 1 within the category, kept in a column named
## 'Good' so the names match the formula
pr_df = (
    df.groupby('Temperature')['Target']
    .mean()
    .to_frame()
    .rename(columns={'Target': 'Good'})
)

## Bad = 1 - Good
pr_df['Bad'] = 1 - pr_df['Good']

## Swap exact zeros in the denominator for a small value to avoid dividing by zero
pr_df['Bad'] = pr_df['Bad'].mask(pr_df['Bad'] == 0, 0.000001)

## Compute the probability ratio
pr_df['PR'] = pr_df['Good'].div(pr_df['Bad'])
pr_df

Unnamed: 0_level_0,Good,Bad,PR
Temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cold,1.0,1e-06,1000000.0
Hot,0.75,0.25,3.0
Very Hot,1.0,1e-06,1000000.0
Warm,0.333333,0.666667,0.5


In [44]:
## Replace each Temperature value by the probability ratio computed for its category
ratio_by_category = pr_df['PR']
df.loc[:, 'PR_Encode'] = df['Temperature'].map(ratio_by_category)
df

Unnamed: 0,Temperature,Color,Target,Temperature_label_encoded,Temp_Factorize_encode,Temp_Ordinal_encode,Temp_Freq_Encoder,Temp_mean_enc,Temp_smean_enc,WoE_Encode,PR_Encode
0,Hot,Red,1,1,0,3,0.4,0.75,0.701923,1.098612,3.0
1,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882,13.815511,1000000.0
2,Very Hot,Blue,1,2,2,4,0.1,1.0,0.70297,13.815511,1000000.0
3,Warm,Blue,0,3,3,2,0.3,0.333333,0.68932,-0.693147,0.5
4,Hot,Red,1,1,0,3,0.4,0.75,0.701923,1.098612,3.0
5,Warm,Yellow,0,3,3,2,0.3,0.333333,0.68932,-0.693147,0.5
6,Warm,Red,1,3,3,2,0.3,0.333333,0.68932,-0.693147,0.5
7,Hot,Yellow,0,1,0,3,0.4,0.75,0.701923,1.098612,3.0
8,Hot,Yellow,1,1,0,3,0.4,0.75,0.701923,1.098612,3.0
9,Cold,Yellow,1,0,1,1,0.2,1.0,0.705882,13.815511,1000000.0


### 10. Hashing Encoding

In [45]:
### Sample code — to be added in a future update

### 11. Backward Difference Encoding

In [46]:
### Sample code — to be added in a future update

### 12. Leave One Out Encoding

In [47]:
### Sample code — to be added in a future update

### 13. James-Stein Encoding

In [48]:
### Sample code — to be added in a future update

### 14. M-estimator Encoding

In [49]:
### Sample code — to be added in a future update

### 15. Thermometer Encoder

In [50]:
### Sample code — to be added in a future update

## <------- END of All Categorical Variable Encoding --------> 