# Simple encoding: rate-by-level - Pandas and numpy
***

## Imports

In [38]:
import pandas as pd               # pandas for handling mixed data sets 
from numpy.random import uniform  # numpy for basic math and matrix operations

#### Create a sample data set

In [39]:
# toy data set, written one record per row:
# two categorical inputs (x1, x2) and a binary target (y)
scratch_df = pd.DataFrame(
    [
        ('A', 'C', 0),
        ('A', 'D', 0),
        ('A', 'D', 1),
        ('A', 'D', 0),
        ('A', 'C', 1),
        ('B', 'C', 1),
        ('B', 'E', 1),
        ('B', 'C', 1),
        ('B', 'E', 0),
        ('B', 'E', 1),
    ],
    columns=['x1', 'x2', 'y'],
)
scratch_df

Unnamed: 0,x1,x2,y
0,A,C,0
1,A,D,0
2,A,D,1
3,A,D,0
4,A,C,1
5,B,C,1
6,B,E,1
7,B,C,1
8,B,E,0
9,B,E,1


#### Encode categorical variables using a rate-by-level approach 

In [44]:
# make a new deep copy of scratch_df
# so you can run this cell many times w/o errors
scratch_df1 = scratch_df.copy()

# loop through the input columns (every column except the last one, the
# target 'y') and add an '<column>_encode' column for each
for col_name in scratch_df.columns[:-1]:
    new_col_name = col_name + '_encode'
    # create a dictionary of original categorical value:event rate for that value
    # (a single groupby pass replaces one boolean-mask scan per level;
    # sort=False keeps the levels in order of first appearance)
    row_val_dict = scratch_df.groupby(col_name, sort=False)['y'].mean().to_dict()
    # apply the transform from the dictionary on all rows in the column;
    # map() yields NaN for a value with no dictionary entry instead of raising
    scratch_df1[new_col_name] = scratch_df[col_name].map(row_val_dict)

scratch_df1

Unnamed: 0,x1,x2,y,x1_encode,x2_encode
0,A,C,0,0.4,0.75
1,A,D,0,0.4,0.333333
2,A,D,1,0.4,0.333333
3,A,D,0,0.4,0.333333
4,A,C,1,0.4,0.75
5,B,C,1,0.8,0.75
6,B,E,1,0.8,0.666667
7,B,C,1,0.8,0.75
8,B,E,0,0.8,0.666667
9,B,E,1,0.8,0.666667


#### Perturb the encoded values with random noise to reduce overfitting

In [41]:
# make a new deep copy of scratch_df
# so you can run this cell many times w/o errors
scratch_df2 = scratch_df.copy()

# loop through the input columns (every column except the last one, the
# target 'y') and add a noisy '<column>_encode' column for each
for col_name in scratch_df.columns[:-1]:
    new_col_name = col_name + '_encode'
    # create a dictionary of original categorical value:event rate for that value
    # (a single groupby pass replaces one boolean-mask scan per level;
    # sort=False keeps the levels in order of first appearance)
    row_val_dict = scratch_df.groupby(col_name, sort=False)['y'].mean().to_dict()
    # add in a little random noise, can prevent overfitting for rare levels;
    # one independent draw per row, all drawn in a single vectorized call
    # (values differ on every run unless numpy's global random seed is fixed)
    noise = uniform(low=-0.05, high=0.05, size=len(scratch_df))
    # apply the transform from the dictionary on all rows in the column
    scratch_df2[new_col_name] = scratch_df[col_name].map(row_val_dict) + noise

scratch_df2

Unnamed: 0,x1,x2,y,x1_encode,x2_encode
0,A,C,0,0.39666,0.752811
1,A,D,0,0.374276,0.359561
2,A,D,1,0.38512,0.362976
3,A,D,0,0.366503,0.35395
4,A,C,1,0.408456,0.704154
5,B,C,1,0.844466,0.737979
6,B,E,1,0.786456,0.707412
7,B,C,1,0.760163,0.709422
8,B,E,0,0.752278,0.709365
9,B,E,1,0.790468,0.714065
