# Simple encoding: average-by-level - Pandas and numpy
***

## Imports

In [5]:
import pandas as pd               # pandas for handling mixed data sets 
import numpy as np                # numpy for basic math and matrix operations
from numpy.random import uniform  # uniform random draws, used to add noise to the encoded values

#### Create a sample data set

In [6]:
# Toy frame: two categorical inputs (x1, x2) and a random integer target y
# drawn from 0..999. y is kept as the last column because the encoding
# cells select the input columns with columns[:-1].
scratch_df = pd.DataFrame({
    'x1': list('AAAAABBBBB'),
    'x2': list('CDDDCCECEE'),
    'y': pd.Series(np.random.choice(1000, size=10)),
})
scratch_df

Unnamed: 0,x1,x2,y
0,A,C,158
1,A,D,707
2,A,D,674
3,A,D,951
4,A,C,248
5,B,C,942
6,B,E,931
7,B,C,523
8,B,E,405
9,B,E,569


#### Encode categorical variables using an average-by-level approach (mean of y per level)

In [7]:
# make a new deep copy of scratch_df 
# so you can run this cell many times w/o errors
scratch_df1 = scratch_df.copy()

# every column except the last one (the target, y) is a categorical input
for col_name in scratch_df.columns[:-1]:
    new_col_name = col_name + '_encode'
    # Series indexed by original categorical value, holding the average y
    # for that value; groupby computes every level's mean in one pass
    # instead of re-filtering the frame once per level
    level_means = scratch_df.groupby(col_name)['y'].mean()
    # replace each row's level with that level's average y
    scratch_df1[new_col_name] = scratch_df[col_name].map(level_means)

scratch_df1

Unnamed: 0,x1,x2,y,x1_encode,x2_encode
0,A,C,158,547.6,467.75
1,A,D,707,547.6,777.333333
2,A,D,674,547.6,777.333333
3,A,D,951,547.6,777.333333
4,A,C,248,547.6,467.75
5,B,C,942,674.0,467.75
6,B,E,931,674.0,635.0
7,B,C,523,674.0,467.75
8,B,E,405,674.0,635.0
9,B,E,569,674.0,635.0


#### Perturb to prevent overfitting

In [9]:
# make a new deep copy of scratch_df 
# so you can run this cell many times w/o errors
scratch_df2 = scratch_df.copy()

# every column except the last one (the target, y) is a categorical input
for col_name in scratch_df.columns[:-1]:
    new_col_name = col_name + '_encode'
    # Series indexed by original categorical value, holding the average y
    # for that value
    level_means = scratch_df.groupby(col_name)['y'].mean()
    # one independent uniform draw in [-5, 5) per row, generated in a single
    # vectorized call; a little random noise can prevent overfitting for
    # rare levels
    noise = uniform(low=-5, high=5, size=len(scratch_df))
    # apply the level -> average mapping to all rows, then perturb each row
    scratch_df2[new_col_name] = scratch_df[col_name].map(level_means) + noise

scratch_df2

Unnamed: 0,x1,x2,y,x1_encode,x2_encode
0,A,C,158,549.272525,472.079463
1,A,D,707,550.507909,774.249363
2,A,D,674,552.234351,780.370135
3,A,D,951,545.406594,774.984095
4,A,C,248,546.832261,465.229813
5,B,C,942,678.281481,467.12427
6,B,E,931,673.194447,638.293288
7,B,C,523,674.193392,463.425045
8,B,E,405,677.787234,639.248453
9,B,E,569,674.505306,632.268885
