## Target encoding done the right way
Author: Max Halford   
Source: https://maxhalford.github.io/blog/target-encoding/

In [1]:
import pandas as pd

### Target Encoding (without Smoothing)

In [2]:
# Toy dataset: two categorical features (x_0, x_1) and a binary target (y).
df = pd.DataFrame(dict(
    x_0=list('a' * 5 + 'b' * 5),
    x_1=list('c' * 9 + 'd'),
    y=[1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
))
print(df)

  x_0 x_1  y
0   a   c  1
1   a   c  1
2   a   c  1
3   a   c  1
4   a   c  0
5   b   c  1
6   b   c  0
7   b   c  0
8   b   c  0
9   b   d  0


In [3]:
# Per-category average of the target y: one lookup Series per categorical column,
# indexed by the category label.
means_x0 = df['y'].groupby(df['x_0']).mean()
means_x1 = df['y'].groupby(df['x_1']).mean()

In [4]:
# Replace each category by the mean target value computed for it.
# The encoded columns go into a new frame instead of overwriting `df`, so `df`
# keeps its raw categories and this cell can be re-run safely (mapping
# already-encoded floats through the category lookups would produce NaN).
df_encoded = df.assign(
    x_0=df['x_0'].map(means_x0),
    x_1=df['x_1'].map(means_x1),
)
print(df_encoded)

   x_0       x_1  y
0  0.8  0.555556  1
1  0.8  0.555556  1
2  0.8  0.555556  1
3  0.8  0.555556  1
4  0.8  0.555556  0
5  0.2  0.555556  1
6  0.2  0.555556  0
7  0.2  0.555556  0
8  0.2  0.555556  0
9  0.2  0.000000  0


### Target Encoding with Additive or Laplace Smoothing
$$ \mu = \frac{n \times \bar{x} + m \times w}{n + m} $$  
where

- $\mu$ is the smoothed mean that will replace the categorical values
- $n$ is the number of values we have for the category
- $\bar{x}$ is the estimated mean of the target within the category
- $m$ is the weight assigned to the overall mean
- $w$ is the overall mean of the target

If $m = 0$, the overall mean receives no weight and we get the normal empirical mean:
$$\mu = \frac{n \times \bar{x} + m \times w}{n + m} = \frac{n \times \bar{x}}{n} = \bar{x}$$

In [5]:
def calc_smooth_mean(df, by, on, m):
    '''
    Encode a categorical column as the smoothed per-category mean of a target.

    Each category's own mean of `on` is blended with the mean of `on` over
    the whole frame, weighted by the category's count and by `m`:

        (count * category_mean + m * overall_mean) / (count + m)

    df: pandas dataframe holding both columns
    by: categorical column name
    on: target column name
    m: weight assigned to overall mean (with m=0 the formula reduces to the
       plain category mean)

    Returns a Series with df's index that holds, for every row, the smoothed
    mean of that row's category.
    '''
    target_by_category = df.groupby(by)[on]

    # per-category sample size (n) and sample mean (x_bar)
    group_size = target_by_category.count()
    group_mean = target_by_category.mean()

    # overall mean (w) scaled by its weight (m)
    prior_total = m * df[on].mean()

    # blend each category's own evidence with the overall mean
    blended = (group_size * group_mean + prior_total) / (group_size + m)

    # look up every row's category in the blended table
    return df[by].map(blended)

In [6]:
# Start again from the raw categorical columns for the smoothing examples.
df = pd.DataFrame(dict(
    x_0=list('a' * 5 + 'b' * 5),
    x_1=list('c' * 9 + 'd'),
    y=[1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
))

In [7]:
# m=0 gives the overall mean no weight, so this reproduces the non-smoothed means
print('Column: x_0:', calc_smooth_mean(df, by='x_0', on='y', m=0), sep='\n')

Column: x_0:
0    0.8
1    0.8
2    0.8
3    0.8
4    0.8
5    0.2
6    0.2
7    0.2
8    0.2
9    0.2
Name: x_0, dtype: float64


In [8]:
# x_1 with m=0: again the plain per-category means
print('Column: x_1:', calc_smooth_mean(df, by='x_1', on='y', m=0), sep='\n')

Column: x_1:
0    0.555556
1    0.555556
2    0.555556
3    0.555556
4    0.555556
5    0.555556
6    0.555556
7    0.555556
8    0.555556
9    0.000000
Name: x_1, dtype: float64


In [9]:
# Quick look at x_1 with m=10: the single 'd' row is pulled from 0 toward the overall mean.
calc_smooth_mean(df, m=10, on='y', by='x_1')

0    0.526316
1    0.526316
2    0.526316
3    0.526316
4    0.526316
5    0.526316
6    0.526316
7    0.526316
8    0.526316
9    0.454545
Name: x_1, dtype: float64

In [10]:
# same encoding, now with the overall mean weighted by m=10
print('Column: x_0:', calc_smooth_mean(df, by='x_0', on='y', m=10), sep='\n')

Column: x_0:
0    0.6
1    0.6
2    0.6
3    0.6
4    0.6
5    0.4
6    0.4
7    0.4
8    0.4
9    0.4
Name: x_0, dtype: float64


In [11]:
# x_1 with the overall mean weighted by m=10
print('Column: x_1:', calc_smooth_mean(df, by='x_1', on='y', m=10), sep='\n')

Column: x_1:
0    0.526316
1    0.526316
2    0.526316
3    0.526316
4    0.526316
5    0.526316
6    0.526316
7    0.526316
8    0.526316
9    0.454545
Name: x_1, dtype: float64


The author mentions that $m=300$ works well in most cases:
```
you’re saying that you require that there must be at least 300 values for the sample mean to overtake the global mean
```