## Target encoding example
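One-hot encoding of high-cardinality features explodes the number of columns and can lead to unstable results; target encoding instead replaces each category with a single summary statistic of the target.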

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Define function
def target_encoder(df, column, target, index=None, method='mean'):
    """
    Encode a categorical column with a per-category summary statistic of the target.

    Target-based encoding turns a categorical variable into a single numerical variable, which avoids the
    dimensionality explosion of one-hot encoding for high cardinality features. Every category (level) is replaced
    by a summary statistic of the target computed over the rows belonging to that category.

    Args:
        df (pandas df): Pandas DataFrame containing the categorical column and target.
        column (str): Categorical variable column to be encoded.
        target (str): Target on which to encode.
        index (arr): Optional positional (``iloc``-style) row indices of the training rows. When supplied, the
            statistics are computed from these rows only, which avoids data leakage from the test fold. When
            omitted, all rows of ``df`` are used.
        method (str): Summary statistic of the target. One of 'mean', 'median' or 'std'.

    Returns:
        pandas Series: Encoded column, aligned with the index of ``df``. Categories that do not occur in the rows
        used for fitting are mapped to NaN.

    Raises:
        ValueError: If ``method`` is not one of 'mean', 'median', 'std'.

    """

    if method not in ('mean', 'median', 'std'):
        raise ValueError("Incorrect method supplied: '{}'. Must be one of 'mean', 'median', 'std'".format(method))

    # Use the frame itself when no indices are given. Passing df.index to iloc would treat index *labels* as
    # *positions*, which only works for a default 0..n-1 index and fails for any other index.
    train_rows = df if index is None else df.iloc[index]

    # One statistic per category, computed on the training rows only
    category_stats = train_rows.groupby(column)[target].agg(method)

    # Map the statistics onto every row of df, so test rows receive the values learned from the train rows
    return df[column].map(category_stats)

In [3]:
# Toy dataset: 15 rows with two categorical id columns and a numeric target
df = pd.DataFrame(
    data={
        'product_id': 4 * ['a'] + ['c'] + 5 * ['b'] + ['a'] + 3 * ['c'] + ['b'],
        'product_type_id': 7 * [111] + 3 * [999] + 4 * [0] + [999],
        'target': [1, 3, 7, 4, 0, 1, 0, 1, 0, 0, 0, 1, 2, 3, 10],
    },
)

df

Unnamed: 0,product_id,product_type_id,target
0,a,111,1
1,a,111,3
2,a,111,7
3,a,111,4
4,c,111,0
5,b,111,1
6,b,111,0
7,b,999,1
8,b,999,0
9,b,999,0


In [4]:
# Mean-encode product_id using every row of df (no train/test split)
df['product_id_encoded_mean'] = target_encoder(
    df, 'product_id', 'target', method='mean',
)
df

Unnamed: 0,product_id,product_type_id,target,product_id_encoded_mean
0,a,111,1,3.0
1,a,111,3,3.0
2,a,111,7,3.0
3,a,111,4,3.0
4,c,111,0,1.5
5,b,111,1,2.0
6,b,111,0,2.0
7,b,999,1,2.0
8,b,999,0,2.0
9,b,999,0,2.0


In [5]:
# Std-encode product_type_id using every row of df (no train/test split)
df['product_type_id_encoded_std'] = target_encoder(
    df, 'product_type_id', 'target', method='std',
)
df

Unnamed: 0,product_id,product_type_id,target,product_id_encoded_mean,product_type_id_encoded_std
0,a,111,1,3.0,2.56348
1,a,111,3,3.0,2.56348
2,a,111,7,3.0,2.56348
3,a,111,4,3.0,2.56348
4,c,111,0,1.5,2.56348
5,b,111,1,2.0,2.56348
6,b,111,0,2.0,2.56348
7,b,999,1,2.0,4.856267
8,b,999,0,2.0,4.856267
9,b,999,0,2.0,4.856267


In [6]:
# Median-encode product_id from the train rows only (positions 0-5) to avoid data leakage in folds.
# The statistics learned on the train rows are mapped onto all rows, including the test rows.
df['product_id_encoded_median_train'] = target_encoder(
    df, 'product_id', 'target', index=list(range(6)), method='median',
)
df

Unnamed: 0,product_id,product_type_id,target,product_id_encoded_mean,product_type_id_encoded_std,product_id_encoded_median_train
0,a,111,1,3.0,2.56348,3.5
1,a,111,3,3.0,2.56348,3.5
2,a,111,7,3.0,2.56348,3.5
3,a,111,4,3.0,2.56348,3.5
4,c,111,0,1.5,2.56348,0.0
5,b,111,1,2.0,2.56348,1.0
6,b,111,0,2.0,2.56348,1.0
7,b,999,1,2.0,4.856267,1.0
8,b,999,0,2.0,4.856267,1.0
9,b,999,0,2.0,4.856267,1.0
