# Using _Z-score_

### Compute the Z-score
$$
Z = \frac{x - \mu}{\sigma}
$$

where $\mu$ is the mean and $\sigma$ is the standard deviation of the column being standardized.


In [1]:
import pandas as pd
from scipy.stats import zscore

# Load the auto-mpg table ('NA' and '?' cells are parsed as missing values)
# and overwrite the raw mpg column with its z-score.
dataset = (
    pd.read_csv("dataset/auto-mpg.csv", na_values=['NA', '?'])
    .assign(mpg=lambda frame: zscore(frame['mpg']))
)
# Show the first five rows to inspect the standardized column
display(dataset.head(5))

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,-0.706439,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,-1.090751,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,-0.706439,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,-0.962647,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,-0.834543,8,302.0,140.0,3449,10.5,70,1,ford torino


# One-hot encoding, a.k.a. dummy variables (in pandas)

In [2]:
dataset = pd.read_csv("dataset/simple-dataset.csv", na_values=['NA', '?'])

# Report the distinct labels found in the 'area' column
areas = list(dataset['area'].unique())
print(f'Number of areas: {len(areas)}')
print(f'Areas: {areas}')

# Build one indicator column per label, named area_<label>
dummies = pd.get_dummies(dataset['area'], prefix='area')
# Swap the original categorical column for its indicator columns:
# 'area' is removed and the dummies are appended on the right
dataset = pd.concat([dataset.drop(columns='area'), dummies], axis=1)

dataset.head()

Number of areas: 4
Areas: ['c', 'd', 'a', 'b']


Unnamed: 0,id,job,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product,area_a,area_b,area_c,area_d
0,1,vv,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b,0,0,1,0
1,2,kd,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c,0,0,1,0
2,3,pe,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b,0,0,1,0
3,4,11,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b,0,0,1,0
4,5,kl,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a,0,0,0,1


# Target encoding without smoothing
## Approach:
### Replace each category label with the mean of the target values in its group

In [3]:
# Create a small sample dataset
import pandas as pd
import numpy as np

np.random.seed(43)  # fix the RNG so the random 'cont_9' column is reproducible

dataset = pd.DataFrame({
    'cont_9': np.random.rand(10)*100,
    'cat_0': ['dog'] * 5 + ['cat'] * 5,
    'cat_1': ['wolf'] * 9 + ['tiger'] * 1,
    'y': [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]
})
print("Before Mapping")
print(dataset)

# Mean of the target 'y' within each 'cat_0' group, as a label -> mean lookup
mean_by_group = dict(dataset.groupby('cat_0')['y'].mean())

# Add the encoded column (each label replaced by its group mean) and
# remove the original categorical column in one step
dataset = (
    dataset
    .assign(cat_0_enc=dataset["cat_0"].map(mean_by_group))
    .drop(columns='cat_0')
)
print("After Mapping")
print(dataset)

Before Mapping
      cont_9 cat_0  cat_1  y
0  11.505457   dog   wolf  1
1  60.906654   dog   wolf  0
2  13.339096   dog   wolf  1
3  24.058962   dog   wolf  1
4  32.713906   dog   wolf  1
5  85.913749   cat   wolf  1
6  66.609021   cat   wolf  0
7  54.116221   cat   wolf  0
8   2.901382   cat   wolf  0
9  73.374830   cat  tiger  0
After Mapping
      cont_9  cat_1  y  cat_0_enc
0  11.505457   wolf  1        0.8
1  60.906654   wolf  0        0.8
2  13.339096   wolf  1        0.8
3  24.058962   wolf  1        0.8
4  32.713906   wolf  1        0.8
5  85.913749   wolf  1        0.2
6  66.609021   wolf  0        0.2
7  54.116221   wolf  0        0.2
8   2.901382   wolf  0        0.2
9  73.374830  tiger  0        0.2


# Target encoding with smoothing (using a weight factor, here 5)
### Clear explanation: [kaggle](https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features)

In [10]:
# Source: https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
def calc_smooth_mean(df1, df2, cat_name, target, weight):
    """Target-encode a categorical column with smoothing toward the global mean.

    For every category in ``df1[cat_name]`` the encoded value is

        (count * group_mean + weight * global_mean) / (count + weight)

    so categories with few rows are pulled toward the global target mean.

    All statistics are computed from ``df1`` only (not from any module-level
    frame), so the function can be fitted on one frame and applied to another.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame used to compute the statistics; must contain ``cat_name`` and
        ``target``. Its ``cat_name`` column is also encoded and returned.
    df2 : pandas.DataFrame or None
        Optional second frame whose ``cat_name`` column is encoded with the
        statistics learned from ``df1``.
    cat_name : str
        Name of the categorical column to encode.
    target : str
        Name of the target column in ``df1``.
    weight : int or float
        Smoothing strength; larger values pull group means harder toward the
        global mean.

    Returns
    -------
    pandas.Series
        Encoded ``df1[cat_name]`` when ``df2`` is None.
    tuple of (pandas.Series, pandas.Series)
        Encoded ``df1[cat_name]`` and ``df2[cat_name]`` otherwise. Categories
        of ``df2`` that do not occur in ``df1`` are encoded as NaN.
    """
    # Global target mean, taken from the frame that was passed in
    global_mean = df1[target].mean()

    # Per-category row count and target mean
    group_stats = df1.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = group_stats['count']
    means = group_stats['mean']

    # Weighted blend of each group mean with the global mean
    smooth = (counts * means + weight * global_mean) / (counts + weight)

    # Replace each category label by its smoothed mean
    encoded_df1 = df1[cat_name].map(smooth)
    if df2 is None:
        return encoded_df1
    return encoded_df1, df2[cat_name].map(smooth.to_dict())

In [11]:
dataset = pd.DataFrame({
    'cont_9': np.random.rand(10)*100,
    'cat_0': ['dog'] * 5 + ['cat'] * 5,
    'cat_1': ['wolf'] * 9 + ['tiger'] * 1,
    'y': [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]
})

# Smoothing strength; tune it to the dataset at hand
WEIGHT = 5

# Add a smoothed target encoding next to each categorical column,
# producing 'cat_0_enc' and then 'cat_1_enc'
for column in ('cat_0', 'cat_1'):
    dataset[f'{column}_enc'] = calc_smooth_mean(
        df1=dataset, df2=None, cat_name=column, target='y', weight=WEIGHT
    )

dataset

Unnamed: 0,cont_9,cat_0,cat_1,y,cat_0_enc,cat_1_enc
0,84.893915,dog,wolf,1,0.65,0.535714
1,97.146509,dog,wolf,0,0.65,0.535714
2,38.537691,dog,wolf,1,0.65,0.535714
3,95.448813,dog,wolf,1,0.65,0.535714
4,44.575836,dog,wolf,1,0.65,0.535714
5,66.972465,cat,wolf,1,0.35,0.535714
6,8.250005,cat,wolf,0,0.35,0.535714
7,89.709858,cat,wolf,0,0.35,0.535714
8,29.80035,cat,wolf,0,0.35,0.535714
9,26.230482,cat,tiger,0,0.35,0.416667
