# One Hot Encoding

In [1]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder

# for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [2]:

# Read only the three categorical predictors and the target from the Titanic file.
data = pd.read_csv(
    'titanic.csv',
    usecols=['sex', 'embarked', 'cabin', 'survived'],
)
data.head(5)

Unnamed: 0,survived,sex,cabin,embarked
0,1,female,B5,S
1,1,male,C22,S
2,0,female,C22,S
3,0,male,C22,S
4,0,female,C22,S


In [3]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['sex', 'embarked', 'cabin']],  # predictors
    data.loc[:, 'survived'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((916, 3), (393, 3))

In [4]:
# One-hot encode 'sex' and 'embarked' only; 'cabin' is not listed, so it is passed through.
ohe_enc = OneHotCategoricalEncoder(
    variables=['sex', 'embarked'],  # columns to encode; per the original note, leaving this blank selects all automatically
    top_categories=None,
    drop_last=True,  # True returns k-1 dummies, False returns k
)

ohe_enc.fit(X_train)

OneHotCategoricalEncoder(drop_last=True, top_categories=None,
                         variables=['sex', 'embarked'])

In [5]:
# Encode the training split with the fitted encoder and preview the result.
tmp = ohe_enc.transform(X_train)
tmp.head(5)

Unnamed: 0,cabin,sex_female,embarked_S,embarked_C,embarked_Q
501,,1,1,0,0
588,,1,1,0,0
402,,1,0,1,0
1193,,0,0,0,1
686,,1,0,0,1


In [6]:
# Encode the test split with the same fitted encoder and preview the result.
tmp = ohe_enc.transform(X_test)
tmp.head(5)

Unnamed: 0,cabin,sex_female,embarked_S,embarked_C,embarked_Q
1139,,0,1,0,0
533,,1,1,0,0
459,,0,1,0,0
1150,,0,1,0,0
393,,0,1,0,0


# One Hot Encoding Frequent Categories

### Dummy variables are created only for the most frequent categories. For example, we can take the 10 most frequent labels of a variable and create one binary variable for each of those 10 labels only.

In [10]:
# Switch to the house-price data: three categorical predictors plus the target.
data = pd.read_csv('houseprice.csv',
                   usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])

In [11]:
# 70/30 train/test split of the house-price predictors, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # predictors
    data.loc[:, 'SalePrice'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 3), (438, 3))

In [12]:
# Create dummies only for the 10 most frequent categories of each listed variable.
ohe_enc = OneHotCategoricalEncoder(
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'],  # columns to encode
    top_categories=10,  # raise or lower to keep more or fewer categories
    drop_last=False,
)

ohe_enc.fit(X_train)

OneHotCategoricalEncoder(drop_last=False, top_categories=10,
                         variables=['Neighborhood', 'Exterior1st',
                                    'Exterior2nd'])

In [13]:
# Replace both splits with their one-hot encoded versions, then preview the training set.
X_train, X_test = (ohe_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

Unnamed: 0,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NWAmes,Neighborhood_NridgHt,Neighborhood_SawyerW,...,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_BrkFace,Exterior2nd_AsbShng,Exterior2nd_Stucco
64,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
682,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1384,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1100,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Integer Encoding

In [14]:
# Re-split from the raw frame so this section starts from unencoded categories.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # predictors
    data.loc[:, 'SalePrice'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 3), (438, 3))

In [16]:
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [17]:
# Integer-encode the three variables using the 'arbitrary' method (no target involved).
ordinal_enc = OrdinalCategoricalEncoder(
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'],
    encoding_method='arbitrary',
)

ordinal_enc.fit(X_train)

OrdinalCategoricalEncoder(encoding_method='arbitrary',
                          variables=['Neighborhood', 'Exterior1st',
                                     'Exterior2nd'])

In [18]:
# Swap each category for its learned integer in both splits, then preview.
X_train, X_test = (ordinal_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,0,0,0
682,1,1,1
960,2,1,2
1384,3,2,3
1100,4,1,1


# Count or Frequency Encoding

In [19]:
# Re-split from the raw frame so this section starts from unencoded categories.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # predictors
    data.loc[:, 'SalePrice'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 3), (438, 3))

In [21]:
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder

In [22]:
# Replace each category by a count-based value learned from the training set.
count_enc = CountFrequencyCategoricalEncoder(
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'],
    encoding_method='count',  # use encoding_method='frequency' for frequencies instead
)

count_enc.fit(X_train)

CountFrequencyCategoricalEncoder(encoding_method='count',
                                 variables=['Neighborhood', 'Exterior1st',
                                            'Exterior2nd'])

In [23]:
# Apply the learned counts to both splits, then preview the training set.
X_train, X_test = (count_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,105,364,353
682,24,148,142
960,41,148,112
1384,71,21,29
1100,18,148,142


# Target Guided Ordered Integer Encoding

### For example, suppose we have the variable city with the values London, Manchester and Bristol. If the default rate is 30% in London, 20% in Bristol and 10% in Manchester, then we replace London by 1, Bristol by 2 and Manchester by 3.

In [29]:
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [25]:
# Re-split from the raw frame so this section starts from unencoded categories.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['Neighborhood', 'Exterior1st', 'Exterior2nd']],  # predictors
    data.loc[:, 'SalePrice'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 3), (438, 3))

In [26]:
# NOTE: encoding_method must be 'ordered' here; otherwise numbers are assigned arbitrarily.
ordinal_enc = OrdinalCategoricalEncoder(
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'],
    encoding_method='ordered',
)

In [27]:
# The ordered method is fitted with the target as well as the predictors.
ordinal_enc.fit(X_train, y=y_train)

OrdinalCategoricalEncoder(encoding_method='ordered',
                          variables=['Neighborhood', 'Exterior1st',
                                     'Exterior2nd'])

In [28]:
# Apply the target-guided integers to both splits, then preview.
X_train, X_test = (ordinal_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,16,11,13
682,17,5,7
960,4,5,8
1384,3,4,4
1100,8,5,7


# Mean Encoding or Target Encoding

### Mean encoding implies replacing the category by the average target value for that category. For example, if we have the variable city, with categories London, Manchester and Bristol, and we want to predict the default rate, if the default rate for London is 30% we replace London by 0.3, if the default rate for Manchester is 20% we replace Manchester by 0.2 and so on.

In [30]:
# Back to the Titanic data (binary target) for the target-based encoders below.
data = pd.read_csv('titanic.csv',
                   usecols=['cabin', 'sex', 'embarked', 'survived'])

data.head(5)

Unnamed: 0,survived,sex,cabin,embarked
0,1,female,B5,S
1,1,male,C22,S
2,0,female,C22,S
3,0,male,C22,S
4,0,female,C22,S


In [31]:
# Label missing embarkation ports explicitly so 'Missing' becomes its own category.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update `data` (it has no effect under pandas Copy-on-Write).
data['embarked'] = data['embarked'].fillna('Missing')

In [32]:
# Keep only the first character of each cabin value.
# NOTE(review): values are cast to str first, so missing cabins presumably
# collapse to a single letter as well — confirm for the pandas version in use.
data['cabin'] = data['cabin'].astype(str).str.get(0)

In [33]:
# 70/30 split of the cleaned Titanic predictors, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['cabin', 'sex', 'embarked']],  # predictors
    data.loc[:, 'survived'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((916, 3), (393, 3))

In [34]:
from feature_engine.categorical_encoders import MeanCategoricalEncoder

In [35]:
# Mean (target) encoder for the three categorical predictors.
mean_enc = MeanCategoricalEncoder(variables=['cabin', 'sex', 'embarked'])

In [36]:
# Fit on the training split together with its target.
mean_enc.fit(X_train, y=y_train)

MeanCategoricalEncoder(variables=['cabin', 'sex', 'embarked'])

In [37]:
# Replace categories by their learned target means in both splits, then preview.
X_train, X_test = (mean_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

Unnamed: 0,cabin,sex,embarked
501,0.304843,0.728358,0.338957
588,0.304843,0.728358,0.338957
402,0.304843,0.728358,0.553073
1193,0.304843,0.187608,0.373494
686,0.304843,0.728358,0.373494


# Probability Ratio Encoding

### This encoding is suitable for classification problems only, where the target is binary.

### For each category, we calculate the mean of target=1, that is the probability of the target being 1 ( P(1) ), and the probability of the target=0 ( P(0) ). And then, we calculate the ratio P(1)/P(0), and replace the categories by that ratio.

In [38]:
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder

In [39]:
# Re-split from the cleaned frame so this section starts from unencoded categories.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['cabin', 'sex', 'embarked']],  # predictors
    data.loc[:, 'survived'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((916, 3), (393, 3))

In [40]:
# Probability-ratio encoder: same class as the WoE encoder, selected via encoding_method.
ratio_enc = WoERatioCategoricalEncoder(
    variables=['cabin', 'sex', 'embarked'],
    encoding_method='ratio',
)

In [None]:
# Fit on the training split together with its binary target.
# NOTE(review): this cell has no execution count in the saved notebook — confirm it runs cleanly.
ratio_enc.fit(X_train, y=y_train)

In [None]:
# Apply the fitted ratio encoder to both splits, then preview the training set.
X_train, X_test = (ratio_enc.transform(split) for split in (X_train, X_test))
X_train.head(5)

# Weight of evidence

### WoE will be 0 if P(Goods) / P(Bads) = 1, that is, if the outcome is random for that group.
### If P(Bads) > P(Goods), the odds ratio will be < 1 and WoE will be < 0.
### If, on the other hand, P(Goods) > P(Bads), the odds ratio will be > 1 and WoE will be > 0.
### WoE is well suited for Logistic Regression, because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)). Therefore, by using WoE-coded predictors in logistic regression, the predictors are all prepared and coded to the same scale, and the parameters in the linear logistic regression equation can be directly compared.

In [42]:
# Re-split from the cleaned frame so this section starts from unencoded categories.
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['cabin', 'sex', 'embarked']],  # predictors
    data.loc[:, 'survived'],  # target
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((916, 3), (393, 3))

In [43]:
# Weight-of-evidence encoder for the three categorical predictors.
woe_enc = WoERatioCategoricalEncoder(
    variables=['cabin', 'sex', 'embarked'],
    encoding_method='woe',
)

In [None]:
# Fit on the training split together with its binary target.
# NOTE(review): this cell has no execution count in the saved notebook — confirm it runs cleanly.
woe_enc.fit(X_train, y=y_train)

In [None]:
# Transform with woe_enc (not ratio_enc) so this section shows
# weight-of-evidence values rather than probability ratios.
X_train = woe_enc.transform(X_train)
X_test = woe_enc.transform(X_test)
X_train.head()