In [1]:
%%time
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import missingno as msno
import pickle
%matplotlib inline

Wall time: 1.71 s


### One hot encoding
* Just like imputation, all methods of categorical encoding should be performed over the training set, and then propagated to the test set.
* Why?
    * Because these methods will "learn" patterns from the train data, and therefore you want to avoid leaking information and overfitting. But more importantly, because we don't know whether in future / live data, we will have all the categories present in the train data, or if there will be more or less categories. Therefore, we want to anticipate this uncertainty by setting the right processes right from the start. We want to create transformers that learn the categories from the train set, and used those learned categories to create the dummy variables in both train and test sets.

In [2]:
df = pd.read_csv('titanic.csv', usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].str[0] # capture only the first letter of the cabin
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,,S
1,1,female,C,C
2,1,female,,S
3,1,female,C,S
4,0,male,,S


In [3]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [4]:
X_train.nunique()

Sex         2
Cabin       8
Embarked    3
dtype: int64

In [5]:
X_test.nunique()

Sex         2
Cabin       7
Embarked    3
dtype: int64

__One hot encoding with pandas__

**pd.get_dummies can handle missing values (NAN)**

* Advantages
    * quick
    * returns pandas dataframe
    * returns feature names for the dummy variables
* Limitations of pandas:
    * it does not preserve information from train data to propagate to test data
    * In practice, we shouldn't be using get-dummies in our machine learning pipelines. It is however useful, for a quick data exploration.
    * The train set contains 13 dummy features, whereas the test set contains 12 features. This occurred because there was no category T in cabin in the test set. This will cause problems if training and scoring models with scikit-learn, because predictors require train and test sets to be of the same shape. 

In [6]:
pd.get_dummies(data=X_train, prefix="titanic", prefix_sep="-", 
               drop_first=False, 
               dummy_na=True # # we can add an additional dummy variable to indicate missing data
              ).head()

Unnamed: 0,titanic-female,titanic-male,titanic-nan,titanic-A,titanic-B,titanic-C,titanic-D,titanic-E,titanic-F,titanic-G,titanic-T,titanic-nan.1,titanic-C.1,titanic-Q,titanic-S,titanic-nan.2
231,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
836,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
639,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
389,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
597,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [7]:
pd.get_dummies(data=X_test, prefix="titanic", prefix_sep="-", 
               drop_first=False, 
               dummy_na=True # # we can add an additional dummy variable to indicate missing data
              ).head()

Unnamed: 0,titanic-female,titanic-male,titanic-nan,titanic-A,titanic-B,titanic-C,titanic-D,titanic-E,titanic-F,titanic-G,titanic-nan.1,titanic-C.1,titanic-Q,titanic-S,titanic-nan.2
421,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
618,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
116,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
310,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0
57,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0


__One hot encoding with Scikit-learn__
* [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

In [8]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer

features = X.columns

# preprocessor = ColumnTransformer(transformers=[
#     ('OneHotEncoder', 
#      OneHotEncoder(categories='auto', 
#                    drop=None, 
#                    sparse=False, 
#                    handle_unknown='ignore'  # helps deal with rare labels
#                   ),
#      features),
# ], remainder="drop")

preprocessor = make_column_transformer(
    (OneHotEncoder(categories='auto', 
                   drop=None, 
                   sparse=False, 
                   handle_unknown='ignore'  # helps deal with rare labels in the X_test
                  ),
     features), remainder="drop")

In [10]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [11]:
preprocessor

ColumnTransformer(transformers=[('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 Index(['Sex', 'Cabin', 'Embarked'], dtype='object'))])

In [12]:
preprocessor.named_transformers_

{'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse=False)}

In [13]:
preprocessor.named_transformers_["onehotencoder"]

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [14]:
preprocessor.named_transformers_["onehotencoder"].categories_

[array(['female', 'male'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', nan], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

In [15]:
preprocessor.named_transformers_["onehotencoder"].get_feature_names()

array(['x0_female', 'x0_male', 'x1_A', 'x1_B', 'x1_C', 'x1_D', 'x1_E',
       'x1_F', 'x1_G', 'x1_T', 'x1_nan', 'x2_C', 'x2_Q', 'x2_S', 'x2_nan'],
      dtype=object)

In [16]:
X_train

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [17]:
pd.DataFrame(data=X_train, columns=preprocessor.named_transformers_["onehotencoder"].get_feature_names()).head()

Unnamed: 0,x0_female,x0_male,x1_A,x1_B,x1_C,x1_D,x1_E,x1_F,x1_G,x1_T,x1_nan,x2_C,x2_Q,x2_S,x2_nan
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


__One hot encoding with feature_engine__
* quick
* returns dataframe
* returns feature names
* allows to select features to encode

[fe_OneHotEncoder](https://feature-engine.readthedocs.io/en/1.1.x/encoding/OneHotEncoder.html#api-reference)

In [18]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [19]:
from sklearn.impute import SimpleImputer
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

features = X.columns.tolist()

pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Missing"), # return a numpy instead of dataframe
    fe_OneHotEncoder(top_categories=None, drop_last=False, 
#                    variables=features # Specify features name. Since it numpy array from Simputer()
#                                       # and we want to encode all features, no need to use variables=features
                    )
)

In [20]:
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [21]:
pipeline

Pipeline(steps=[('simpleimputer',
                 SimpleImputer(fill_value='Missing', strategy='constant')),
                ('onehotencoder', OneHotEncoder())])

In [22]:
pipeline.named_steps

{'simpleimputer': SimpleImputer(fill_value='Missing', strategy='constant'),
 'onehotencoder': OneHotEncoder()}

In [23]:
pipeline.named_steps["onehotencoder"].encoder_dict_

{'0': ['male', 'female'],
 '1': ['Missing', 'F', 'E', 'C', 'A', 'B', 'D', 'G', 'T'],
 '2': ['S', 'C', 'Q', 'Missing']}

In [24]:
pipeline.named_steps["onehotencoder"].variables_

['0', '1', '2']

In [25]:
X_train.head()

Unnamed: 0,0_male,0_female,1_Missing,1_F,1_E,1_C,1_A,1_B,1_D,1_G,1_T,2_S,2_C,2_Q,2_Missing
0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
3,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
4,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [26]:
X_test.head()

Unnamed: 0,0_male,0_female,1_Missing,1_F,1_E,1_C,1_A,1_B,1_D,1_G,1_T,2_S,2_C,2_Q,2_Missing
0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
4,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0


__Allows to select features to encode__

In [27]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [28]:
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

features = X.columns.to_list()
enc = fe_OneHotEncoder(top_categories=None, drop_last=False, variables=features) # Allows to select features to encode

X_train = enc.fit_transform(X_train.fillna('Missing'))
X_test = enc.transform(X_test.fillna('Missing'))

In [29]:
enc

OneHotEncoder(variables=['Sex', 'Cabin', 'Embarked'])

In [30]:
enc.encoder_dict_

{'Sex': ['male', 'female'],
 'Cabin': ['Missing', 'F', 'E', 'C', 'A', 'B', 'D', 'G', 'T'],
 'Embarked': ['S', 'C', 'Q', 'Missing']}

In [31]:
enc.variables_

['Sex', 'Cabin', 'Embarked']

In [32]:
X_train.head()

Unnamed: 0,Sex_male,Sex_female,Cabin_Missing,Cabin_F,Cabin_E,Cabin_C,Cabin_A,Cabin_B,Cabin_D,Cabin_G,Cabin_T,Embarked_S,Embarked_C,Embarked_Q,Embarked_Missing
231,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
836,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
639,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0
389,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
597,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [33]:
X_test.head()

Unnamed: 0,Sex_male,Sex_female,Cabin_Missing,Cabin_F,Cabin_E,Cabin_C,Cabin_A,Cabin_B,Cabin_D,Cabin_G,Cabin_T,Embarked_S,Embarked_C,Embarked_Q,Embarked_Missing
421,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
618,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
116,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
310,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
57,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0


### One Hot Encoding of Frequent Categories
* Performing one hot encoding, only considering the most frequent categories. Often, categorical variables show a few dominating categories while the remaining labels add little information. Therefore, OHE of top categories is a simple and useful technique.

In [34]:
df = pd.read_csv('houseprice.csv', usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])
df.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [35]:
for col in df.columns:
    print(f"{col}:  {len(df[col].unique())} labels")

Neighborhood:  25 labels
Exterior1st:  15 labels
Exterior2nd:  16 labels
SalePrice:  663 labels


In [36]:
X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, 
#                                                     stratify=y # Not a classification problem
                                                   )

In [37]:
# It is important to select the top or most frequent categories based of the train data. Then, we will use those top 
# categories to encode the variables in the test data as well

from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

features = ['Neighborhood', 'Exterior1st', 'Exterior2nd']

# If the argument variables=None, then the encoder will automatically identify all categorical variables. Is that not sweet?
# The encoder will not encode numerical variables. So if some of your numerical variables are in fact categories, 
# you will need to re-cast them as object before using the encoder.

enc = fe_OneHotEncoder(top_categories=10,  # you can change this value to select more or less variables
                       drop_last=False, variables=features)

X_train = enc.fit_transform(X_train)
X_test = enc.transform(X_test)

In [38]:
enc

OneHotEncoder(top_categories=10,
              variables=['Neighborhood', 'Exterior1st', 'Exterior2nd'])

In [39]:
enc.encoder_dict_

{'Neighborhood': ['NAmes',
  'CollgCr',
  'OldTown',
  'Edwards',
  'Sawyer',
  'Somerst',
  'Gilbert',
  'NWAmes',
  'NridgHt',
  'SawyerW'],
 'Exterior1st': ['VinylSd',
  'HdBoard',
  'Wd Sdng',
  'MetalSd',
  'Plywood',
  'CemntBd',
  'BrkFace',
  'WdShing',
  'Stucco',
  'AsbShng'],
 'Exterior2nd': ['VinylSd',
  'Wd Sdng',
  'HdBoard',
  'MetalSd',
  'Plywood',
  'CmentBd',
  'Wd Shng',
  'BrkFace',
  'AsbShng',
  'Stucco']}

In [40]:
enc.variables_

['Neighborhood', 'Exterior1st', 'Exterior2nd']

In [41]:
X_train.head()

Unnamed: 0,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NWAmes,Neighborhood_NridgHt,Neighborhood_SawyerW,...,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_BrkFace,Exterior2nd_AsbShng,Exterior2nd_Stucco
64,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
682,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
960,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1384,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1100,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [42]:
X_test.head()

Unnamed: 0,Neighborhood_NAmes,Neighborhood_CollgCr,Neighborhood_OldTown,Neighborhood_Edwards,Neighborhood_Sawyer,Neighborhood_Somerst,Neighborhood_Gilbert,Neighborhood_NWAmes,Neighborhood_NridgHt,Neighborhood_SawyerW,...,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_HdBoard,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_CmentBd,Exterior2nd_Wd Shng,Exterior2nd_BrkFace,Exterior2nd_AsbShng,Exterior2nd_Stucco
529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
491,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Label / Ordinal("arbitrary", "ordered") / Integer encoding
* Integer encoding consist in replacing the categories by digits from 1 to n (or 0 to n-1, depending the implementation), where n is the number of distinct categories of the variable. The numbers are assigned arbitrarily. This encoding method allows for quick benchmarking of machine learning models. Integer encoding is better suited for non-linear methods which are able to navigate through the arbitrarily assigned digits to try and find patters that relate them to the target.

In [43]:
df = pd.read_csv('titanic.csv', usecols=[item.title() for item in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].str[0] # capture only the first letter of the cabin
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,,S
1,1,female,C,C
2,1,female,,S
3,1,female,C,S
4,0,male,,S


In [44]:
df.isna().mean()

Survived    0.000000
Sex         0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

In [45]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [46]:
# Must fill "Missing" before LabelEncoder
X_train = X_train.fillna('Missing')
X_test = X_test.fillna('Missing')

In [47]:
# LabelEncoder for y
# Unfortunately, the LabelEncoder from sklearn works one variable at the time.

from sklearn.preprocessing import LabelEncoder

le_y = LabelEncoder()
y_train = le_y.fit_transform(y_train)
y_test = le_y.transform(y_test)

In [48]:
le_y.classes_

array([0, 1], dtype=int64)

[OrdinalEncoder vs LabelEncoder](https://datascience.stackexchange.com/questions/39317/difference-between-ordinalencoder-and-labelencoder)
* OrdinalEncoder is for 2D data with the shape (n_samples, n_features)
* LabelEncoder is for 1D data with the shape (n_samples,)

In [49]:
# OrdinalEncoder for X

from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder() # only accept input as dataframe
X_train["Sex"] = oe.fit_transform(X_train["Sex"].to_frame())
X_test["Sex"] = oe.transform(X_test["Sex"].to_frame())

# fe_enc_arbitrary for X, fe_enc_ordered for X

from feature_engine.encoding import OrdinalEncoder as fe_OrdinalEncoder

fe_enc_arbitrary = fe_OrdinalEncoder(encoding_method='arbitrary', variables=['Cabin'])
X_train = fe_enc_arbitrary.fit_transform(X=X_train, y=None) # y=None
X_test = fe_enc_arbitrary.transform(X=X_test)

fe_enc_ordered = fe_OrdinalEncoder(encoding_method='ordered', variables=['Embarked'])
# monotonic relationship between the features and the target.
X_train = fe_enc_ordered.fit_transform(X=X_train, y=y_train) # y=y_train
X_test = fe_enc_ordered.transform(X=X_test)

In [50]:
X_train.head()

Unnamed: 0,Sex,Cabin,Embarked
231,1.0,0,2
836,1.0,0,2
639,1.0,0,2
389,0.0,0,1
597,1.0,0,2


In [51]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
421,1.0,0,3
618,0.0,1,2
116,1.0,0,3
310,0.0,3,1
57,1.0,0,1


In [52]:
oe.categories_

[array(['female', 'male'], dtype=object)]

In [53]:
fe_enc_arbitrary.encoder_dict_

{'Cabin': {'Missing': 0,
  'F': 1,
  'E': 2,
  'C': 3,
  'A': 4,
  'B': 5,
  'D': 6,
  'G': 7,
  'T': 8}}

In [54]:
fe_enc_arbitrary.variables_

['Cabin']

In [55]:
fe_enc_ordered.encoder_dict_

{'Embarked': {'Missing': 0, 'C': 1, 'S': 2, 'Q': 3}}

In [56]:
fe_enc_ordered.variables_

['Embarked']

### Count_or_frequency_encoding
* In count encoding we replace the categories by the count of the observations that show that category in the dataset. Similarly, we can replace the category by the frequency or percentage of observations in the dataset.

In [57]:
df = pd.read_csv('houseprice.csv', usecols=['Neighborhood', 'Exterior1st', 'Exterior2nd', 'SalePrice'])
df.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd,SalePrice
0,CollgCr,VinylSd,VinylSd,208500
1,Veenker,MetalSd,MetalSd,181500
2,CollgCr,VinylSd,VinylSd,223500
3,Crawfor,Wd Sdng,Wd Shng,140000
4,NoRidge,VinylSd,VinylSd,250000


In [58]:
# When doing count transformation of categorical variables, it is important to calculate the count 
# (or frequency = count / total observations) over the training set, and then use those numbers to 
# replace the labels in the test set.

X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, 
#                                                     stratify=y # Not a classification problem
                                                   )

In [59]:
from feature_engine.encoding import CountFrequencyEncoder

frequency_enc = CountFrequencyEncoder(
    encoding_method='frequency', # to do count ==> encoding_method='count'
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd']
)

X_train = frequency_enc.fit_transform(X_train)
X_test = frequency_enc.transform(X_test)

In [60]:
X_train.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
64,0.10274,0.356164,0.345401
682,0.023483,0.144814,0.138943
960,0.040117,0.144814,0.109589
1384,0.069472,0.020548,0.028376
1100,0.017613,0.144814,0.138943


In [61]:
X_test.head()

Unnamed: 0,Neighborhood,Exterior1st,Exterior2nd
529,0.034247,0.144814,0.003914
491,0.14775,0.144814,0.138943
459,0.040117,0.135029,0.133072
279,0.023483,0.084149,0.109589
655,0.009785,0.149706,0.007828


In [62]:
frequency_enc.encoder_dict_

{'Neighborhood': {'NAmes': 0.14774951076320939,
  'CollgCr': 0.10273972602739725,
  'OldTown': 0.07142857142857142,
  'Edwards': 0.06947162426614481,
  'Sawyer': 0.05968688845401174,
  'Somerst': 0.0547945205479452,
  'Gilbert': 0.053816046966731895,
  'NWAmes': 0.049902152641878667,
  'NridgHt': 0.049902152641878667,
  'SawyerW': 0.04403131115459882,
  'BrkSide': 0.040117416829745595,
  'Mitchel': 0.03522504892367906,
  'Crawfor': 0.03424657534246575,
  'Timber': 0.029354207436399216,
  'NoRidge': 0.029354207436399216,
  'ClearCr': 0.023483365949119372,
  'IDOTRR': 0.023483365949119372,
  'SWISU': 0.01761252446183953,
  'StoneBr': 0.015655577299412915,
  'Blmngtn': 0.011741682974559686,
  'MeadowV': 0.011741682974559686,
  'BrDale': 0.009784735812133072,
  'NPkVill': 0.00684931506849315,
  'Veenker': 0.005870841487279843,
  'Blueste': 0.0019569471624266144},
 'Exterior1st': {'VinylSd': 0.3561643835616438,
  'HdBoard': 0.149706457925636,
  'Wd Sdng': 0.14481409001956946,
  'MetalSd': 0

In [63]:
frequency_enc.variables_

['Neighborhood', 'Exterior1st', 'Exterior2nd']

### Mean or target encoding
* Create a monotonic relationship between the variable and the target, therefore suitable for linear models

[fe_MeanEncoder](https://feature-engine.readthedocs.io/en/1.1.x/encoding/MeanEncoder.html)

In [64]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0]
df["Embarked"] = df["Embarked"].fillna(value="Missing")
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [65]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [66]:
# from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(strategy="constant", fill_value="Missing") # return a numpy instead of dataframe
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)

from feature_engine.encoding import MeanEncoder as fe_MeanEncoder

mean_enc = fe_MeanEncoder(variables=['Cabin', 'Sex', 'Embarked']) # make sure no NaN in Features
X_train = mean_enc.fit_transform(X_train, y_train)
X_test = mean_enc.transform(X_test)

In [67]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
421,0.190594,0.3,0.36
618,0.739726,0.666667,0.347253
116,0.190594,0.3,0.36
310,0.739726,0.581395,0.525862
57,0.190594,0.3,0.525862


In [68]:
mean_enc.encoder_dict_

{'Cabin': {'A': 0.25,
  'B': 0.75,
  'C': 0.5813953488372093,
  'D': 0.68,
  'E': 0.8095238095238095,
  'F': 0.6666666666666666,
  'G': 0.3333333333333333,
  'T': 0.0,
  'n': 0.3},
 'Sex': {'female': 0.7397260273972602, 'male': 0.1905940594059406},
 'Embarked': {'C': 0.5258620689655172,
  'Missing': 1.0,
  'Q': 0.36,
  'S': 0.34725274725274724}}

In [69]:
mean_enc.variables_

['Cabin', 'Sex', 'Embarked']

[TargetEncoder](https://contrib.scikit-learn.org/category_encoders/targetencoder.html)

In [70]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0]
# df["Embarked"] = df["Embarked"].fillna(value="Missing")
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [71]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [72]:
from category_encoders.target_encoder import TargetEncoder

mean_enc = TargetEncoder(handle_missing="value", handle_unknown="value") # can handle NaN in Features
X_train = mean_enc.fit_transform(X_train, y_train)
X_test = mean_enc.transform(X_test)

In [73]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
421,0.190594,0.3,0.36
618,0.739726,0.664772,0.347253
116,0.190594,0.3,0.36
310,0.739726,0.581395,0.525862
57,0.190594,0.3,0.525862


In [74]:
mean_enc.get_feature_names()

['Sex', 'Cabin', 'Embarked']

In [75]:
mean_enc.get_params()

{'cols': ['Sex', 'Cabin', 'Embarked'],
 'drop_invariant': False,
 'handle_missing': 'value',
 'handle_unknown': 'value',
 'min_samples_leaf': 1,
 'return_df': True,
 'smoothing': 1.0,
 'verbose': 0}

### Probability Ratio Encoding
* These encoding is suitable for **binary classification** problems only.. 
* If p(0) = 0 or p(1) = 0 for the ratio encoder, the encoder will return an error.

In [76]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0]
df["Embarked"] = df["Embarked"].fillna(value="Missing")
# df.dropna(subset=['Embarked'], inplace=True)
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [77]:
# If p(0) = 0 or p(1) = 0 for the ratio encoder, the encoder will return an error.
df.iloc[829, 0] = 0
df[df["Embarked"] == "Missing"]

Unnamed: 0,Survived,Sex,Cabin,Embarked
61,1,female,B,Missing
829,0,female,B,Missing


In [78]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [79]:
from feature_engine.encoding import PRatioEncoder

ratio_enc = PRatioEncoder(encoding_method='ratio', 
                          variables=['Cabin', 'Sex', 'Embarked']) # make sure no NaN in Features

X_train = ratio_enc.fit_transform(X_train, y_train)
X_test = ratio_enc.transform(X_test)

In [80]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
126,0.233129,0.426901,0.529412
618,2.745763,1.333333,0.528814
116,0.233129,0.426901,0.529412
310,2.745763,1.625,1.145455
420,0.233129,0.426901,1.145455


In [81]:
ratio_enc.encoder_dict_

{'Cabin': {'A': 0.3333333333333333,
  'B': 4.000000000000001,
  'C': 1.6250000000000002,
  'D': 2.2857142857142856,
  'E': 4.000000000000001,
  'F': 1.3333333333333333,
  'G': 0.49999999999999994,
  'T': 0.0,
  'n': 0.42690058479532167},
 'Sex': {'female': 2.745762711864406, 'male': 0.23312883435582823},
 'Embarked': {'C': 1.1454545454545455,
  'Missing': 1.0,
  'Q': 0.5294117647058824,
  'S': 0.5288135593220339}}

In [82]:
ratio_enc.variables_

['Cabin', 'Sex', 'Embarked']

### Weight of Evidence
[fe_WoEEncoder](https://feature-engine.readthedocs.io/en/1.1.x/encoding/WoEEncoder.html)

In [83]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0] 
# if any of the terms in the weight of evidence calculation is 0, 
# the log of 0 is not defined, so this transformer will raise an error.
df = df[df['Cabin'] != 'T']
df.dropna(subset=['Embarked'], inplace=True)
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [84]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [85]:
from feature_engine.encoding import WoEEncoder as fe_WoEEncoder

woe_enc = fe_WoEEncoder(variables=['Cabin', 'Sex', 'Embarked']) # make sure no NaN in Features
X_train = woe_enc.fit_transform(X_train, y_train)
X_test = woe_enc.transform(X_test)

In [86]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
285,-0.941302,-0.353886,0.723172
735,-0.941302,-0.353886,-0.241234
200,-0.941302,-0.353886,-0.241234
570,-0.941302,-0.353886,-0.241234
786,1.568067,-0.353886,-0.241234


In [87]:
woe_enc.encoder_dict_

{'Cabin': {'A': 0.4757643155091712,
  'B': 1.209733490589372,
  'C': 0.698907866823381,
  'D': 1.7285272840045396,
  'E': 1.8620586766290619,
  'F': 0.8812294236173356,
  'G': 0.4757643155091712,
  'n': -0.353886028064454},
 'Sex': {'female': 1.5680674349840162, 'male': -0.9413017042774728},
 'Embarked': {'C': 0.7231724888230124,
  'Q': 0.10420075907668835,
  'S': -0.24123408087295403}}

In [88]:
woe_enc.variables_

['Cabin', 'Sex', 'Embarked']

[WOEEncoder](https://contrib.scikit-learn.org/category_encoders/woe.html)

In [89]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0] 
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [90]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [91]:
from category_encoders.woe import WOEEncoder


woe_enc = WOEEncoder(cols=['Cabin', 'Sex', 'Embarked']) # can handle NaN in Features
X_train = woe_enc.fit_transform(X_train, y_train)
X_test = woe_enc.transform(X_test)

In [92]:
X_test.head()

Unnamed: 0,Sex,Cabin,Embarked
421,-0.965264,-0.372309,-0.081028
618,1.504348,0.981866,-0.157149
116,-0.965264,-0.372309,-0.081028
310,1.504348,0.784698,0.572823
57,-0.965264,-0.372309,0.572823


In [93]:
woe_enc.get_feature_names()

['Sex', 'Cabin', 'Embarked']

In [94]:
woe_enc.get_params()

{'cols': ['Cabin', 'Sex', 'Embarked'],
 'drop_invariant': False,
 'handle_missing': 'value',
 'handle_unknown': 'value',
 'random_state': None,
 'randomized': False,
 'regularization': 1.0,
 'return_df': True,
 'sigma': 0.05,
 'verbose': 0}

In [95]:
woe_enc.mapping

{'Cabin': Cabin
  1   -0.372309
  2    0.981866
  3    1.751974
  4    0.784698
  5   -0.222107
  6    1.507132
  7    1.164188
  8    0.065575
  9    0.000000
 -1    0.000000
 -2    0.000000
 dtype: float64,
 'Sex': Sex
  1   -0.965264
  2    1.504348
 -1    0.000000
 -2    0.000000
 dtype: float64,
 'Embarked': Embarked
  1   -0.157149
  2    0.572823
  3   -0.081028
  4    1.569653
 -1    0.000000
 -2    0.000000
 dtype: float64}

### Rare Label Encoding
[fe_RareLabelEncoder](https://feature-engine.readthedocs.io/en/1.1.x/encoding/RareLabelEncoder.html)

In [96]:
df = pd.read_csv('houseprice.csv')

X = df.drop(columns=["SalePrice"])
y = df["SalePrice"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, 
#                                                     stratify=y # Not a classification problem
                                                   )

In [97]:
from feature_engine.encoding import RareLabelEncoder as fe_RareLabelEncoder

rare_enc = fe_RareLabelEncoder( # make sure no NaN in Features
    tol=0.05,  # minimal percentage to be considered non-rare
    n_categories=4, # minimal number of categories the variable should have to re-cgroup rare categories
    variables=['Neighborhood', 'Exterior1st', 'Exterior2nd',
               'MasVnrType', 'ExterQual', 'BsmtCond'] # variables to re-group
)

X_train = rare_enc.fit_transform(X_train.fillna("Missing"))
X_test = rare_enc.transform(X_test.fillna("Missing"))

In [98]:
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
529,530,20,RL,Missing,32668,Pave,Missing,IR1,Lvl,AllPub,...,0,0,Missing,Missing,Missing,0,3,2007,WD,Alloca
491,492,50,RL,79.0,9490,Pave,Missing,Reg,Lvl,AllPub,...,0,0,Missing,MnPrv,Missing,0,8,2006,WD,Normal
459,460,50,RL,Missing,7015,Pave,Missing,IR1,Bnk,AllPub,...,0,0,Missing,Missing,Missing,0,7,2009,WD,Normal
279,280,60,RL,83.0,10005,Pave,Missing,Reg,Lvl,AllPub,...,0,0,Missing,Missing,Missing,0,3,2008,WD,Normal
655,656,160,RM,21.0,1680,Pave,Missing,Reg,Lvl,AllPub,...,0,0,Missing,Missing,Missing,0,3,2010,WD,Family


In [99]:
rare_enc.variables_

['Neighborhood',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'BsmtCond']

In [100]:
rare_enc.encoder_dict_

{'Neighborhood': Index(['NAmes', 'CollgCr', 'OldTown', 'Edwards', 'Sawyer', 'Somerst',
        'Gilbert'],
       dtype='object'),
 'Exterior1st': Index(['VinylSd', 'HdBoard', 'Wd Sdng', 'MetalSd', 'Plywood'], dtype='object'),
 'Exterior2nd': Index(['VinylSd', 'Wd Sdng', 'HdBoard', 'MetalSd', 'Plywood'], dtype='object'),
 'MasVnrType': Index(['None', 'BrkFace', 'Stone'], dtype='object'),
 'ExterQual': array(['TA', 'Gd', 'Ex', 'Fa'], dtype=object),
 'BsmtCond': Index(['TA'], dtype='object')}

### Binary Encoding (Not often in the industry)
[BinaryEncoder](https://contrib.scikit-learn.org/category_encoders/binary.html)

In [101]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0] 
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [102]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [103]:
from category_encoders.binary import BinaryEncoder

bin_enc = BinaryEncoder()
X_train = bin_enc.fit_transform(X_train)
X_test = bin_enc.transform(X_test)

In [104]:
X_test.head()

Unnamed: 0,Sex_0,Sex_1,Cabin_0,Cabin_1,Cabin_2,Cabin_3,Cabin_4,Embarked_0,Embarked_1,Embarked_2
421,0,1,0,0,0,0,1,0,1,1
618,1,0,0,0,0,1,0,0,0,1
116,0,1,0,0,0,0,1,0,1,1
310,1,0,0,0,1,0,0,0,1,0
57,0,1,0,0,0,0,1,0,1,0


In [105]:
bin_enc.get_feature_names()

['Sex_0',
 'Sex_1',
 'Cabin_0',
 'Cabin_1',
 'Cabin_2',
 'Cabin_3',
 'Cabin_4',
 'Embarked_0',
 'Embarked_1',
 'Embarked_2']

In [106]:
bin_enc.get_params()

{'cols': None,
 'drop_invariant': False,
 'handle_missing': 'value',
 'handle_unknown': 'value',
 'mapping': None,
 'return_df': True,
 'verbose': 0}

### Hashing Encoding (Not often in the industry)
[HashingEncoder](https://contrib.scikit-learn.org/category_encoders/hashing.html)

In [107]:
df = pd.read_csv('titanic.csv',  usecols=[col.title() for col in ['sex', 'embarked', 'cabin', 'survived']])
df['Cabin'] = df['Cabin'].astype(str).str[0] 
df.head()

Unnamed: 0,Survived,Sex,Cabin,Embarked
0,0,male,n,S
1,1,female,C,C
2,1,female,n,S
3,1,female,C,S
4,0,male,n,S


In [108]:
X = df.drop(labels=["Survived"], axis=1)
y = df["Survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True, stratify=y)

In [109]:
%%time
from category_encoders.hashing import HashingEncoder

hash_enc = HashingEncoder()
X_train = hash_enc.fit_transform(X_train)
X_test = hash_enc.transform(X_test)

Wall time: 13.4 s


In [110]:
X_test.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,1,0,0,0,1,1,0
1,0,0,1,0,0,1,1,0
2,0,1,0,0,0,1,1,0
3,0,0,0,0,0,1,0,2
4,0,1,0,0,0,1,0,1


In [111]:
hash_enc.get_feature_names()

['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7']

In [112]:
hash_enc.get_params()

{'cols': ['Sex', 'Cabin', 'Embarked'],
 'drop_invariant': False,
 'hash_method': 'md5',
 'max_process': 4,
 'max_sample': 67,
 'n_components': 8,
 'return_df': True,
 'verbose': 0}