In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder 
from dirty_cat import SimilarityEncoder, TargetEncoder, GapEncoder, SuperVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

In [2]:
# Load the training set. A forward-slash path resolves on Windows, macOS and
# Linux alike, whereas a backslash-separated path only resolves on Windows.
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## HistGradientBoostingClassifier with categorical features

**Prepare categorical features**

In [3]:
# Names of the categorical columns used as model inputs.
categorical_features = [
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked',
]

**Prepare X (features) and y (targets)**

In [4]:
# Feature matrix (categorical columns only) and target vector (survival label).
X, y = train[categorical_features], train['Survived']

**Check missing values**

In [5]:
# Count missing entries per feature column.
X.isnull().sum()

Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

**Prepare a dict of encoders for looping**

In [6]:
# Candidate encoders, keyed by the label used when printing results.
# Insertion order determines the order in which they are evaluated.
encoders = {
    'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
    'similarity': SimilarityEncoder(similarity='ngram'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'gap': GapEncoder(n_components=30, random_state=5),
}

encoders

{'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'similarity': SimilarityEncoder(),
 'target': TargetEncoder(handle_unknown='ignore'),
 'gap': GapEncoder(n_components=30, random_state=5)}

**We now loop over the different encoding methods, build a new Pipeline each time, cross-validate it with 5 folds and report the mean accuracy.**

In [7]:
# One-hot encoding for "Sex" and "Embarked" (the same encoder in every run)
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=float)

# Mean 5-fold accuracy per encoder name, kept so the results remain
# available after the loop instead of being overwritten on each iteration.
cv_accuracy = {}

for name, method in encoders.items():

    # Only the encoder applied to the dirty columns changes between runs.
    encoder = make_column_transformer(
        (ohe, ['Sex', 'Embarked']),
        (method, ['Cabin', 'Ticket']))  # "Cabin" and "Ticket" are dirty categories

    hist = HistGradientBoostingClassifier()

    pipe = make_pipeline(encoder, hist)

    # `scores` holds the mean accuracy over the 5 folds (a single float).
    scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy', n_jobs=-1).mean()
    cv_accuracy[name] = scores

    print(f'{name.title()} Encoding (accuracy = {scores:.3f})')

One-Hot Encoding (accuracy = 0.780)
Similarity Encoding (accuracy = 0.811)
Target Encoding (accuracy = 0.786)
Gap Encoding (accuracy = 0.837)


- The best encoder for "Cabin" and "Ticket" is **gap encoding**

**Combine encoders**

In [8]:
# Encoders: gap encoding for the dirty columns, one-hot for the others.
gap_enc = GapEncoder(n_components=30, random_state=5)
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=float)

# Route each group of columns to its encoder.
ct = make_column_transformer(
    (ohe, ['Sex', 'Embarked']),
    (gap_enc, ['Cabin', 'Ticket']),
)

# Classifier with default hyper-parameters.
hist = HistGradientBoostingClassifier()

# Chain the encoding step and the classifier.
pipe = make_pipeline(ct, hist)

In [9]:
# Per-fold accuracies of the current `pipe`; the cell displays their mean.
fold_accuracy = cross_val_score(pipe, X, y, scoring='accuracy', cv=5, n_jobs=-1)
fold_accuracy.mean()

0.8372606867114432

#### SuperVectorizer: automatic vectorization

In [10]:
# Automatic vectorization: no per-column encoder is specified here.
super_vect = SuperVectorizer(cardinality_threshold=40, auto_cast=True)

# Reuses the `hist` classifier defined in an earlier cell and rebinds `pipe`.
pipe = make_pipeline(super_vect, hist)

# Mean accuracy over 5 folds.
cross_val_score(pipe, X, y, scoring='accuracy', cv=5, n_jobs=-1).mean()

0.8383717280773334

- Awesome dirty_cat: https://dirty-cat.github.io/stable/