# 6. FEATURE ENGINEERING

# 6.2. CATEGORICAL ENCODINGS

In [2]:
# SUITE DU CHAPITRE 6.1. BASELINE MODEL

# 6.2.1. COURS

In [None]:
# Train a model on the baseline data (no additional encodings yet).
# NOTE(review): `data` is not defined anywhere in the visible part of this
# notebook; presumably it comes from chapter 6.1 (baseline model) - confirm.
train, valid, test = get_data_splits(data)
train_model(train, valid)
# Validation AUC score: 0.7467

# 6.2.1.1. CATEGORICAL COUNT ENCODING

In [None]:
# Count encoding replaces each categorical value with the number of times 
# it appears in the dataset. For example, if the value "GB" occurred 10 
# times in the country feature, then each "GB" would be replaced with the 
# number 10.

In [None]:
# We'll use the category_encoders package to get this encoding. 
# The encoder itself is available as CountEncoder. This encoder and the 
# others in category_encoders work like scikit-learn transformers 
# with .fit and .transform methods.

In [None]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder
count_enc = ce.CountEncoder()

# Replace each categorical value by the number of times it appears.
# NOTE(review): `ks` is not defined in the visible part of this notebook;
# presumably it is the raw dataframe from chapter 6.1 - confirm.
# NOTE(review): the counts are learned from every row, including the rows
# that later end up in the validation/test splits - confirm this is intended.
count_encoded = count_enc.fit_transform(ks[cat_features])

# Join the encoded columns with the _count suffix.
# Any *_count columns left over from a previous execution are dropped first,
# so re-running this cell does not fail on overlapping column names.
data = (
    data
    .drop(columns=[f"{col}_count" for col in count_encoded.columns],
          errors="ignore")
    .join(count_encoded.add_suffix("_count"))
)

# Train a model 
train, valid, test = get_data_splits(data)
train_model(train, valid)
# Validation AUC score: 0.7486
# Adding the count encoding features increases the validation score from 
# 0.7467 to 0.7486, only a slight improvement.

# 6.2.1.2. TARGET ENCODING

In [4]:
# Target encoding replaces a categorical value with the average value of 
# the target for that value of the feature. 
# For example, given the country value "CA", you'd calculate the average 
# outcome for all the rows with country == 'CA', around 0.28.
# This is often blended with the target probability over the entire dataset
# to reduce the variance of values with few occurrences.

In [None]:
# This technique uses the targets to create new features. 
# So including the validation or test data in the target encodings would 
# be a form of target leakage. 
# Instead, you should learn the target encodings from the training dataset 
# only and apply it to the other datasets.

In [None]:
# The category_encoders package provides TargetEncoder for target encoding.
# The implementation is similar to CountEncoder.

In [None]:
# Build the target encoder and learn the encodings from the training split only
target_enc = ce.TargetEncoder(cols=cat_features).fit(train[cat_features],
                                                     train['outcome'])

# Encode each split, suffix the new columns with _target and join them back
train_TE, valid_TE = (
    split.join(target_enc.transform(split[cat_features]).add_suffix('_target'))
    for split in (train, valid)
)

# Train a model on the target-encoded splits
train_model(train_TE, valid_TE)

# Validation AUC score: 0.7491
# The validation score is higher again, from 0.7467 to 0.7491.

# 6.2.1.3. CATBOOST ENCODING

In [None]:
# Finally, we'll look at CatBoost encoding. 
# This is similar to target encoding in that it's based on the target 
# probability for a given value. 
# However with CatBoost, for each row, the target probability is calculated
# only from the rows before it.

In [None]:
# Build the CatBoost encoder and learn the encodings from the training split.
# NOTE(review): this rebinds `target_enc`, so the TargetEncoder created in the
# target-encoding section is no longer reachable under that name.
target_enc = ce.CatBoostEncoder(cols=cat_features).fit(train[cat_features],
                                                       train['outcome'])

# Encode each split, suffix the new columns with _cb and join them back
train_CBE, valid_CBE = (
    split.join(target_enc.transform(split[cat_features]).add_suffix('_cb'))
    for split in (train, valid)
)

# Train a model on the CatBoost-encoded splits
train_model(train_CBE, valid_CBE)

# Validation AUC score: 0.7492
# This does slightly better than target encoding.

# 6.2.2. EXERCICES

In [1]:
# Chargement des librairies
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

In [2]:
# Load the dataset (parquet file).
# The data directory can be overridden through the CLICKS_DATA_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local directory is used.
import os

clicks_path = os.environ.get(
    'CLICKS_DATA_DIR',
    'C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_6-FEATURE_ENGINEERING/feature-engineering-data/input/',
)
# os.path.join tolerates a directory given with or without a trailing separator
clicks = pd.read_parquet(os.path.join(clicks_path, 'baseline_data.pqt'))

In [3]:
# Fonction pour spliter le dataset en TRAIN SET, VALID SET ET TEST SET
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split a dataframe into chronological train, validation and test sets.

    The rows are first ordered by the 'click_time' column. The last
    ``valid_fraction`` of the rows becomes the test set, the slice of the
    same size just before it becomes the validation set, and everything
    earlier is the training set.

    Args:
        dataframe: pandas DataFrame containing a 'click_time' column.
        valid_fraction: fraction of the rows used for the validation set and,
            separately, for the test set (so 2 * valid_fraction is held out).

    Returns:
        A ``(train, valid, test)`` tuple of DataFrames.

    Raises:
        ValueError: if ``valid_fraction`` is negative.
    """
    if valid_fraction < 0:
        raise ValueError(
            f"valid_fraction must be non-negative, got {valid_fraction}")

    dataframe = dataframe.sort_values('click_time')
    n_rows = len(dataframe)
    valid_rows = int(n_rows * valid_fraction)

    # Use explicit, clamped start positions instead of negative slicing:
    # with valid_rows == 0, `[:-0]` is `[:0]` (empty train set) and `[-0:]`
    # is the whole frame (everything would land in the test set).
    test_start = max(n_rows - valid_rows, 0)
    valid_start = max(n_rows - 2 * valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]

    return train, valid, test

In [4]:
# Fonction pour entraîner et scorer le modèle

def train_model(train, valid, test=None, feature_cols=None):
    """Fit a LightGBM binary classifier and report its validation AUC.

    Args:
        train: DataFrame used for fitting; must contain 'is_attributed'.
        valid: DataFrame used for early stopping and for the printed score.
        test: optional DataFrame; when given, its AUC is computed as well.
        feature_cols: columns used as features. Defaults to every column of
            ``train`` except 'click_time', 'attributed_time' and
            'is_attributed'.

    Returns:
        ``(booster, valid_score)`` when ``test`` is None, otherwise
        ``(booster, valid_score, test_score)``.
    """
    label_col = 'is_attributed'

    if feature_cols is None:
        # Default feature set: everything but the timestamps and the label
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           label_col])

    train_set = lgb.Dataset(train[feature_cols], label=train[label_col])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid[label_col])

    booster_params = dict(num_leaves=64,
                          objective='binary',
                          metric='auc',
                          seed=7)

    # NOTE(review): newer LightGBM releases may no longer accept the
    # early_stopping_rounds / verbose_eval keyword arguments - confirm
    # against the installed version.
    bst = lgb.train(
        booster_params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    valid_score = metrics.roc_auc_score(valid[label_col],
                                        bst.predict(valid[feature_cols]))
    print(f"Validation AUC score: {valid_score}")

    # Without a test set there is nothing more to evaluate
    if test is None:
        return bst, valid_score

    test_score = metrics.roc_auc_score(test[label_col],
                                       bst.predict(test[feature_cols]))
    return bst, valid_score, test_score

In [5]:
# Baseline score: split the clicks data and train on the unmodified features.
# The return value is assigned to `_` so the cell shows only the printed score.
print("Baseline model")
train, valid, test = get_data_splits(clicks)
_ = train_model(train, valid)

Baseline model
Validation AUC score: 0.9622743228943659


## 6.2.2.1. CATEGORICAL ENCODING AND LEAKAGE

In [None]:
# Ces encodages sont tous basés sur des statistiques calculées à partir de 
# l'ensemble de données comme les nombres et les moyennes.
# Compte tenu de cela, quelles données devez-vous utiliser pour calculer 
# les encodages? 
# Plus précisément, pouvez-vous utiliser les données de validation? 
# Pouvez-vous utiliser les données de test?

In [None]:
# Vous ne devez calculer les encodages qu'à partir du train set. 
# Si vous incluez les données du val set et du test set dans les encodages, 
# vous surestimerez les performances du modèle. 
# Vous devez en général être vigilant pour éviter les fuites, c'est-à-dire 
# inclure toute information provenant du valid set et du test set dans le
# modèle.

## 6.2.2.2. COUNT ENCODINGS

In [12]:
import category_encoders as ce

# Categorical columns to encode in the exercises below
cat_features = ['ip', 'app', 'device', 'os', 'channel']
# Rebuild the train/validation/test splits (ordered by click_time) from the
# clicks data
train, valid, test = get_data_splits(clicks)

In [13]:
# Using CountEncoder from the category_encoders library, fit the encoding
# using the categorical feature columns defined in cat_features.

# Create the count encoder, restricted to the columns listed in cat_features
count_enc = ce.CountEncoder(cols=cat_features)

In [15]:
# Fit the count encoder (this is not the model training step).
# The encoding is learned from the training split only.
count_enc.fit(train[cat_features])

  elif pd.api.types.is_categorical(cols):


CountEncoder(cols=['ip', 'app', 'device', 'os', 'channel'],
             combine_min_nan_groups=True)

In [16]:
# Then apply the encodings to the train and validation sets 

# Both splits are encoded with the encoder fitted on the training split
train_transf_count_enc, valid_transf_count_enc = (
    count_enc.transform(split[cat_features]) for split in (train, valid)
)

In [17]:
# Add the encoded values as new columns on the train and validation sets;
# every new column name gets the `_count` suffix.
train_encoded, valid_encoded = (
    split.join(encoded.add_suffix('_count'))
    for split, encoded in zip((train, valid),
                              (train_transf_count_enc, valid_transf_count_enc))
)

In [19]:
# Train the model on the count-encoded datasets
# This can take around 30 seconds to complete
_ = train_model(train_encoded, valid_encoded)
# Baseline            : Validation AUC score: 0.9622743228943659
# With count encoding : Validation AUC score: 0.9653051135205329
# Count encoding improves on the baseline score.

Validation AUC score: 0.9653051135205329


In [None]:
# À première vue, il peut être surprenant que l'encodage de comptage aide 
# à créer des modèles précis. 
# Pourquoi pensez-vous que l'encodage de comptage est une bonne idée ou 
# comment améliore-t-il le score du modèle?

In [None]:
# Les valeurs rares ont tendance à avoir des nombres similaires (avec des 
# valeurs telles que 1 ou 2), vous pouvez donc classer les valeurs rares 
# ensemble au moment de la prédiction. 
# Il est peu probable que les valeurs courantes avec un grand nombre aient 
# le même nombre exact que les autres valeurs. 
# Ainsi, les valeurs communes / importantes obtiennent leur propre 
# groupement.

# 6.2.2.3. TARGET ENCODING

In [20]:
# Feature matrices and target used to learn and apply the encoding
X_train = train[cat_features]
y_train = train['is_attributed']
X_valid = valid[cat_features]

# Create the target encoder. You can find this easily by using tab completion.
# Start typing ce. then press Tab to bring up a list of classes and functions.
# The encoding is learned from the training set only, with the
# 'is_attributed' column as the target.
target_enc = ce.TargetEncoder(cols=cat_features).fit(X_train, y_train)

# Encode the training and validation features with the fitted encoder
train_transf_target_enc, valid_transf_target_enc = (
    target_enc.transform(features) for features in (X_train, X_valid)
)

# Add the encoded values as new columns; every new column name gets the
# `_target` suffix.
train_encoded, valid_encoded = (
    split.join(encoded.add_suffix('_target'))
    for split, encoded in zip((train, valid),
                              (train_transf_target_enc, valid_transf_target_enc))
)

  elif pd.api.types.is_categorical(cols):


In [21]:
# Train the model on the target-encoded datasets to see how target encoding
# affects the results.
_ = train_model(train_encoded, valid_encoded)
# Baseline             : Validation AUC score: 0.9622743228943659
# With count encoding  : Validation AUC score: 0.9653051135205329
# With target encoding : Validation AUC score: 0.9540530347873288
# Target encoding (with 'ip' included) scores below both previous runs.

Validation AUC score: 0.9540530347873288


In [22]:
# Essayez de supprimer le codage de la colonne I.P
# Si vous laissez ip hors des fonctionnalités encodées et réentraînez le 
# modèle avec l'encodage cible, vous devriez constater que le score 
# augmente et est supérieur au score de base! 
# Pourquoi pensez-vous que le score est inférieur à la ligne de base 
# lorsque nous encodons l'adresse IP, mais supérieur à la ligne de base 
# lorsque nous ne le faisons pas?

In [24]:
# Le codage cible tente de mesurer la moyenne de population de la cible 
# pour chaque niveau dans une fonction catégorielle. 
# Cela signifie que lorsqu'il y a moins de données par niveau, la moyenne 
# estimée sera plus éloignée de la moyenne «vraie», il y aura plus de 
# variance. 
# Il y a peu de données par adresse IP, il est donc probable que les 
# estimations soient beaucoup plus bruitées que pour les autres 
# fonctionnalités. 
# Le modèle s'appuiera fortement sur cette fonctionnalité car elle est 
# extrêmement prédictive. 
# Cela l'amène à faire moins de fractionnements sur d'autres 
# fonctionnalités, et ces fonctionnalités sont adaptées uniquement aux 
# erreurs restantes concernant la comptabilisation de l'adresse IP. 
# Ainsi, le modèle fonctionnera très mal lorsqu'il verra de nouvelles 
# adresses IP qui ne figuraient pas dans les données d'entraînement 
# (ce qui est probablement la plupart des nouvelles données). 
# À l'avenir, nous laisserons de côté la fonction IP lors de l'essai de 
# différents encodages.

# 6.2.2.4. CATBOOST ENCODING

In [25]:
# Leave 'ip' out of the encoded features this time
cat_features = ['app', 'device', 'os', 'channel']
train, valid, test = get_data_splits(clicks)

# Feature matrices and target used to learn and apply the encoding
X_train, X_valid = train[cat_features], valid[cat_features]
y_train = train['is_attributed']

# Create the CatBoost encoder and learn the encoding from the training set
cb_enc = ce.CatBoostEncoder(cols=cat_features, random_state=7).fit(X_train,
                                                                   y_train)

# Encode the training and validation features with the fitted encoder
train_transf_c_enc, valid_transf_c_enc = (
    cb_enc.transform(features) for features in (X_train, X_valid)
)

# Add the encoded values as new columns; every new column name gets the
# `_cb` suffix.
train_encoded, valid_encoded = (
    split.join(encoded.add_suffix('_cb'))
    for split, encoded in zip((train, valid),
                              (train_transf_c_enc, valid_transf_c_enc))
)


  elif pd.api.types.is_categorical(cols):


In [26]:
# Train the model on the CatBoost-encoded datasets ('ip' is not encoded here)
_ = train_model(train_encoded, valid_encoded)
# Baseline               : Validation AUC score: 0.9622743228943659
# With count encoding    : Validation AUC score: 0.9653051135205329
# With target encoding   : Validation AUC score: 0.9540530347873288
# With CatBoost encoding : Validation AUC score: 0.962868024575231
# Better than the baseline and the target-encoding run, but still below the
# count-encoding score, so this is not the best score overall.

Validation AUC score: 0.962868024575231
