# 6. FEATURE ENGINEERING

# 6.4. FEATURE SELECTION

In [2]:
# BUT : réduire le nombre de variables pour rendre le modèle plus 
# performant : avec la technique des FEATURES SELECTION

In [3]:
# Chargement des librairies
import numpy as np
import pandas as pd

# 6.4.1. COURS

# 6.4.1.1. UNIVARIATE FEATURE SELECTION

In [None]:
# The simplest and fastest methods are based on univariate statistical 
# tests. For each feature, measure how strongly the target depends on the 
# feature using a statistical test like  χ2  or ANOVA.
# 
# From the scikit-learn feature selection module, 
# feature_selection.SelectKBest returns the K best features given some 
# scoring function. 
# For our classification problem, the module provides three different 
# scoring functions:  χ2 , ANOVA F-value, and the mutual information score. The F-value measures the linear dependency between the feature variable and the target. This means the score might underestimate the relation between a feature and the target if the relationship is nonlinear. The mutual information score is nonparametric and so can capture nonlinear relationships.
# 
# With SelectKBest, we define the number of features to keep, based on the
# score from the scoring function. Using .fit_transform(features, target) 
# we get back an array with only the selected features.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Candidate features: every column of baseline_data except the 'outcome' target.
feature_cols = baseline_data.columns.drop('outcome')

# Keep 5 features, scored with f_classif (the ANOVA F-value described above).
selector = SelectKBest(f_classif, k=5)

# NOTE: the selector is fitted on the whole dataset here. The following cell of
# this notebook explains that this leaks validation/test information.
X_new = selector.fit_transform(baseline_data[feature_cols], baseline_data['outcome'])
# Bare expression so that the notebook displays the resulting array.
X_new
# array([[2015.,    5.,    9.,   18., 1409.],
#        [2017.,   13.,   22.,   31.,  957.],
#        [2013.,   13.,   22.,   31.,  739.],
#        ...,
#        [2010.,   13.,   22.,   31.,  238.],
#        [2016.,   13.,   22.,   31., 1100.],
#        [2011.,   13.,   22.,   31.,  542.]])

In [None]:
# However, I've done something wrong here. 
# The statistical tests are calculated using all of the data. 
# This means information from the validation and test sets could influence
# the features we keep, introducing a source of leakage. 
# This means we should select features using only a training set.

In [None]:
# Candidate features: every column of baseline_data except the 'outcome' target.
feature_cols = baseline_data.columns.drop('outcome')
# Only the first two splits are used in this cell; the third one is discarded.
train, valid, _ = get_data_splits(baseline_data)

# Keep 5 features
selector = SelectKBest(f_classif, k=5)

# Fit the selector on the training split only, so that rows from the other
# splits cannot influence which features are kept.
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
# Bare expression so that the notebook displays the resulting array.
X_new
# array([[2.015e+03, 5.000e+00, 9.000e+00, 1.800e+01, 1.409e+03],
#        [2.017e+03, 1.300e+01, 2.200e+01, 3.100e+01, 9.570e+02],
#        [2.013e+03, 1.300e+01, 2.200e+01, 3.100e+01, 7.390e+02],
#        ...,
#        [2.011e+03, 1.300e+01, 2.200e+01, 3.100e+01, 5.150e+02],
#        [2.015e+03, 1.000e+00, 3.000e+00, 2.000e+00, 1.306e+03],
#        [2.013e+03, 1.300e+01, 2.200e+01, 3.100e+01, 1.084e+03]])

In [None]:
# You should notice that the selected features are different than when I 
# used the entire dataset. 
# Now we have our selected features, but it's only the feature values for 
# the training set. 
# To drop the rejected features from the validation and test sets, we need 
# to figure out which columns in the dataset were kept with SelectKBest. 
# To do this, we can use .inverse_transform to get back an array with the 
# shape of the original data.

In [None]:
# Get back the features we've kept, zero out all other features.
# inverse_transform restores the original column layout, so the result can be
# labelled with the training index and the original feature names.
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
# Display the first rows to inspect which columns were zeroed out.
selected_features.head()
# 	goal	hour	day	month	year	category	currency	country	category_currency	category_country	currency_country	count_7_days	time_since_last_project
# 0	0.0	0.0	0.0	0.0	2015.0	0.0	5.0	9.0	0.0	0.0	18.0	1409.0	0.0
# 1	0.0	0.0	0.0	0.0	2017.0	0.0	13.0	22.0	0.0	0.0	31.0	957.0	0.0
# 2	0.0	0.0	0.0	0.0	2013.0	0.0	13.0	22.0	0.0	0.0	31.0	739.0	0.0
# 3	0.0	0.0	0.0	0.0	2012.0	0.0	13.0	22.0	0.0	0.0	31.0	907.0	0.0
# 4	0.0	0.0	0.0	0.0	2015.0	0.0	13.0	22.0	0.0	0.0	31.0	1429.0	0.0

In [None]:
# This returns a DataFrame with the same index and columns as the training 
# set, but all the dropped columns are filled with zeros. 
# We can find the selected columns by choosing features where the variance
# is non-zero.

In [None]:
# Dropped columns have values of all 0s, so var is 0, drop them
# NOTE(review): a kept column that happens to be constant also has zero
# variance and would be treated as dropped; selector.get_support() looks like
# a more direct way to obtain the mask -- confirm before relying on this.
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
valid[selected_columns].head()
# 	year	currency	country	currency_country	count_7_days
# 302896	2015	13	22	31	1534.0
# 302897	2013	13	22	31	625.0
# 302898	2014	5	9	18	851.0
# 302899	2014	13	22	31	1973.0
# 302900	2014	5	9	18	2163.0

# 6.4.1.2. L1 REGULARIZATION

In [None]:
# Univariate methods consider only one feature at a time when making a 
# selection decision. 
# Instead, we can make our selection using all of the features by including
# them in a linear model with L1 regularization. 
# This type of regularization (sometimes called Lasso) penalizes the 
# absolute magnitude of the coefficients, as compared to L2 (Ridge) 
# regression which penalizes the square of the coefficients.

In [None]:
# As the strength of regularization is increased, features which are less 
# important for predicting the target are set to 0. 
# This allows us to perform feature selection by adjusting the 
# regularization parameter. We choose the parameter by finding the best 
# performance on a hold-out set, or decide ahead of time how many features
# to keep.

In [None]:
# For regression problems you can use sklearn.linear_model.Lasso, or 
# sklearn.linear_model.LogisticRegression for classification. 
# These can be used along with sklearn.feature_selection.SelectFromModel 
# to select the non-zero coefficients. 
# Otherwise, the code is similar to the univariate tests.

In [None]:
from sklearn.feature_selection import SelectFromModel
# LogisticRegression is used below but was not imported anywhere before this
# cell, so running the notebook from top to bottom raised a NameError here.
from sklearn.linear_model import LogisticRegression

# Only the first two splits are used in this cell; the third one is discarded.
train, valid, _ = get_data_splits(baseline_data)

# Feature matrix (every column except the target) and target, training split only.
X, y = train[train.columns.drop("outcome")], train['outcome']

# Set the regularization parameter C=1
logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear', random_state=7).fit(X, y)
# prefit=True: reuse the model fitted above instead of fitting it again.
model = SelectFromModel(logistic, prefit=True)

# Reduce X to the features selected from the L1-regularized model.
X_new = model.transform(X)
# Bare expression so that the notebook displays the resulting array.
X_new
# array([[1.000e+03, 1.200e+01, 1.100e+01, ..., 1.900e+03, 1.800e+01,
#         1.409e+03],
#        [3.000e+04, 4.000e+00, 2.000e+00, ..., 1.630e+03, 3.100e+01,
#         9.570e+02],
#        [4.500e+04, 0.000e+00, 1.200e+01, ..., 1.630e+03, 3.100e+01,
#         7.390e+02],
#        ...,
#        [2.500e+03, 0.000e+00, 3.000e+00, ..., 1.830e+03, 3.100e+01,
#         5.150e+02],
#        [2.600e+03, 2.100e+01, 2.300e+01, ..., 1.036e+03, 2.000e+00,
#         1.306e+03],
#        [2.000e+04, 1.600e+01, 4.000e+00, ..., 9.200e+02, 3.100e+01,
#         1.084e+03]])

In [None]:
# Similar to the univariate tests, we get back an array with the selected 
# features. 
# Again, we will want to convert these to a DataFrame so we can get the 
# selected columns.

In [None]:
# Get back the kept features as a DataFrame with dropped columns as all 0s.
# The frame reuses X's index and column names so columns can be looked up by name.
selected_features = pd.DataFrame(model.inverse_transform(X_new), 
                                 index=X.index,
                                 columns=X.columns)

# Dropped columns have values of all 0s, keep other columns 
# NOTE(review): a kept column that is constant in X also has zero variance and
# would be excluded here; model.get_support() looks like a more direct way to
# obtain the mask -- confirm before relying on this.
selected_columns = selected_features.columns[selected_features.var() != 0]

In [None]:
# In this case with the L1 parameter C=1, we're dropping the 
# time_since_last_project column.

In [None]:
# In general, feature selection with L1 regularization is more powerful 
# than the univariate tests, but it can also be very slow when you have a lot 
# of data and a lot of features. 
# Univariate tests will be much faster on large datasets, but also will 
# likely perform worse.

# 6.4.2. EXERCICES

In [5]:
# Chargement des librairies
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb

import os

In [6]:
# Load the dataset: the baseline table first, then every extra feature file
# joined onto it (DataFrame.join aligns on the index by default).
clicks_path = 'C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_6-FEATURE_ENGINEERING/feature-engineering-data/input/'
data_root = clicks_path
data_files = [
    'count_encodings.pqt',
    'catboost_encodings.pqt',
    'interactions.pqt',
    'past_6hr_events.pqt',
    'downloads.pqt',
    'time_deltas.pqt',
    'svd_encodings.pqt',
]

# clicks_path already ends with a separator, so joining yields the same path
# as plain string concatenation.
clicks = pd.read_parquet(os.path.join(data_root, 'baseline_data.pqt'))
for file in data_files:
    features = pd.read_parquet(os.path.join(data_root, file))
    clicks = clicks.join(features)


In [7]:
# Fonction de splits du TRAIN SET, VALID SET ET TEST SET
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split ``dataframe`` chronologically into train, validation and test sets.

    The rows are sorted by ``'click_time'``. The last ``valid_fraction`` of the
    rows becomes the test set, the ``valid_fraction`` just before it becomes
    the validation set, and everything earlier is the training set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split; must contain a ``'click_time'`` column.
    valid_fraction : float, default 0.1
        Fraction of the rows assigned to the validation set and, separately,
        to the test set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``.
    """
    dataframe = dataframe.sort_values('click_time')
    n_rows = len(dataframe)
    valid_rows = int(n_rows * valid_fraction)

    # Use explicit, non-negative positions instead of negative slices. With
    # ``valid_rows == 0`` the slice ``[:-0]`` is ``[:0]``, which returned an
    # empty training set and put every row in the test set.
    valid_start = max(n_rows - 2 * valid_rows, 0)
    test_start = max(n_rows - valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]

    return train, valid, test

In [8]:
# Fonction d'entrainement et de scoring du model
def train_model(train, valid, test=None, feature_cols=None):
    """Train a LightGBM binary classifier and report its AUC scores.

    Parameters
    ----------
    train, valid : DataFrame
        Training and validation data; both must contain 'is_attributed'.
    test : DataFrame, optional
        When given, the model is also scored on this set.
    feature_cols : sequence of column labels, optional
        Columns used as features. Defaults to every column of ``train`` except
        'click_time', 'attributed_time' and 'is_attributed'.

    Returns
    -------
    tuple
        ``(booster, valid_score)`` when ``test`` is None, otherwise
        ``(booster, valid_score, test_score)``.
    """
    target = 'is_attributed'
    if feature_cols is None:
        feature_cols = train.columns.drop(['click_time', 'attributed_time',
                                           'is_attributed'])

    train_set = lgb.Dataset(train[feature_cols], label=train[target])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid[target])

    params = {
        'num_leaves': 64,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 7,
    }
    max_rounds = 1000

    print("Training model!")
    # Training is monitored on the validation set with early stopping
    # (20 rounds) and evaluation logging switched off.
    booster = lgb.train(params, train_set, max_rounds,
                        valid_sets=[valid_set],
                        early_stopping_rounds=20,
                        verbose_eval=False)

    valid_score = metrics.roc_auc_score(valid[target],
                                        booster.predict(valid[feature_cols]))
    print(f"Validation AUC score: {valid_score}")

    # Without a test set there is nothing more to score.
    if test is None:
        return booster, valid_score

    test_score = metrics.roc_auc_score(test[target],
                                       booster.predict(test[feature_cols]))
    return booster, valid_score, test_score

In [9]:
# BASELINE SCORE
# Split the full clicks table, then train on every available feature.
train, valid, test = get_data_splits(clicks)
# Called without a test set, train_model returns (booster, validation score);
# the booster is discarded here and only the score is kept.
_, baseline_score = train_model(train, valid)
# new_score_1 : Validation AUC score: 0.9658334271834417

Training model!
Validation AUC score: 0.9658334271834417


## 6.4.2.1. WHICH DATA TO USE FOR FEATURE SELECTION?

In [10]:
# Since many feature selection methods require calculating statistics from 
# the dataset, should you use all the data for feature selection?

In [11]:
# Including validation and test data within the feature selection is a 
# source of leakage. 
# You'll want to perform feature selection on the train set only, then use 
# the results there to remove features from the validation and test sets.

## 6.4.2.2. UNIVARIATE FEATURE SELECTION

In [13]:
from sklearn.feature_selection import SelectKBest, f_classif
# Candidate features: everything except the timestamps and the target.
feature_cols = clicks.columns.drop(['click_time', 'attributed_time',
                                    'is_attributed'])
train, valid, test = get_data_splits(clicks)

# The selector is fitted on the training split only, to avoid leakage.
X_train = train[feature_cols]
y_train = train['is_attributed']
X_valid = valid[feature_cols]

# Create the selector, keeping 40 features
selector = SelectKBest(f_classif, k=40)

# Use the selector to retrieve the best features fit_transform
X_new = selector.fit_transform(X_train, y_train)

# Find the columns that were dropped.
# get_support() returns the boolean mask of the kept features, aligned with
# feature_cols. This replaces rebuilding a full-size DataFrame with
# inverse_transform and testing for zero variance, which costs memory and
# would also flag a kept-but-constant column as dropped.
dropped_columns = feature_cols[~selector.get_support()]

In [14]:
# Training and scoring: both splits lose the same rejected columns, so the
# model is trained and validated on identical feature sets.
_ = train_model(train.drop(dropped_columns, axis=1), 
                valid.drop(dropped_columns, axis=1))
# old_score_1 : Validation AUC score: 0.9658334271834417
# new_score_2 : Validation AUC score: 0.9625481759576047

Training model!
Validation AUC score: 0.9625481759576047


# 6.4.2.3. THE BEST VALUE OF K

In [15]:
#  With this method we can choose the best K features, but we still have 
# to choose K ourselves. How would you find the "best" value of K? 
# That is, you want it to be small so you're keeping the best features, 
# but not so small that it's degrading the model's performance.

In [None]:
# To find the best value of K, you can fit multiple models with increasing 
# values of K, then choose the smallest K with validation score above some 
# threshold or some other criteria. 
# A good way to do this is loop over values of K and record the validation 
# scores for each iteration.

# 6.4.2.4. L1 REGULARIZATION FOR FEATURE SELECTION

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

def select_features_l1(X, y):
    """Return the columns of ``X`` kept by an L1-regularized logistic regression.

    A ``LogisticRegression`` with ``penalty="l1"`` and ``C=0.1`` is fitted on
    ``(X, y)`` and wrapped in ``SelectFromModel`` to decide which features to
    keep.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used for the result.
    y : array-like
        Target values.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in their original order.
    """
    logistic = LogisticRegression(C=0.1, penalty="l1", random_state=7,
                                  solver='liblinear').fit(X, y)
    # prefit=True: reuse the model fitted above instead of fitting it again.
    model = SelectFromModel(logistic, prefit=True)

    # Ask the selector directly which features it keeps. The previous approach
    # (inverse_transform + "variance != 0") built a full-size DataFrame and
    # wrongly discarded any selected column that is constant in X.
    cols_to_keep = X.columns[model.get_support()]

    return cols_to_keep

In [17]:
# Training and scoring
# The L1 selection is fitted on the first n_samples training rows only.
n_samples = 10000
X, y = train[feature_cols][:n_samples], train['is_attributed'][:n_samples]
selected = select_features_l1(X, y)

# Everything that was not selected gets removed from both splits.
dropped_columns = feature_cols.drop(selected)
_ = train_model(train.drop(dropped_columns, axis=1), 
                valid.drop(dropped_columns, axis=1))
# old_score_1 : Validation AUC score: 0.9658334271834417
# old_score_2 : Validation AUC score: 0.9625481759576047
# new_score_3 : Validation AUC score: 0.9655039361169727

Training model!
Validation AUC score: 0.9655039361169727


# 6.4.2.5. FEATURE SELECTION WITH TREES

In [18]:
# Since we're using a tree-based model, using another tree-based model for 
# feature selection might produce better results. What would you do 
# differently to select the features using a tree-based classifier?

In [20]:
# You could use something like RandomForestClassifier or 
# ExtraTreesClassifier to find feature importances. 
# SelectFromModel can use the feature importances to find the best 
# features.

# 6.4.2.6. TOP K FEATURES WITH L1 REGULARIZATION

In [21]:
# Here you've set the regularization parameter C=0.1 which led to some 
# number of features being dropped. 
# However, by setting C you aren't able to choose a certain number of 
# features to keep. 
# What would you do to keep the top K important features using L1 
# regularization?

In [None]:
# To select a certain number of features with L1 regularization, you need 
# to find the regularization parameter that leaves the desired number of 
# features. 
# To do this you can iterate over models with different regularization 
# parameters from low to high and choose the one that leaves K features. 
# Note that for the scikit-learn models C is the inverse of the 
# regularization strength.