In [None]:
#hide
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import pandas as pd
import numpy as np

from typing import List, Dict
from sklearn.datasets import load_boston

from task_substitution.core import *
from task_substitution.data import *
from task_substitution.model import *
from task_substitution.external_data import *

In [None]:
# default_exp feature_selection

# Feature Selection

> Class that trains a `LightGBM` model on a dataset and selects the set of features that are actually relevant to the task.

In [None]:
#hide
from nbdev.showdoc import *

### Feature Selection Using Null Importance

In [None]:
#export
class FeatureSelection:
    """Select features by comparing real importances with shuffled-target importances.

    Two models are trained with the same `model_args`: one on the real target and
    one on a shuffled copy of it. For every feature the ratio
    ``shuffled-target gain / real-target gain`` is computed, and features whose
    ratio is less than or equal to `threshold` are kept.

    Attributes set by `select_features`:
        trained_model: model fitted on the real target.
        useless_model: model fitted on the shuffled target.
        ratio_df: per-feature importances and their ratio.
    """

    def __init__(self, model_args:Dict, threshold:float=3):
        """Store the keyword arguments forwarded to `Model` and the ratio cut-off."""
        self.model_args = model_args
        self.threshold  = threshold
    
    def train(self, X:pd.DataFrame, y:pd.Series):
        """Fit a new `Model(**model_args)` on (X, y).

        The fitted result is stored on `self.trained_model` (overwriting any
        previous one) and returned.
        """
        model = Model(**self.model_args)
        self.trained_model = model.fit(X, y)
        return self.trained_model
    
    def get_feature_importances(self, model, feature_names:List)->pd.DataFrame:
        """Return a frame with columns 'features' and 'imp(gain)'.

        Rows follow the order of `feature_names`. If `model` exposes a
        `feature_name()` method that reports exactly the same set of names, the
        importances are matched to `feature_names` by name, so a difference
        between the model's feature order and the order of `feature_names`
        cannot attach a value to the wrong feature. Otherwise the importances
        are paired with `feature_names` positionally.
        """
        feat_imp = model.feature_importance(importance_type='gain')
        feature_names = list(feature_names)

        name_getter = getattr(model, 'feature_name', None)
        if callable(name_getter):
            model_names = list(name_getter())
            same_names = (
                len(model_names) == len(feat_imp)
                and len(set(model_names)) == len(model_names)
                and len(feature_names) == len(model_names)
                and set(model_names) == set(feature_names)
            )
            if same_names:
                # Re-align by name: the importance array is in the model's own
                # feature order, which need not match `feature_names`.
                imp_by_name = dict(zip(model_names, feat_imp))
                feat_imp = [imp_by_name[name] for name in feature_names]

        feat_df  = pd.DataFrame({'features': feature_names,
                                 'imp(gain)': feat_imp
                                })
        return feat_df
    
    def get_feature_names(self, X:pd.DataFrame)->List:
        """Return the column names of `X`, sorted."""
        return sorted(X.columns)
    
    def add_ratio_gain(self, feat_imp:pd.DataFrame, feat_imp_useless:pd.DataFrame)->pd.DataFrame:
        """Return a copy of `feat_imp` with 'imp_gain_useless' and 'ratio' columns.

        `feat_imp_useless` is combined with `feat_imp` row by row (positionally),
        so both frames must list the features in the same order; their indexes
        do not need to match. ``ratio = imp_gain_useless / imp(gain)``.
        """
        tmp_df = feat_imp.copy()
        tmp_df = tmp_df.assign(imp_gain_useless=feat_imp_useless['imp(gain)'].values)
        # Divide the two columns of the same frame so the ratio uses the same
        # positional pairing as the column above, whatever the input indexes are.
        tmp_df = tmp_df.assign(ratio=tmp_df['imp_gain_useless'] / tmp_df['imp(gain)'])
        return tmp_df
    
    def filter_features(self, ratio_df:pd.DataFrame)->List:
        """Return the 'features' values whose 'ratio' is <= `self.threshold`.

        Rows with a NaN or infinite ratio (e.g. zero real importance) do not
        satisfy the comparison and are therefore left out.
        """
        keep = ratio_df['ratio'] <= self.threshold
        return list(ratio_df.loc[keep, 'features'])
    
    def select_features(self, X:pd.DataFrame, y:pd.Series)->List:
        """Train on the real and on a shuffled target and return the kept features.

        Side effects: sets `self.trained_model` (real target),
        `self.useless_model` (shuffled target) and `self.ratio_df`.
        """
        model = self.train(X, y)
        
        # `_shuffle` is defined outside this class; it receives a copy of `y`.
        shuffled_y = _shuffle(y.copy())
        useless_model = self.train(X, shuffled_y)

        # `train` stores its latest result on `self.trained_model`, which would
        # leave the shuffled-target model there; restore the real one.
        self.trained_model = model
        self.useless_model = useless_model
        
        feature_names = self.get_feature_names(X)
        feat_imp = self.get_feature_importances(model, feature_names)
        feat_imp_useless = self.get_feature_importances(useless_model, feature_names)
        
        self.ratio_df = self.add_ratio_gain(feat_imp, feat_imp_useless)
        selected_features = self.filter_features(self.ratio_df)
        
        return selected_features

### Usage

In [None]:
# Load the Boston housing data and split it into a feature frame and a target series.
# NOTE(review): `load_boston` is no longer available in recent scikit-learn releases;
# confirm the pinned version before re-running this cell.
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [None]:
# Keyword arguments that `FeatureSelection.train` forwards to `Model`.
model_args = dict(
    num_boost_round=300,
    objective='regression',
    learning_rate=0.1,
    num_leaves=31,
    nthread=-1,
    verbosity=-1,
    seed=41,
)

print(f'Feature list: {X.columns.tolist()}')

# Run the selection with the default threshold and show what was kept.
fs = FeatureSelection(model_args)
selected_features = fs.select_features(X, y)
print(f'Selected features: {selected_features}')

Feature list: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Selected features: ['AGE', 'B', 'CHAS', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO', 'RAD', 'RM', 'TAX', 'ZN']


In [None]:
# Per-feature importances on the real and shuffled targets, plus their ratio.
fs.ratio_df

Unnamed: 0,features,imp(gain),imp_gain_useless,ratio
0,AGE,660,763,1.156061
1,B,36,50,1.388889
2,CHAS,175,178,1.017143
3,CRIM,51,12,0.235294
4,DIS,369,267,0.723577
5,INDUS,841,913,1.085612
6,LSTAT,732,858,1.172131
7,NOX,854,605,0.708431
8,PTRATIO,105,70,0.666667
9,RAD,194,222,1.14433
