In [None]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
from collections import OrderedDict
from tqdm import tqdm

import category_encoders as ce
import catboost
from sklearn import metrics

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric
#from messaging import send_message

pd.set_option('display.max_columns', None)

In [None]:
# Show the installed CatBoost version in the cell output.
catboost.__version__

In [None]:
# metrics in catboost format
class AmexMetric:
    """Custom CatBoost eval metric that scores predictions with ``compute_amex_metric``.

    Provides the three hooks used by a user-defined CatBoost metric object:
    ``is_max_optimal``, ``evaluate`` and ``get_final_error``.
    """

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Score one set of predictions.

        Parameters
        ----------
        approxes : sequence
            Must contain exactly one sequence of scores.
        target : sequence
            Labels, same length as ``approxes[0]``.
        weight : any
            Ignored; the metric is computed unweighted.

        Returns
        -------
        tuple
            ``(metric_value, 1.)`` - the metric and a constant weight of one.

        Raises
        ------
        AssertionError
            If ``approxes`` does not hold exactly one score sequence or the
            lengths of scores and labels differ.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        # Convert to ndarrays exactly as RecallAt4 and NormGini do, so the
        # metric helper always receives the same input type instead of the
        # raw containers handed over by CatBoost.
        scores = np.array(approxes[0])
        target = np.array(target)
        return compute_amex_metric(target, scores), 1.

    def get_final_error(self, error, weight):
        """Return ``error`` unchanged; ``evaluate`` already yields the final score."""
        return error

class RecallAt4:
    """Custom CatBoost eval metric backed by ``compute_recall_at4``."""

    def get_final_error(self, error, weight):
        # evaluate() already produces the finished score, so hand it back as-is
        return error

    def is_max_optimal(self):
        # larger values are better
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(recall_at4, 1.)`` for a single score sequence.

        ``weight`` is accepted for interface compatibility but not used.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)
        value = compute_recall_at4(np.array(target), np.array(predictions))
        return value, 1.

class NormGini:
    """Custom CatBoost eval metric backed by ``compute_normalized_gini``."""

    def get_final_error(self, error, weight):
        # evaluate() already produces the finished score, so hand it back as-is
        return error

    def is_max_optimal(self):
        # larger values are better
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(normalized_gini, 1.)`` for a single score sequence.

        ``weight`` is accepted for interface compatibility but not used.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)
        value = compute_normalized_gini(np.array(target), np.array(predictions))
        return value, 1.

In [None]:
# CONFIG PARAMS
# Number of repeated cross-validation rounds; round i reads its fold
# assignment from ../data/processed/cv{i}.csv.
N_REPEATS = 3
# Version tag of the processed dataset; selects ../data/processed/dsv{DATASET_VERSION}/.
DATASET_VERSION = "05"

In [None]:
# Directory that receives the feature-importance CSV files written by this notebook.
OUT_PATH = Path("../data/feat-selection")

# exist_ok=True already makes this a no-op when the directory is present, so a
# separate exists() check is unnecessary (and would hide the case where the
# path exists but is not a directory).
OUT_PATH.mkdir(parents=True, exist_ok=True)

***
## load and prepare data

In [None]:
# Feature table for the dataset version selected by DATASET_VERSION.
train = pd.read_parquet(f"../data/processed/dsv{DATASET_VERSION}/train.parquet")
# Labels, indexed by customer_ID so they can be joined to `train` on the index.
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [None]:
# Print pandas' summary of the loaded feature table.
train.info()

In [None]:
# Columns fed to the model. The label column is excluded explicitly so that
# re-running this cell after `target` has been merged into `train` cannot
# leak the label into the feature list.
input_feats = [col for col in train.columns if col != "target"]
# Categorical feature list, currently disabled (the matching `cat_features`
# arguments in the training code are commented out as well).
#categ_feats = [
#    'B_30_first', 'B_38_first', 'D_114_first', 'D_116_first', 'D_117_first', 
#    'D_120_first', 'D_126_first', 'D_63_first', 'D_64_first', 'D_66_first', 'D_68_first',
#    'B_30_last', 'B_38_last', 'D_114_last', 'D_116_last', 'D_117_last', 
#    'D_120_last', 'D_126_last', 'D_63_last', 'D_64_last', 'D_66_last', 'D_68_last',
#]
len(input_feats)

In [None]:
# Attach the label to the feature rows by joining on the index. The labels are
# indexed by customer_ID; `train` is presumably indexed the same way (confirm).
# The inner join keeps only rows present in both frames.
train = pd.merge(
    left=train,
    right=train_labels,
    how="inner",
    left_index=True,
    right_index=True,
)

# the labels now live inside `train`; release the standalone frame
del train_labels
gc.collect()

***
## model training

In [None]:
# Keyword arguments unpacked into catboost.CatBoostClassifier for every fold.
# NOTE(review): no loss function is set here, so the classifier presumably
# falls back to its default objective - confirm this is intended.
# NOTE(review): the long decimal values look like the output of a
# hyper-parameter search - TODO record where they came from.
model_params = {
    # custom metric object defined in this notebook (higher is better)
    'eval_metric':AmexMetric(),
    'learning_rate': 0.05,
    'nan_mode':'Min',
    # the same seed is used for every fold and every repetition
    'random_seed': 2112,
    'auto_class_weights': None,
    'bootstrap_type': 'Bernoulli',
    'depth': 5,
    'rsm': 0.2,
    'iterations': 3900,
    'l2_leaf_reg': 8.017281499631434,
    'min_data_in_leaf': 1600,
    'random_strength': 7.69963242351621,
    'subsample': 0.8500000000000001,
    # early stopping
    'early_stopping_rounds':300,
    'use_best_model': True,
 }

In [None]:
def train_models(
        dataframe: pd.DataFrame,
        n_folds: int = 5,
        features: list = None,
        params: dict = None,
    ) -> tuple:
    """Train one CatBoost classifier per cross-validation fold.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain a ``fold`` column (values ``0 .. n_folds-1`` are used),
        a ``target`` column and every feature column.
    n_folds : int
        Number of folds to iterate over.
    features : list, optional
        Feature columns passed to the model. Defaults to the notebook-level
        ``input_feats`` so existing calls keep working.
    params : dict, optional
        Keyword arguments for ``catboost.CatBoostClassifier``. Defaults to the
        notebook-level ``model_params``.

    Returns
    -------
    tuple
        ``(models, oof_dfs)``: the fitted models in fold order, and for each
        fold its validation rows with an added ``pred`` column holding
        column 1 of the predicted probabilities.
    """
    # fall back to the notebook globals only when the caller gave nothing
    features = input_feats if features is None else features
    params = model_params if params is None else params

    models = list()
    oof_dfs = list()

    for fold in range(n_folds):

        print(f" training model {fold+1}/{n_folds} ".center(100, "#"))

        # The training split is only read, so the frame returned by query()
        # is used directly; copying it would duplicate most of the dataset.
        train_df = dataframe.query("fold != @fold")
        # The validation split gets a new column below, hence the explicit copy.
        valid_df = dataframe.query("fold == @fold").copy()

        train_dset = catboost.Pool(
            data=train_df.loc[:,features],
            label=train_df.loc[:,"target"].values,
            #cat_features=categ_feats,
        )
        valid_dset = catboost.Pool(
            data=valid_df.loc[:,features],
            label=valid_df.loc[:,"target"].values,
            #cat_features=categ_feats,
        )

        model = catboost.CatBoostClassifier(**params)
        model.fit(
            train_dset,
            eval_set=valid_dset,
            verbose=50,
        )

        # out-of-fold prediction: probability column with index 1
        valid_df.loc[:,"pred"] = model.predict(valid_dset, prediction_type="Probability")[:,1]

        models.append(model)
        oof_dfs.append(valid_df)
        # free the per-fold training data before the next fold is built
        del train_df,train_dset,valid_dset
        gc.collect()

    return models,oof_dfs

In [None]:
%%time 

# Models and out-of-fold frames from every repetition, in training order.
all_models = list()
all_oof_dfs = list()

# The fold files are numbered from 1 (cv1.csv, cv2.csv, ...), hence the 1-based range.
for repetition in range(1,N_REPEATS+1):

    # `repetition` is already 1-based, so it is shown as-is (adding 1 made the
    # banner read 2/3, 3/3, 4/3).
    print(f" repeated cross-validation step: {repetition}/{N_REPEATS} ".center(100, "#"))

    # fold assignment for this repetition, keyed by customer_ID
    folds = pd.read_csv(f'../data/processed/cv{repetition}.csv', index_col="customer_ID")
    # join on the index; reset_index(drop=True) then discards that index, so
    # the resulting rows carry a plain 0..n-1 index
    _train = pd.merge(train, folds, how="inner", left_index=True, right_index=True).reset_index(drop=True)

    tic = time.time()
    models,oof_dfs = train_models(_train)
    tac = time.time()
    print(f"Training time: {(tac-tic)/60} min.")

    all_models.extend(models)
    all_oof_dfs.extend(oof_dfs)

    # the merged frame is no longer needed once the fold frames are stored
    del _train, folds; gc.collect()

***
## compute LFC (loss function change) importance

In [None]:
# One importance vector per trained model, evaluated on that model's own
# out-of-fold rows.
outputs = []

for model, oof_df in zip(all_models, all_oof_dfs):
    # labelled pool built from the held-out rows of this model's fold
    oof_features = oof_df.loc[:, input_feats]
    oof_labels = oof_df.loc[:, "target"].values
    oof_dset = catboost.Pool(data=oof_features, label=oof_labels)

    out = model.get_feature_importance(data=oof_dset, type="LossFunctionChange", verbose=True)
    outputs.append(out)

In [None]:
# One row per trained model (all repetitions and folds), one column per input feature.
result_lfc = pd.DataFrame(outputs, columns=input_feats)
# Persist the table, without the row index, next to the other importance results.
result_lfc.to_csv(OUT_PATH/"catb-lfc.csv", index=False)
result_lfc

***
## compute SHAP importance (mean absolute SHAP value per feature)

In [None]:
# One mean-|SHAP| vector per trained model, evaluated on that model's own
# out-of-fold rows.
outputs = []

for model, oof_df in zip(all_models, all_oof_dfs):
    # labelled pool built from the held-out rows of this model's fold
    oof_features = oof_df.loc[:, input_feats]
    oof_labels = oof_df.loc[:, "target"].values
    oof_dset = catboost.Pool(data=oof_features, label=oof_labels)

    out = model.get_feature_importance(data=oof_dset, type="ShapValues", verbose=True)

    # The last column is dropped before aggregating (CatBoost documents it as
    # the expected value rather than a feature contribution); the remaining
    # per-row values are reduced to one mean absolute value per feature.
    per_feature = np.abs(out[:, :-1])
    outputs.append(np.mean(per_feature, axis=0))

In [None]:
# One row per trained model (all repetitions and folds), one column per input feature.
results_shap = pd.DataFrame(outputs, columns=input_feats)
# Persist the table, without the row index, next to the other importance results.
results_shap.to_csv(OUT_PATH/"catb-shap.csv", index=False)
results_shap

***
## compute PFI (permutation feature importance)

In [None]:
def compute_pfi(
        model:catboost.core.CatBoostClassifier,
        dataframe:pd.DataFrame, 
        features:list, 
        target:str,
        random_state=None,
    ):
    """Permutation feature importance of ``model`` measured with the Amex metric.

    Each feature column is shuffled in turn, the model is re-scored, and the
    drop relative to the unshuffled score is recorded.

    Parameters
    ----------
    model : catboost.core.CatBoostClassifier
        Fitted model; called through ``predict(..., prediction_type="Probability")``.
    dataframe : pd.DataFrame
        Rows to evaluate on. The caller's frame is not modified (a copy is used).
    features : list
        Feature columns, in the order the model expects.
    target : str
        Name of the label column in ``dataframe``.
    random_state : int, optional
        Seed for the shuffles. ``None`` (default) keeps the previous behaviour
        of drawing from NumPy's global random state; an integer makes the
        result reproducible without touching the global state.

    Returns
    -------
    OrderedDict
        ``"base_score"`` followed by one entry per feature holding
        ``base_score - permuted_score`` (a larger value means shuffling the
        feature hurt the score more).
    """
    dataframe = dataframe.copy()
    # np.random exposes permutation() just like a Generator does
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # kept under its own name so the `target` argument is not rebound from a
    # column name to an array
    y_true = dataframe[target].values

    # calculates base score
    preds = model.predict(dataframe[features], prediction_type="Probability")[:,1]
    base_score = compute_amex_metric(y_true, preds)

    output = OrderedDict()
    output["base_score"] = base_score

    n_rows = len(dataframe)

    # calculates pfi for each feature
    for col in tqdm(features):
        raw_values = dataframe[col].copy()
        # Shuffle by position instead of by index label: label-based lookup
        # returns extra rows (and then fails on assignment) when the index
        # contains duplicates.
        order = rng.permutation(n_rows)
        dataframe[col] = raw_values.values[order]
        preds = model.predict(dataframe[features], prediction_type="Probability")[:,1]
        score = compute_amex_metric(y_true, preds)
        output[col] = base_score - score
        # rollback the permutation
        dataframe[col] = raw_values
        del raw_values

    return output

In [None]:
# One PFI result (an ordered mapping) per trained model, evaluated on that
# model's own out-of-fold rows.
outputs = []

for model, oof_df in zip(all_models, all_oof_dfs):
    out = compute_pfi(model=model, dataframe=oof_df, features=input_feats, target="target")
    outputs.append(out)

In [None]:
# One row per trained model; the columns come from the keys of each PFI
# mapping ("base_score" plus one column per feature).
result_pfi = pd.DataFrame(outputs)
# Persist the table, without the row index, next to the other importance results.
result_pfi.to_csv(OUT_PATH/"catb-pfi.csv", index=False)
result_pfi

***