In [1]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
from typing import Tuple

import category_encoders as ce
import xgboost as xgb
from sklearn import metrics

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric
#from messaging import send_message

pd.set_option('display.max_columns', None)

In [2]:
# metrics in xgboost format

def metric_recall_at4(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Adapter exposing ``compute_recall_at4`` as an xgboost custom metric.

    Reads the labels stored in ``dtrain`` and scores ``predt`` against them.
    Returns the ``(metric_name, value)`` pair.
    """
    labels = dtrain.get_label()
    score = compute_recall_at4(labels, predt)
    return 'recall_at4', score

def metric_normalized_gini(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Adapter exposing ``compute_normalized_gini`` as an xgboost custom metric.

    Reads the labels stored in ``dtrain`` and scores ``predt`` against them.
    Returns the ``(metric_name, value)`` pair.
    """
    labels = dtrain.get_label()
    score = compute_normalized_gini(labels, predt)
    return 'norm_gini', score

def metric_amex(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Adapter exposing ``compute_amex_metric`` as an xgboost custom metric.

    Reads the labels stored in ``dtrain`` and scores ``predt`` against them.
    Returns the ``(metric_name, value)`` pair.
    """
    labels = dtrain.get_label()
    score = compute_amex_metric(labels, predt)
    return 'amex_metric', score

In [3]:
# CONFIG PARAMS
# number of cross-validation repetitions; repetition i reads ../data/processed/cv{i}.csv
N_REPEATS = 1
# boosting rounds per model (num_boost_round); also the upper bound of iteration_range at predict time
MAX_ITERATIONS = 5000

In [4]:
# output locations: out-of-fold predictions, submissions and model artifacts
OOF_PATH = Path("../data/oof/xgboost-dart-dsv02")
SUB_PATH = Path("../data/subs/xgboost-dart-dsv02")
ART_PATH = Path("../artifacts/xgboost-dart-dsv02")

# exist_ok=True already makes mkdir a no-op for existing directories, so no
# separate exists() check is needed; a path that exists as a regular file now
# fails here instead of later when the first output is written.
for _out_dir in (OOF_PATH, SUB_PATH, ART_PATH):
    _out_dir.mkdir(parents=True, exist_ok=True)

***
## load and prepare data

In [5]:
# show which files the processed dsv02 dataset provides
!ls ../data/processed/dsv02

test.parquet  train.parquet


In [6]:
# processed training features (dataset version dsv02)
train = pd.read_parquet("../data/processed/dsv02/train.parquet")
# raw labels, indexed by customer_ID so they can be joined to the features on the index
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [7]:
# model inputs: every column of the processed training frame
# (captured here, before the target column is merged in)
input_feats = train.columns.tolist()

# categorical features: each base column comes in a "_first" and a "_last" variant
_categ_bases = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117',
    'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
categ_feats = [
    f"{base}_{suffix}"
    for suffix in ("first", "last")
    for base in _categ_bases
]
len(input_feats)

1562

In [8]:
# attach the labels to the features; the inner join on the index keeps only
# rows present in both frames
train = train.merge(train_labels, how="inner", left_index=True, right_index=True)
del train_labels
gc.collect()

517

***
## model training

Train with repeated cross-validation: each repetition reads its own fold assignment from `../data/processed/cv{repetition}.csv`.

In [9]:
# XGBoost parameters shared by every fold model (passed to xgb.train as `params`)
model_params = {
    'objective':'binary:logistic',
    # fixed seed so repeated runs use the same random state
    'seed':2112,
    'booster': 'dart',
    # the built-in metric is switched off; training is monitored with the
    # custom metric handed to xgb.train instead
    'disable_default_eval_metric':True,
    'eta': 0.05,
    'max_depth': 5,
    'min_child_weight': 500,
    'subsample':0.95,
    'colsample_bytree':0.2,
    'alpha':10,
    'lambda':1,
    'tree_method':'hist',
    'grow_policy':'depthwise',
    'max_bin':256,
    # dart parameters
    'sample_type':'uniform',
    'normalize_type':'tree',
    'rate_drop':0.05,
    'skip_drop':0.5,
}

In [10]:
def train_models(dataframe: pd.DataFrame, n_folds: int = 5,) -> tuple:
    """Train one XGBoost model per validation fold and collect out-of-fold predictions.

    For every ``fold`` in ``range(n_folds)`` the rows with ``fold != fold`` are
    used for training and the rows with ``fold == fold`` for validation. The
    categorical columns are encoded with a ``GLMMEncoder`` fitted on the
    training part only, and the validation predictions are written into the
    out-of-fold frame.

    Relies on the notebook-level names ``categ_feats``, ``input_feats``,
    ``model_params``, ``MAX_ITERATIONS`` and ``metric_amex``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain the columns ``target`` and ``fold`` plus every column
        listed in ``input_feats``.
    n_folds : int
        Number of folds to iterate over (fold ids ``0 .. n_folds-1``).

    Returns
    -------
    tuple
        ``(models, encoders, oof)``: the trained boosters, the fitted encoders
        (same order as the models) and a frame with the columns ``target`` and
        ``pred``. Rows whose fold was never used for validation keep the
        sentinel value ``-1`` in ``pred``.

    Raises
    ------
    ValueError
        If the training or validation split of a fold is empty.
    """
    models = list()
    encoders = list()

    # dataframe to store the oof predictions
    oof = dataframe[["target"]].copy()
    # float sentinel: an integer column would have to be upcast when the float
    # predictions are written into it, which recent pandas versions warn about
    oof["pred"] = -1.0

    for fold in range(n_folds):

        print(f" training model {fold+1}/{n_folds} ".center(100, "#"))

        train_df = dataframe.query("fold != @fold").copy()
        valid_df = dataframe.query("fold == @fold").copy()
        if train_df.empty or valid_df.empty:
            raise ValueError(
                f"fold {fold} yields an empty train or validation split; "
                "check the 'fold' column and n_folds"
            )

        # fit the encoder on the training part only so that the validation
        # targets never leak into the encoded features
        encoder = ce.glmm.GLMMEncoder()
        encoder.fit(train_df[categ_feats], train_df["target"].values)
        train_df[categ_feats] = encoder.transform(train_df[categ_feats])
        valid_df[categ_feats] = encoder.transform(valid_df[categ_feats])

        train_dset = xgb.DMatrix(
            data=train_df.loc[:,input_feats],
            label=train_df.loc[:,"target"].values,
        )
        valid_dset = xgb.DMatrix(
            data=valid_df.loc[:,input_feats],
            label=valid_df.loc[:,"target"].values,
        )

        model = xgb.train(
            params = model_params,
            dtrain=train_dset,
            num_boost_round=MAX_ITERATIONS,
            #early_stopping_rounds=300,
            evals=[(valid_dset,"eval"), ],
            custom_metric=metric_amex,
            maximize=True,
            verbose_eval=20,
        )

        # predict with all boosted rounds (no early stopping is used above)
        oof.loc[valid_df.index,"pred"] = model.predict(valid_dset, iteration_range=(0,MAX_ITERATIONS))

        models.append(model)
        encoders.append(encoder)
        del train_df,valid_df,train_dset,valid_dset
        gc.collect()

    return models,encoders,oof

In [11]:
# list the available fold-assignment files (one is read per cross-validation repetition)
sorted(glob("../data/processed/cv*.csv"))

['../data/processed/cv0.csv',
 '../data/processed/cv1.csv',
 '../data/processed/cv2.csv',
 '../data/processed/cv3.csv',
 '../data/processed/cv4.csv',
 '../data/processed/cv5.csv',
 '../data/processed/cv6.csv',
 '../data/processed/cv7.csv',
 '../data/processed/cv8.csv',
 '../data/processed/cv9.csv']

In [12]:
%%time 

# per-repetition results: lists of fold models, fold encoders and oof frames
all_models = list()
all_encoders = list()
all_oof_dfs = list()

for repetition in range(N_REPEATS):
    print(f" repeated cross-validation step: {repetition+1}/{N_REPEATS} ".center(100, "#"))

    # fold assignment for this repetition, joined to the features on customer_ID;
    # the index is reset so train_models can address rows by position
    folds = pd.read_csv(f'../data/processed/cv{repetition}.csv', index_col="customer_ID")
    _train = pd.merge(train, folds, how="inner", left_index=True, right_index=True).reset_index(drop=True)

    tic = time.time()
    models,encoders,oof = train_models(_train, n_folds=1)
    tac = time.time()
    print(f"Training time: {(tac-tic)/60} min.")

    # oof metrics
    # train_models leaves pred == -1 for rows whose fold was never used for
    # validation (the case when n_folds is smaller than the number of folds in
    # the cv file); those rows carry no prediction and must not be scored
    scored = oof.loc[oof["pred"] != -1]
    print("OOF recall_at4:", compute_recall_at4(scored.target.values, scored.pred.values))
    print("OOF normalized_gini:", compute_normalized_gini(scored.target.values, scored.pred.values))
    print("OOF competition metric:", compute_amex_metric(scored.target.values, scored.pred.values))

    all_models.append(models)
    all_encoders.append(encoders)
    all_oof_dfs.append(oof)

    # save oof predictions
    oof.to_csv(OOF_PATH/f"oof-cv{repetition}.csv", index=False)
    # models are not persisted yet; to do so, save each booster under ART_PATH, e.g.:
    #for fold,_model in enumerate(models):
    #    _model.save_model(ART_PATH/f"model-cv{repetition}-fold{fold}.json")

    del _train, folds; gc.collect()

############################### repeated cross-validation step: 1/1 ################################


######################################## training model 1/1 ########################################


[0]	eval-amex_metric:0.67798


[20]	eval-amex_metric:0.74552


[40]	eval-amex_metric:0.75533


[60]	eval-amex_metric:0.76387


[80]	eval-amex_metric:0.76650


[100]	eval-amex_metric:0.76929


[120]	eval-amex_metric:0.77217


[140]	eval-amex_metric:0.77435


[160]	eval-amex_metric:0.77692


[180]	eval-amex_metric:0.77870


[200]	eval-amex_metric:0.78033


[220]	eval-amex_metric:0.78138


[240]	eval-amex_metric:0.78196


[260]	eval-amex_metric:0.78356


[280]	eval-amex_metric:0.78464


[300]	eval-amex_metric:0.78532


[320]	eval-amex_metric:0.78587


[340]	eval-amex_metric:0.78594


[360]	eval-amex_metric:0.78630


[380]	eval-amex_metric:0.78707


[400]	eval-amex_metric:0.78796


[420]	eval-amex_metric:0.78819


[440]	eval-amex_metric:0.78914


In [None]:
# one row of metrics per cross-validation repetition
results = list()

for oof in all_oof_dfs:
    # rows that were never part of a validation fold still hold the -1
    # sentinel instead of a prediction; leave them out of the scores
    scored = oof.loc[oof["pred"] != -1]
    r = {
        "recall_at4": compute_recall_at4(scored.target.values, scored.pred.values),
        "gini": compute_normalized_gini(scored.target.values, scored.pred.values),
        "metric": compute_amex_metric(scored.target.values, scored.pred.values),
    }
    results.append(r)

results = pd.DataFrame(results)
display(results)

print("\nmean:")
display(results.mean(axis=0))

print("\nstd:")
display(results.std(axis=0))

In [None]:
# release the training frame before the test set is loaded
del train
gc.collect()

***
## make predictions and submit

In [None]:
def make_predictions(
        dataframe:pd.DataFrame, 
        input_feats:list,
        categ_feats:list,
        encoders:list,
        models:list,
    ) -> np.ndarray:
    """Average the predictions of several (encoder, model) pairs.

    For each pair, the categorical columns of a copy of ``dataframe`` are
    encoded with the pair's encoder and the pair's model predicts on the
    ``input_feats`` columns. ``dataframe`` itself is not modified.

    Relies on the notebook-level constant ``MAX_ITERATIONS`` for the
    prediction ``iteration_range``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing the columns in ``input_feats`` and ``categ_feats``.
    input_feats : list
        Columns fed to the models.
    categ_feats : list
        Columns transformed by the encoders.
    encoders : list
        Fitted encoders, aligned with ``models``.
    models : list
        Trained boosters, aligned with ``encoders``.

    Returns
    -------
    np.ndarray
        Element-wise mean of the individual model predictions.

    Raises
    ------
    ValueError
        If ``encoders`` and ``models`` differ in length or are empty.
    """
    # zip() would silently drop the surplus items of the longer list
    if len(encoders) != len(models):
        raise ValueError(
            f"encoders and models must have the same length, "
            f"got {len(encoders)} and {len(models)}"
        )
    # averaging an empty list would return NaN instead of predictions
    if len(models) == 0:
        raise ValueError("at least one (encoder, model) pair is required")

    preds = list()
    for encoder,model in zip(encoders,models):
        # work on a copy so each encoder sees the original categorical values
        _dataframe = dataframe.copy()
        _dataframe[categ_feats] = encoder.transform(_dataframe[categ_feats])
        _dataframe_casted = xgb.DMatrix(data=_dataframe[input_feats])
        preds.append( model.predict(_dataframe_casted, iteration_range=(0,MAX_ITERATIONS)))
    return np.mean(preds, axis=0)

In [None]:
# processed test features (dataset version dsv02) and the submission template
# NOTE(review): later cells look rows up with test.loc[sub.customer_ID.values],
# so test is presumably indexed by customer_ID — confirm
test = pd.read_parquet("../data/processed/dsv02/test.parquet")
sub = pd.read_csv("../data/raw/sample_submission.csv")

In [None]:
%%time

# test-set predictions, one array per cross-validation repetition
all_preds = list()

for repetition in range(N_REPEATS):
    # start from frames without a "prediction" column left over from a previous pass
    for _frame in (sub, test):
        if "prediction" in _frame.columns:
            _frame.drop("prediction", axis=1, inplace=True)

    models = all_models[repetition]
    encoders = all_encoders[repetition]
    preds = make_predictions(test, input_feats, categ_feats, encoders, models)
    all_preds.append(preds)

    # align the predictions with the submission order through the test index
    test["prediction"] = preds
    sub["prediction"] = test.loc[sub.customer_ID.values,"prediction"].values
    assert sub.prediction.isna().sum() == 0
    sub.to_csv(SUB_PATH/f"submission-cv{repetition}.csv", index=False)

In [None]:
%%time
# predict using all the trained models
# drop any "prediction" column left over from the per-repetition submissions
for _frame in (sub, test):
    if "prediction" in _frame.columns:
        _frame.drop("prediction", axis=1, inplace=True)

# ensemble = plain average over the per-repetition prediction arrays
test["prediction"] = np.mean(all_preds, axis=0)
sub["prediction"] = test.loc[sub.customer_ID.values,"prediction"].values
assert sub.prediction.isna().sum() == 0
sub.to_csv(SUB_PATH/"submission-all.csv", index=False)

***