In [1]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
from typing import Tuple

import category_encoders as ce
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric
#from messaging import send_message

pd.set_option('display.max_columns', None)

In [2]:
# metrics in xgboost format

def metric_recall_at4(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Custom xgboost evaluation metric wrapping `compute_recall_at4`.

    Reads the labels stored in `dtrain`, scores `predt` against them and
    returns the `(metric_name, value)` pair.
    """
    score = compute_recall_at4(dtrain.get_label(), predt)
    return 'recall_at4', score

def metric_normalized_gini(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Custom xgboost evaluation metric wrapping `compute_normalized_gini`.

    Reads the labels stored in `dtrain`, scores `predt` against them and
    returns the `(metric_name, value)` pair.
    """
    score = compute_normalized_gini(dtrain.get_label(), predt)
    return 'norm_gini', score

def metric_amex(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """Custom xgboost evaluation metric wrapping `compute_amex_metric`.

    Reads the labels stored in `dtrain`, scores `predt` against them and
    returns the `(metric_name, value)` pair.
    """
    score = compute_amex_metric(dtrain.get_label(), predt)
    return 'amex_metric', score

In [3]:
# CONFIG PARAMS
# number of cross-validation repetitions; repetition i reads ../data/processed/cv{i}.csv
N_REPEATS = 3
# upper bound on boosting rounds; the early-stopping patience is 10% of this value
MAX_ITERATIONS = 3000
# version tag of the processed dataset; also embedded in the output directory names
DATASET_VERSION = "02"

In [4]:
# output locations: out-of-fold predictions, submissions and model artifacts
OOF_PATH = Path(f"../data/oof/xgboost-gblinear-dsv{DATASET_VERSION}")
SUB_PATH = Path(f"../data/subs/xgboost-gblinear-dsv{DATASET_VERSION}")
ART_PATH = Path(f"../artifacts/xgboost-gblinear-dsv{DATASET_VERSION}")

# create whichever output directory is still missing
for _out_dir in (OOF_PATH, SUB_PATH, ART_PATH):
    if _out_dir.exists():
        continue
    _out_dir.mkdir(parents=True, exist_ok=True)

***
## load and prepare data

In [5]:
# processed feature table for the configured dataset version
# NOTE(review): presumably indexed by customer_ID, since it is merged with the labels on the index below — confirm
train = pd.read_parquet(f"../data/processed/dsv{DATASET_VERSION}/train.parquet")
# labels file, indexed by customer_ID
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [6]:
# every column of the loaded feature table is used as a model input
input_feats = train.columns.tolist()

# each categorical variable appears twice: once with a `_first` and once with a `_last` suffix
_categ_base = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117',
    'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
categ_feats = [
    f"{name}_{suffix}"
    for suffix in ("first", "last")
    for name in _categ_base
]

# the remaining inputs are treated as numeric
_categ_lookup = set(categ_feats)
numeric_feats = [col for col in input_feats if col not in _categ_lookup]

len(input_feats)

1562

In [7]:
# attach the labels to the features (inner join on the shared index)
train = train.merge(train_labels, how="inner", left_index=True, right_index=True)
# the labels now live inside `train`, so release the standalone frame
del train_labels
gc.collect()

517

***
## model training

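Train with repeated cross-validation: each repetition loads a different pre-computed fold assignment (`cv{i}.csv`), trains one model per fold, and produces its own out-of-fold predictions.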

In [8]:
# parameters handed to xgb.train for every fold of every repetition
model_params = {
    # general parameters
    'booster':'gblinear',  # linear booster instead of trees
    'objective':'binary:logistic',
    'disable_default_eval_metric':True,  # only the custom metric passed to xgb.train is evaluated
    'seed':2112,
    'eta': 0.05,
    # linear booster parameters
    'updater': 'coord_descent',
    'feature_selector': 'thrifty',
    #'feature_selector': 'cyclic',
    'top_k': 150,  # number of top features used by the 'thrifty' selector (see xgboost linear booster docs)
    #'alpha':  1.0,
    #'lambda': 0.1,
}

In [9]:
def train_models(dataframe: pd.DataFrame, n_folds: int = 5,) -> tuple:
    """Train one gblinear model per cross-validation fold.

    For every fold the categorical features are target-encoded with a
    `GLMMEncoder` and all inputs are standardized; both transformers are
    fitted on the training part of the fold only and then applied to the
    validation part.

    Relies on the notebook-level names `input_feats`, `categ_feats`,
    `model_params`, `MAX_ITERATIONS` and `metric_amex`.

    Parameters
    ----------
    dataframe:
        Must contain the columns in `input_feats` plus a `target` column and
        a `fold` column holding the fold id (0 .. n_folds-1) of each row.
    n_folds:
        Number of folds to iterate over.

    Returns
    -------
    tuple
        `(models, encoders, scalers, oof)`: three lists with one entry per
        fold, and a DataFrame with the columns `target` and `pred` holding
        the out-of-fold predictions (rows never used for validation keep the
        placeholder value -1.0).
    """
    models = []
    encoders = []
    scalers = []

    # dataframe to store the oof predictions; the placeholder is a float so that
    # writing float predictions into the column never requires a dtype change
    oof = dataframe[["target"]].copy()
    oof["pred"] = -1.0

    for fold in range(n_folds):

        print(f" training model {fold+1}/{n_folds} ".center(100, "#"))

        train_df = dataframe.query("fold != @fold").copy()
        valid_df = dataframe.query("fold == @fold").copy()

        # target encoding is fitted on the training rows only
        encoder = ce.glmm.GLMMEncoder()
        encoder.fit(train_df[categ_feats], train_df["target"].values)
        train_df[categ_feats] = encoder.transform(train_df[categ_feats])
        valid_df[categ_feats] = encoder.transform(valid_df[categ_feats])

        # standardization is fitted after encoding, again on the training rows only
        scaler = StandardScaler(with_mean=True, with_std=True)
        scaler.fit(train_df[input_feats].values)
        train_df[input_feats] = scaler.transform(train_df[input_feats].values)
        valid_df[input_feats] = scaler.transform(valid_df[input_feats].values)

        train_dset = xgb.DMatrix(
            data=train_df.loc[:,input_feats],
            label=train_df.loc[:,"target"].values,
        )
        valid_dset = xgb.DMatrix(
            data=valid_df.loc[:,input_feats],
            label=valid_df.loc[:,"target"].values,
        )
        model = xgb.train(
            params = model_params,
            dtrain=train_dset,
            num_boost_round=MAX_ITERATIONS,
            # stop when the validation metric has not improved for 10% of the round budget
            early_stopping_rounds=int(0.1*MAX_ITERATIONS),
            evals=[(valid_dset,"eval"), ],
            custom_metric=metric_amex,
            maximize=True,
            verbose_eval=20,
        )

        # NOTE(review): a linear booster keeps a single weight vector, so this presumably
        # predicts with the weights of the last executed round rather than the best
        # early-stopping round — confirm against the xgboost documentation.
        oof.loc[valid_df.index,"pred"] = model.predict(valid_dset)

        models.append(model)
        encoders.append(encoder)
        scalers.append(scaler)
        del train_df,valid_df,train_dset,valid_dset
        gc.collect()

    return models,encoders,scalers,oof

In [10]:
# implement repeated cross validation
# list the available fold-assignment files; repetition i of the training loop reads cv{i}.csv
sorted(glob("../data/processed/cv*.csv"))

['../data/processed/cv0.csv',
 '../data/processed/cv1.csv',
 '../data/processed/cv2.csv',
 '../data/processed/cv3.csv',
 '../data/processed/cv4.csv',
 '../data/processed/cv5.csv',
 '../data/processed/cv6.csv',
 '../data/processed/cv7.csv',
 '../data/processed/cv8.csv',
 '../data/processed/cv9.csv']

In [11]:
%%time 

# per-repetition results: entry i holds the fold-level objects of repetition i
all_models = list()
all_encoders = list()
all_scalers = list()
all_oof_dfs = list()

for repetition in range(N_REPEATS):
    print(f" repeated cross-validation step: {repetition+1}/{N_REPEATS} ".center(100, "#"))

    # fold assignment of this repetition, joined to the training data on customer_ID
    folds = pd.read_csv(f'../data/processed/cv{repetition}.csv', index_col="customer_ID")
    # reset_index(drop=True) discards customer_ID: from here on rows are identified by position only
    _train = pd.merge(train, folds, how="inner", left_index=True, right_index=True).reset_index(drop=True)
    
    tic = time.time()
    # NOTE: models/encoders/scalers/oof stay bound to the LAST repetition's results after the loop
    models,encoders,scalers,oof = train_models(_train, n_folds=5)
    tac = time.time()
    print(f"Training time: {(tac-tic)/60} min.")
          
    # oof metrics
    print("OOF recall_at4:", compute_recall_at4(oof.target.values, oof.pred.values))
    print("OOF normalized_gini:", compute_normalized_gini(oof.target.values, oof.pred.values))
    print("OOF competition metric:", compute_amex_metric(oof.target.values, oof.pred.values))
    
    all_models.append(models)
    all_encoders.append(encoders)
    all_scalers.append(scalers)
    all_oof_dfs.append(oof)
    
    # save oof predictions
    # NOTE(review): written without the index, so the file holds only `target` and `pred`
    # (no customer_ID); rows follow the order of `_train` — confirm downstream readers expect this
    oof.to_csv(OOF_PATH/f"oof-cv{repetition}.csv", index=False)
    # save models
    # NOTE(review): model saving is disabled, so nothing is written to ART_PATH by this loop
    #for fold,_model in enumerate(models):
    #    _model.save_model(ART_PATH/f"/model-cv{repetition}-fold{fold}.cbm", format="cbm")
          
    # release the merged copy before the next repetition builds its own
    del _train, folds; gc.collect()

############################### repeated cross-validation step: 1/3 ################################
######################################## training model 1/5 ########################################
[0]	eval-amex_metric:0.69738
[20]	eval-amex_metric:0.77446
[40]	eval-amex_metric:0.77874
[60]	eval-amex_metric:0.78210
[80]	eval-amex_metric:0.78316
[100]	eval-amex_metric:0.78409
[120]	eval-amex_metric:0.78444
[140]	eval-amex_metric:0.78492
[160]	eval-amex_metric:0.78511
[180]	eval-amex_metric:0.78470
[200]	eval-amex_metric:0.78493
[220]	eval-amex_metric:0.78467
[240]	eval-amex_metric:0.78457
[260]	eval-amex_metric:0.78479
[280]	eval-amex_metric:0.78492
[300]	eval-amex_metric:0.78476
[320]	eval-amex_metric:0.78451
[340]	eval-amex_metric:0.78437
[360]	eval-amex_metric:0.78448
[380]	eval-amex_metric:0.78465
[400]	eval-amex_metric:0.78453
[420]	eval-amex_metric:0.78454
[440]	eval-amex_metric:0.78452
[460]	eval-amex_metric:0.78446
######################################## training model 2/5 

In [12]:
# one row of out-of-fold scores per cross-validation repetition
results = pd.DataFrame([
    {
        "recall_at4": compute_recall_at4(oof_df.target.values, oof_df.pred.values),
        "gini": compute_normalized_gini(oof_df.target.values, oof_df.pred.values),
        "metric": compute_amex_metric(oof_df.target.values, oof_df.pred.values),
    }
    for oof_df in all_oof_dfs
])
display(results)

# spread of the scores across repetitions
print("\nmean:")
display(results.mean(axis=0))

print("\nstd:")
display(results.std(axis=0))

Unnamed: 0,recall_at4,gini,metric
0,0.645412,0.917261,0.781336
1,0.646775,0.917258,0.782016
2,0.645883,0.917319,0.781601



mean:


recall_at4    0.646023
gini          0.917279
metric        0.781651
dtype: float64


std:


recall_at4    0.000692
gini          0.000035
metric        0.000343
dtype: float64

In [13]:
# the training frame is not used by the prediction cells below; drop the reference before loading the test set
del train
gc.collect()

21

***
## make predictions and submit

In [14]:
def make_predictions(
        dataframe:pd.DataFrame, 
        input_feats:list,
        categ_feats:list,
        encoders:list,
        scalers:list,
        models:list,
    ) -> np.ndarray:
    """Average the predictions of several (encoder, scaler, model) triplets.

    Each model is applied to its own transformed copy of `dataframe`: the
    categorical columns are encoded with the matching encoder, then all input
    columns are scaled with the matching scaler. `dataframe` itself is not
    modified.

    Parameters
    ----------
    dataframe:
        Rows to score; must contain the columns in `input_feats`.
    input_feats:
        Names of all model input columns.
    categ_feats:
        Names of the columns transformed by the encoders.
    encoders, scalers, models:
        Parallel lists; position i of each list belongs to the same fitted
        pipeline.

    Returns
    -------
    np.ndarray
        Element-wise mean of the individual model predictions.

    Raises
    ------
    ValueError
        If the three lists differ in length (zip would silently drop the
        surplus entries) or if they are empty (nothing to average).
    """
    if not (len(encoders) == len(scalers) == len(models)):
        raise ValueError(
            "encoders, scalers and models must have the same length, got "
            f"{len(encoders)}, {len(scalers)} and {len(models)}"
        )
    if len(models) == 0:
        raise ValueError("at least one model is required to make predictions")

    preds = []
    for encoder,scaler,model in zip(encoders,scalers,models):
        # work on a copy: the transforms below overwrite the input columns
        _dataframe = dataframe.copy()
        _dataframe[categ_feats] = encoder.transform(_dataframe[categ_feats])
        _dataframe[input_feats] = scaler.transform(_dataframe[input_feats].values)
        _dataframe_casted = xgb.DMatrix(data=_dataframe[input_feats])
        preds.append(model.predict(_dataframe_casted))
    return np.mean(preds, axis=0)

In [15]:
# processed test features for the configured dataset version
# NOTE(review): presumably indexed by customer_ID, since later cells look rows up with sub.customer_ID — confirm
test = pd.read_parquet(f"../data/processed/dsv{DATASET_VERSION}/test.parquet")
# sample submission; its customer_ID column defines the row order of the output files
sub = pd.read_csv("../data/raw/sample_submission.csv")

In [16]:
%%time

# one averaged prediction vector per cross-validation repetition
all_preds = list()

for repetition in range(N_REPEATS):
    # start every repetition from frames without a stale "prediction" column
    if "prediction" in sub.columns:
        sub.drop("prediction", axis=1, inplace=True)
    if "prediction" in test.columns:
        test.drop("prediction", axis=1, inplace=True)
        
    # take models, encoders AND scalers from the same repetition: every model must
    # receive inputs transformed exactly as they were during its own training
    models = all_models[repetition]
    encoders = all_encoders[repetition]
    scalers = all_scalers[repetition]
    preds = make_predictions(test, input_feats, categ_feats, encoders, scalers, models)
    all_preds.append(preds)
       
    test["prediction"] = preds
    # reorder the predictions to follow the sample submission
    sub["prediction"] = test.loc[sub.customer_ID.values,"prediction"].values
    assert sub.prediction.isna().sum() == 0
    sub.to_csv(SUB_PATH/f"submission-cv{repetition}.csv", index=False)

CPU times: user 24min 2s, sys: 2min 42s, total: 26min 44s
Wall time: 11min 5s


In [17]:
%%time
# predict using all the trained models
# remove the per-repetition predictions left behind by the previous cell
for _frame in (sub, test):
    if "prediction" in _frame.columns:
        _frame.drop(columns="prediction", inplace=True)

# blend: element-wise mean over the per-repetition prediction vectors
test["prediction"] = np.mean(all_preds, axis=0)
# reorder the blended predictions to follow the sample submission
sub["prediction"] = test.loc[sub.customer_ID.values, "prediction"].values
assert sub.prediction.isna().sum() == 0
sub.to_csv(SUB_PATH / "submission-all.csv", index=False)

CPU times: user 4.37 s, sys: 2.29 s, total: 6.66 s
Wall time: 6.66 s


***