In [None]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time

import category_encoders as ce
import lightgbm as lgb
from sklearn import metrics

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric
#from messaging import send_message

# never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# metrics in lgbm format

def metric_recall_at4(y_pred: np.ndarray, data: lgb.Dataset):
    """Wrap `compute_recall_at4` in the LightGBM custom-metric call format.

    The true labels are read from `data`; the result is the triple
    `('recall_at4', score, True)`.
    """
    labels = data.get_label()
    score = compute_recall_at4(labels, y_pred)
    return ('recall_at4', score, True)

def metric_normalized_gini(y_pred: np.ndarray, data: lgb.Dataset):
    """Wrap `compute_normalized_gini` in the LightGBM custom-metric call format.

    The true labels are read from `data`; the result is the triple
    `('norm_gini', score, True)`.
    """
    labels = data.get_label()
    score = compute_normalized_gini(labels, y_pred)
    return ('norm_gini', score, True)

def metric_amex(y_pred: np.ndarray, data: lgb.Dataset):
    """Wrap `compute_amex_metric` in the LightGBM custom-metric call format.

    The true labels are read from `data`; the result is the triple
    `('amex_metric', score, True)`.
    """
    labels = data.get_label()
    score = compute_amex_metric(labels, y_pred)
    return ('amex_metric', score, True)

In [None]:
# CONFIG PARAMS
# one model is trained per seed; the number of boosting rounds is shared by all of them
MAX_ITERATIONS = 5_000
SEEDS = [
    2, 5, 7, 11,
    19, 23, 42,
]

In [None]:
# output folders: submissions (SUB_PATH) and saved models (ART_PATH)
SUB_PATH = Path("../data/subs/lgbm-dart-bce-dsv02-full")
ART_PATH = Path("../artifacts/lgbm-dart-bce-dsv02-full")

# create whichever output folder is still missing (parents included)
for _out_dir in (SUB_PATH, ART_PATH):
    if _out_dir.exists():
        continue
    _out_dir.mkdir(parents=True, exist_ok=True)

***
## load and prepare data

In [None]:
# list the files available in the processed dsv02 dataset folder
!ls ../data/processed/dsv02

In [None]:
# processed training features; the merge cell below joins on this frame's index
train = pd.read_parquet(path="../data/processed/dsv02/train.parquet")
# labels, indexed by customer_ID so they can be joined on the index
train_labels = pd.read_csv(
    filepath_or_buffer="../data/raw/train_labels.csv",
    index_col="customer_ID",
)

In [None]:
# Model inputs: every column of `train` except the label.
# "target" is excluded explicitly so that re-running this cell after the
# labels have been merged into `train` cannot leak the label into the features.
input_feats = [col for col in train.columns if col != "target"]
categ_feats = [
    'B_30_first', 'B_38_first', 'D_114_first', 'D_116_first', 'D_117_first',
    'D_120_first', 'D_126_first', 'D_63_first', 'D_64_first', 'D_66_first', 'D_68_first',
    'B_30_last', 'B_38_last', 'D_114_last', 'D_116_last', 'D_117_last',
    'D_120_last', 'D_126_last', 'D_63_last', 'D_64_last', 'D_66_last', 'D_68_last',
]

# fail here, not deep inside training, if the dataset lacks a declared categorical column
missing_categ = sorted(set(categ_feats) - set(input_feats))
assert not missing_categ, f"categorical features missing from train: {missing_categ}"

len(input_feats)

In [None]:
# attach the label columns to the features by joining on the index of both frames;
# the inner join keeps only rows present in both
train = train.merge(
    train_labels,
    how="inner",
    left_index=True,
    right_index=True,
)
# the standalone label frame is no longer needed
del train_labels
gc.collect()

***
## model training

Train on the full training set, once per seed in `SEEDS` (no cross-validation folds or hold-out set are used here).

In [None]:
# Base LightGBM parameters shared by every run.
model_params = dict(
    # booster / objective
    boosting='dart',
    objective='binary',
    metric='None',
    # tree shape and learning rate
    num_leaves=31,
    learning_rate=0.05,
    min_data_in_leaf=2000,
    path_smooth=30.4965047619009,
    # histogram construction
    max_bin=511,
    bin_construct_sample_cnt=100000000,
    # row / column subsampling
    bagging_freq=1,
    bagging_fraction=0.95,
    feature_fraction=0.15,
    # regularization
    lambda_l1=10.352308845012756,
    lambda_l2=1.569788743184169,
    # placeholder seed: the training loop overwrites it with each entry of SEEDS
    seed=2112,
    # misc
    force_col_wise=True,
    feature_pre_filter=False,
    verbosity=-1,
)

In [None]:
def train_model(dataframe:pd.DataFrame, model_params:dict) -> lgb.Booster:
    """Fit one LightGBM booster on every row of `dataframe` and plot its feature importances.

    The feature columns (`input_feats`), the categorical columns
    (`categ_feats`) and the number of boosting rounds (`MAX_ITERATIONS`) are
    read from notebook-level globals. No validation set or callback is passed
    to `lgb.train`, so training always runs for exactly `MAX_ITERATIONS` rounds.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain every column in `input_feats` plus a "target" column.
    model_params : dict
        LightGBM parameters forwarded unchanged to `lgb.train`.

    Returns
    -------
    lgb.Booster
        The trained model.
    """
    train_dset = lgb.Dataset(
        data=dataframe.loc[:,input_feats],
        label=dataframe.loc[:,"target"].values,
        categorical_feature=categ_feats,
        free_raw_data=True,
    )
    model = lgb.train(
        params=model_params,
        train_set=train_dset,
        num_boost_round=MAX_ITERATIONS,
    )

    # One figure per importance type; each gets its own title because the two
    # charts are otherwise indistinguishable in the cell output.
    for importance_type in ("split", "gain"):
        lgb.plot_importance(
            model,
            figsize=(8,15),
            importance_type=importance_type,
            max_num_features=30,
            title=f"Feature importance ({importance_type})",
        )
    plt.show()

    # drop the Dataset wrapper before returning so it can be reclaimed
    del train_dset
    gc.collect()

    return model

In [None]:
%%time 

all_models = list()

for it,seed in enumerate(SEEDS):
    print("#"*80)
    print(f" Training model {it+1} of {len(SEEDS)} ".center(80, "#"))
    print("#"*80)

    # copy the shared parameters so only this run's seed is changed
    _model_params = dict(model_params)
    _model_params["seed"] = seed

    tic = time.time()
    model = train_model(train, _model_params)
    tac = time.time()
    print(f"Training time: {(tac-tic)/60} min.")

    # Save each model as soon as it is trained: a crash or interrupt in a later
    # seed then no longer loses the models that already finished.
    # str(...) keeps the call working on LightGBM releases that only accept a
    # string filename (presumably older ones; harmless on newer releases).
    model.save_model(str(ART_PATH/f"model-seed{str(seed).zfill(2)}.txt"))

    all_models.append(model)

In [None]:
# release the training frame before the test set is loaded
del train
gc.collect()

***
## make predictions and submit

In [None]:
# processed test features; the prediction cells look rows up with
# test.loc[<customer_ID values>], so the index is expected to hold customer_IDs
test = pd.read_parquet(path="../data/processed/dsv02/test.parquet")
# submission template: supplies the customer_ID column that predictions are aligned to
sub = pd.read_csv(filepath_or_buffer="../data/raw/sample_submission.csv")

In [None]:
%%time

all_preds = list()

# The feature matrix is identical for every model: select it once, not per seed.
X_test = test[input_feats]

# Remove any existing "prediction" column once, so the column written below
# always ends up as the last column of the submission file.
if "prediction" in sub.columns:
    del sub["prediction"]

for seed,model in zip(SEEDS,all_models):
    preds = model.predict(X_test)
    all_preds.append(preds)

    # Align predictions to the submission's customer order through a Series
    # indexed like `test`, so `test` itself is not mutated with a scratch column.
    # .loc raises if a submission customer_ID is absent from `test`.
    seed_preds = pd.Series(preds, index=test.index)
    sub["prediction"] = seed_preds.loc[sub.customer_ID.values].values
    assert sub.prediction.isna().sum() == 0
    sub.to_csv(SUB_PATH/f"submission-seed{str(seed).zfill(2)}.csv", index=False)

# release the feature matrix selected above
del X_test

In [None]:
%%time
# blend: average the per-seed predictions of all the trained models
# start from frames without a leftover "prediction" column
for _frame in (sub, test):
    if "prediction" in _frame.columns:
        del _frame["prediction"]

blended_preds = np.mean(all_preds, axis=0)
test["prediction"] = blended_preds

# reorder the blended predictions to the submission's customer_ID order
submission_ids = sub.customer_ID.values
sub["prediction"] = test.loc[submission_ids, "prediction"].values
assert sub.prediction.isna().sum() == 0
sub.to_csv(SUB_PATH/f"submission-all.csv", index=False)

***