<a href="https://colab.research.google.com/github/meltyyyyy/kaggle-amex/blob/main/Notebooks/XGB/Aggregation001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
class Config:
    """Experiment-wide settings, read as class attributes (never instantiated here)."""

    # Experiment identifier; also used as the output sub-directory name.
    name: str = 'XGB/Agg002'

    # Cross-validation setup and label column.
    n_splits: int = 5
    seed: int = 2022
    target: str = 'target'

    # --- Google Colab environment ---
    upload_from_colab: bool = True
    api_path: str = '/content/drive/MyDrive/workspace/kaggle.json'
    drive_path: str = '/content/drive/MyDrive/workspace/kaggle-amex'

    # --- Kaggle environment ---
    kaggle_dataset_path = None

    # --- "Reka" environment: root directory for input/output/submissions ---
    dir_path: str = '/home/abe/kaggle/kaggle-amex'

In [None]:
import os
import json
import warnings
import shutil
import logging
import random
import datetime
import sys
import gc
import multiprocessing
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cupy, cudf
warnings.filterwarnings('ignore')

## Environment Setting

In [None]:
# Top-level working directories, all rooted at Config.dir_path.
INPUT, OUTPUT, SUBMISSION = (
    os.path.join(Config.dir_path, sub_dir)
    for sub_dir in ('input', 'output', 'submissions')
)

# Per-experiment artifact directories: <output>/<experiment name>/{model,fig,preds}.
OUTPUT_EXP = os.path.join(OUTPUT, Config.name)
EXP_MODEL, EXP_FIG, EXP_PREDS = (
    os.path.join(OUTPUT_EXP, sub_dir)
    for sub_dir in ("model", "fig", "preds")
)

# Create whatever is missing; the experiment directories bring OUTPUT and
# OUTPUT_EXP into existence as their parents.
for directory in (INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Training rows: load, parse the S_2 column as datetimes, then replace every
# missing value with the sentinel -127.
train = cudf.read_parquet(os.path.join(INPUT, 'train.parquet'))
train['S_2'] = cudf.to_datetime(train['S_2'])
train = train.fillna(-127)

# Labels, read from train_labels.csv.
target = cudf.read_csv(os.path.join(INPUT, 'train_labels.csv'))

# Test data is currently disabled; re-enable these lines together.
# test = cudf.read_parquet(os.path.join(INPUT, 'test.parquet'))
# test['S_2'] = cudf.to_datetime(test['S_2'])
# test = test.fillna(-127)

In [None]:
# Print a summary of the loaded training frame.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

## Feature Engineering

In [None]:
def add_agg_features(df):
    """Collapse `df` to one row per ``customer_ID`` using per-column aggregates.

    Feature engineering adapted from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Columns in the hard-coded categorical list are summarised with
    count / last / nunique; every other column except ``customer_ID`` and
    ``S_2`` is summarised with mean / std / min / max / last. Output columns
    are named ``<column>_<statistic>``.

    Args:
        df: Frame with a ``customer_ID`` column plus the feature columns.

    Returns:
        A frame indexed by ``customer_ID`` with the numeric aggregates
        followed by the categorical aggregates.
    """
    categorical = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    not_numeric = {'customer_ID', 'S_2', *categorical}
    numerical = [col for col in df.columns if col not in not_numeric]

    def _aggregate(columns, statistics):
        # Group by customer and flatten the (column, statistic) MultiIndex.
        aggregated = df.groupby("customer_ID")[columns].agg(statistics)
        aggregated.columns = ['_'.join(pair) for pair in aggregated.columns]
        return aggregated

    numeric_part = _aggregate(numerical, ['mean', 'std', 'min', 'max', 'last'])
    categorical_part = _aggregate(categorical, ['count', 'last', 'nunique'])
    return cudf.concat([numeric_part, categorical_part], axis=1)

# Replace the row-level training frame with one row of aggregates per customer.
train = add_agg_features(train)
# test = add_agg_features(test)

## Create target

In [None]:
# ADD TARGETS
# Attach the label to every aggregated customer row. Both frames are joined
# on their index (customer_ID); the label is then stored as int8.
train = train.merge(
    target.set_index('customer_ID'),
    left_index=True,
    right_index=True,
    how='left',
)
train['target'] = train['target'].astype('int8')
# The labels now live in `train`; drop the standalone frame.
del target

In [None]:
# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
# Sort by customer_ID so that the CV splits are deterministic (the original
# author notes that the cudf merge does not keep a stable row order), then
# move customer_ID from the index into the first column.
train = train.sort_index()
train = train.reset_index()

# Feature columns: everything between the first column (customer_ID) and the
# last column (the merged target).
features = train.columns[1:-1]

## Define DMatrix batch iterator and amex metric

In [None]:
import xgboost as xgb
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    """Batch iterator that feeds a DataFrame to XGBoost in row chunks.

    Each call to `next` slices the next `batch_size` rows, converts that
    slice to a cudf DataFrame and hands its feature and label columns to the
    callback supplied by XGBoost.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        """Store the data source and work out how many batches cover it.

        Args:
            df: Frame holding both the feature columns and the label column.
            features: Column names passed as `data`.
            target: Column name passed as `label`.
            batch_size: Maximum number of rows per batch.
        """
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Round up so that a final, shorter batch is still delivered.
        self.batches = int(np.ceil(len(df) / self.batch_size))
        self.it = 0  # index of the next batch to emit
        super().__init__()

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the next batch to `input_data`; return 1, or 0 when exhausted."""
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

In [None]:
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Return the mean of the normalised weighted Gini and the top-4% capture rate.

    Rows whose true label is 0 carry weight 20, all other rows weight 1.

    * Capture rate: rank rows by `y_pred` (descending), keep the leading rows
      whose cumulative weight stays within 4% of the total weight, and divide
      the sum of their labels by the sum of all labels.
    * Normalised Gini: weighted Gini of the ranking by `y_pred` divided by
      the weighted Gini of the ranking by `y_true`.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted scores, same length as `y_true`.

    Returns:
        ``0.5 * (normalised_gini + capture_rate)``.
    """
    # Column 0 holds the labels, column 1 the predictions.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked(column):
        # Rows ordered by `column`, largest first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        rows = _ranked(column)
        w = np.where(rows[:, 0] == 0, 20, 1)
        random_share = np.cumsum(w / np.sum(w))
        weighted_labels = rows[:, 0] * w
        lorentz = np.cumsum(weighted_labels) / np.sum(weighted_labels)
        return np.sum((lorentz - random_share) * w)

    by_pred = _ranked(1)
    w_by_pred = np.where(by_pred[:, 0] == 0, 20, 1)
    within_cutoff = np.cumsum(w_by_pred) <= int(0.04 * np.sum(w_by_pred))
    top_four = np.sum(by_pred[within_cutoff][:, 0]) / np.sum(by_pred[:, 0])

    gini_of_prediction = _weighted_gini(1)
    gini_of_truth = _weighted_gini(0)
    return 0.5 * (gini_of_prediction / gini_of_truth + top_four)

## Training

In [None]:
def plot_metric(result, metric=None):
    """Plot the train/valid learning curves and save them under EXP_FIG.

    Args:
        result: Evaluation history shaped like the ``evals_result`` dict
            filled by ``xgb.train``: ``result['train'][metric]`` and
            ``result['valid'][metric]`` are per-round value sequences.
        metric: Name of the metric to plot. When omitted, 'AmexMetric' is
            used if it was recorded; otherwise the last metric recorded for
            the 'train' set is used (e.g. 'logloss' when only the built-in
            ``eval_metric`` is configured).

    Raises:
        KeyError: If `metric` is missing from either evaluation set.
    """
    if metric is None:
        recorded = list(result['train'])
        # Previously 'AmexMetric' was hard-coded, which raised KeyError
        # whenever training only tracked a built-in metric.
        metric = 'AmexMetric' if 'AmexMetric' in recorded else recorded[-1]

    fig, ax = plt.subplots()
    # Distinct labels so the two curves can be told apart in the legend.
    ax.plot(result['train'][metric], label=f'train {metric}')
    ax.plot(result['valid'][metric], label=f'valid {metric}')
    ax.grid()
    ax.legend()
    ax.set_xlabel('rounds')
    ax.set_ylabel(metric)

    # Save BEFORE show: showing can release the figure, after which savefig
    # would write an empty canvas.
    fig.savefig(f'{EXP_FIG}/learning_curve.png')
    plt.show()
    plt.close(fig)

In [None]:
from sklearn.model_selection import StratifiedKFold

def fit_xgb(X, y, params=None):
    """Train one XGBoost booster per stratified CV fold and report the scores.

    For every fold the training rows are streamed to the GPU through
    `IterLoadForDMatrix`, a booster is trained with early stopping on the
    validation rows, the validation predictions are scored with
    `amex_metric_mod`, the learning curve is plotted and the booster is saved
    under EXP_MODEL.

    Args:
        X: pandas DataFrame of feature columns.
        y: pandas Series of labels, aligned row-for-row with `X`.
        params: Parameter dict forwarded to ``xgb.train``.

    Returns:
        List with the trained booster of each fold, in fold order.
    """
    feature_names = list(X.columns)
    models = []
    scores = []

    skf = StratifiedKFold(n_splits=Config.n_splits, shuffle=True, random_state=Config.seed)

    for fold, (train_indices, valid_indices) in enumerate(skf.split(X, y)):
        print("-"*50+f' fold{fold} '+'-'*50)

        # Use the X / y that were passed in (not module-level globals) and
        # select rows positionally: skf.split yields positions, so `.iloc` is
        # correct regardless of what the frame's index looks like.
        train_fold = X.iloc[train_indices].assign(
            **{Config.target: y.iloc[train_indices].to_numpy()}
        )
        Xy_train = IterLoadForDMatrix(train_fold, feature_names, Config.target)
        X_valid, y_valid = X.iloc[valid_indices], y.iloc[valid_indices]

        dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

        evals_result = {}
        model = xgb.train(params,
                    dtrain=dtrain,
                    evals=[(dtrain,'train'),(dvalid,'valid')],
                    num_boost_round=9999,
                    early_stopping_rounds=100,
                    evals_result=evals_result,
                    verbose_eval=100)

        # ------------------- prediction -------------------
        pred = model.predict(dvalid)
        score = amex_metric_mod(y_valid.values, pred)

        # ------------------- plot -------------------
        plot_metric(evals_result)

        # ------------------- save -------------------
        # NOTE(review): save_model writes XGBoost's own model format, not a
        # pickle, despite the .pkl extension; load it with Booster.load_model.
        file = f'{EXP_MODEL}/xgb_fold{fold}.pkl'
        model.save_model(file)
        scores.append(score)
        models.append(model)
        print(f'fold{fold} amex meric: {score}')

        # Drop the per-fold data before the next fold is materialised.
        del dtrain, Xy_train, train_fold
        del X_valid, y_valid, dvalid, model
        print()

    # Mean of the per-fold validation scores.
    print(f"OOF Score: {np.mean(scores):.5f}")
    return models

def inference_xgb(models, X):
    """Average the positive-class predictions of several models.

    Two model kinds are supported:

    * objects exposing ``predict_proba`` (scikit-learn style): column 1 of
      the returned array is used;
    * objects without it, such as the ``xgb.Booster`` instances returned by
      ``xgb.train``: ``predict`` is called on a ``DMatrix`` built from `X`
      and its output is used as-is.

    Args:
        models: Non-empty sequence of trained models.
        X: Feature data, or an already built ``xgb.DMatrix``.

    Returns:
        1-D numpy array with the per-row mean prediction across `models`.

    Raises:
        ValueError: If `models` is empty.
    """
    if len(models) == 0:
        raise ValueError("inference_xgb needs at least one model")

    dmatrix = None  # built lazily, once, and shared by all Booster models
    per_model = []
    for model in models:
        if hasattr(model, "predict_proba"):
            per_model.append(np.asarray(model.predict_proba(X))[:, 1])
            continue
        # Booster has no predict_proba; it predicts from a DMatrix.
        if dmatrix is None:
            dmatrix = X if isinstance(X, xgb.DMatrix) else xgb.DMatrix(X)
        per_model.append(np.asarray(model.predict(dmatrix)))

    return np.mean(np.array(per_model), axis=0)

In [None]:
# Booster parameters forwarded unchanged to xgb.train by fit_xgb.
xgb_params = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
    random_state=Config.seed,
)

# Move the aggregated frame from cudf to pandas before cross-validated training.
train = train.to_pandas()
models = fit_xgb(train[features], train[Config.target], params=xgb_params)

# Inference is currently disabled (test data is not loaded):
# models = [joblib.load(f'{EXP_MODEL}/lgbm_fold{i}.pkl') for i in range(Config.n_splits)]
# pred = inference_xgb(models, test[features])

## Plot importances

In [None]:
def plot_importances(models):
    """Bar-plot the feature importances of the first model in `models`.

    Works with scikit-learn style estimators (``feature_importances_``) and
    with ``xgb.Booster`` objects as returned by ``xgb.train``
    (``get_score(importance_type='gain')``).

    Args:
        models: Non-empty sequence of trained models; only ``models[0]`` is
            plotted.
    """
    model = models[0]
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        # Booster has no feature_importances_. get_score only lists features
        # that were used in at least one split, so the rest default to 0.
        # Keys are feature names, or 'f<i>' when the booster has no names.
        gain = model.get_score(importance_type='gain')
        importances = [
            gain.get(name, gain.get(f'f{position}', 0.0))
            for position, name in enumerate(features)
        ]

    importance_df = pd.DataFrame(importances,
                                 index=features,
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    # Keep the width positive even with fewer than four features.
    plt.subplots(figsize=(max(len(features) // 4, 1), 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()

# Show the feature importances for the models trained above.
plot_importances(models)

## Submission

In [None]:
# Build the submission table (customer_ID, prediction) and write it to
# SUBMISSION/submission.csv without the DataFrame index.
# NOTE(review): `test` and `pred` are only created by the test-loading and
# inference lines that are commented out earlier in this notebook; this cell
# raises NameError until they are re-enabled.
# Presumably `test.index` holds customer_ID after aggregation; confirm.
sub = pd.DataFrame({'customer_ID': test.index,
                    'prediction': pred})
sub.to_csv(f'{SUBMISSION}/submission.csv', index=False)

In [None]:
! kaggle competitions submit -c amex-default-prediction -f /content/drive/MyDrive/workspace/kaggle-amex/Submission/submission.csv -m "CatBoost Baseline"