## Import Libraries

#### Change from version 1: we now use the RAPIDS libraries (cuDF, cuML, CuPy) so that data processing runs on the GPU

In [2]:
# Basic
import numpy as np
import pandas as pd
import os
import pyarrow
from datetime import datetime
from datetime import datetime, date
import joblib
import csv
import math
import pickle
from scipy import stats
import numpy as np
import statistics as st
import gc
from IPython.display import clear_output

# ML scikit-learn
import sklearn as sk
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ParameterGrid, KFold

# XGBoost
import xgboost as xgb
import optuna

In [3]:
# Additional GPU libraries
import cudf
import cupy
import cuml
from cuml.model_selection import GridSearchCV, train_test_split
cudf.set_allocator("managed")

## Section 1: Create functions and pipeline classes

In [4]:
def stringToDate(X, date_col):
    """Parse the `date_col` column of `X` into datetimes and return `X`.

    The parsed column is written back onto the frame that was passed in, so
    the caller's object is modified as well as returned.
    """
    parsed_dates = cudf.to_datetime(X[date_col], infer_datetime_format = True)
    X[date_col] = parsed_dates
    return X

In [5]:
def createFeatures(X, cat_cols, num_cols):
    """Aggregate statement-level rows into one feature row per customer.

    Parameters
    ----------
    X : cudf.DataFrame
        Frame containing `customer_ID`, `S_2` and every column named in
        `cat_cols` and `num_cols`.
    cat_cols : list of str
        Columns aggregated with count / last / first / nunique.
    num_cols : list of str
        Columns aggregated with mean / median / std / min / max / last / first.

    Returns
    -------
    cudf.DataFrame
        The aggregated numeric and categorical features merged on
        `customer_ID`, with missing values filled.
    """

    # sort dataframe
    # NOTE(review): the sort is descending on S_2, so if the groupby keeps this
    # row order inside each customer, 'first' is the most recent statement and
    # 'last' the oldest. Left unchanged so that features stay comparable with
    # models already trained on them - confirm that this is intended.
    X = X.sort_values(by = ['customer_ID', 'S_2'], ascending = False)

    ## create features from numeric cols
    X_num = X.groupby("customer_ID", as_index = False)[num_cols].agg(['mean', 'median', 'std', 'min', 'max', 'last', 'first'])
    X_num.columns = ['_'.join(x) for x in X_num.columns]
    X_num = X_num.reset_index()

    # fill na values with the column mean
    # (separate local name: the `num_cols` parameter is no longer rebound)
    num_feature_cols = [col for col in X_num.columns if col not in ['customer_ID']]
    for col in num_feature_cols:
        # assign the result back explicitly; an in-place fillna on the Series
        # returned by X_num[col] is not guaranteed to write through to X_num
        X_num[col] = X_num[col].fillna(X_num[col].mean().astype(cupy.float32))

    ## create features from categorical cols
    X_cat = X.groupby("customer_ID", as_index = False)[cat_cols].agg(['count', 'last', 'first', 'nunique'])
    X_cat.columns = ['_'.join(x) for x in X_cat.columns]
    X_cat = X_cat.reset_index()

    # fill na values with the most frequent value of the column
    cat_feature_cols = [col for col in X_cat.columns if col not in ['customer_ID']]
    for col in cat_feature_cols:
        # pandas' Series.mode ignores nulls and returns the modes sorted, so the
        # first entry is the smallest most-frequent value. This replaces
        # stats.mode(...)[0][0], whose double indexing depends on the return
        # shape of older SciPy releases.
        modes = X_cat[col].to_pandas().mode(dropna = True)
        if len(modes) > 0:  # an all-null column has no mode; leave it as is
            X_cat[col] = X_cat[col].fillna(modes.iloc[0])

    # merge the dataframes
    X_updated = cudf.merge(X_num, X_cat, on = 'customer_ID', how = 'outer')

    # fill na after merging
    # NOTE(review): ffill / bfill copy values from the neighbouring rows, i.e.
    # from other customers. Kept as in the original; verify this is wanted.
    X_updated.fillna(method = 'ffill', inplace = True)
    X_updated.fillna(method = 'bfill', inplace = True)

    # remove intermediate dfs
    del X_num, X_cat

    # return updated df
    return X_updated

## Section 2: Read Data

In [6]:
# initialize project directory
project_dir = '/kaggle'

In [7]:
# list file names in input directory
for dirname, _, filenames in os.walk(os.path.join(project_dir, 'input')):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/model-xgb-v2/model_xbg_v2.pkl
/kaggle/input/model-xgb-v3/model_xbg_v2.json
/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-prediction-model-xgb/model_xgb.json
/kaggle/input/model-xgb-20220616/model_xgb_20220616.json
/kaggle/input/amex-default-prediction-v1/__results__.html
/kaggle/input/amex-default-prediction-v1/__notebook_source__.ipynb
/kaggle/input/amex-default-prediction-v1/__notebook__.ipynb
/kaggle/input/amex-default-prediction-v1/__output__.json
/kaggle/input/amex-default-prediction-v1/custom.css
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [8]:
## path to files
# Train
train_X_path = '/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet'
train_y_path = '/kaggle/input/amex-default-prediction/train_labels.csv'

# Test
test_X_path = '/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet'

In [None]:
# read train data
df_train_X = cudf.read_parquet(train_X_path)

In [None]:
# view sample
print(df_train_X.shape)
df_train_X.head(5)

## Section 3: Feature engineering

In [None]:
# no of folds
FOLDS = 5

In [None]:
# initialize relevant cols
all_cols = [c for c in list(df_train_X.columns) if c not in ['customer_ID','S_2']]

cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
num_features = [col for col in all_cols if col not in cat_features]

date_field = 'S_2'

In [None]:
# step 1: change date string to date
df_train_X = stringToDate(df_train_X, date_field)

In [None]:
# step 2: create features
df_train_X = createFeatures(df_train_X, cat_features, num_features)

In [None]:
# sort data
df_train_X = df_train_X.sort_values(by = 'customer_ID')

In [None]:
# view sample
print(df_train_X.shape)
df_train_X.head()

## Section 4: Train Models

### Read labels

In [None]:
# read data
df_train_y = cudf.read_csv(train_y_path)

In [None]:
# sort updated df by customer id
df_train_y = df_train_y.sort_values(by = 'customer_ID')

In [None]:
print(df_train_y.shape)
df_train_y.head()

In [None]:
df_train_y['target'].value_counts() / df_train_y.shape[0]

In [None]:
# merge with X
df_train = cudf.merge(df_train_X, df_train_y, on = 'customer_ID', how = 'left')
print(df_train.shape)

In [None]:
# train validation split
X_train, X_val, y_train, y_val = train_test_split(X = df_train.drop(columns = ['target']),
                                                  y = df_train['target'],
                                                  test_size = 0.25, random_state = 42)

In [None]:
# check incidence rate
print(y_train.sum() / y_train.count(), y_val.sum() / y_val.count())

In [None]:
# function for custom evaluation metric for the amex competition (specifically for xgboost)
def amex_metric_xgboost(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Evaluation metric in the ``(predt, dtrain) -> (name, value)`` form.

    The value is the mean of two quantities, both computed with a weight of
    20 on rows whose label is 0 and a weight of 1 on every other row:

    * the weighted Gini of the predictions divided by the weighted Gini
      obtained when the rows are ranked by their true label, and
    * the share of all label-1 rows that fall inside the highest-scored rows
      whose cumulative weight stays within 4% of the total weight.

    Parameters
    ----------
    predt : array of predicted scores, one per row of `dtrain`.
    dtrain : object whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``('Amex_Metric', value)``.
    """

    # labels and scores side by side, built once for both sub-metrics
    scored = pd.DataFrame(data = {'target' : dtrain.get_label(),
                                  'prediction' : predt})

    def ranked_with_weights(order_by: str) -> pd.DataFrame:
        # rows from highest to lowest `order_by`, with the row weights attached
        ranked = scored.sort_values(order_by, ascending = False)
        # vectorised; a per-row apply here is slow and this metric is
        # evaluated on every boosting round
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(ranked: pd.DataFrame) -> float:
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        in_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        return (in_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(ranked: pd.DataFrame) -> float:
        weighted_pos = ranked['target'] * ranked['weight']
        random = (ranked['weight'] / ranked['weight'].sum()).cumsum()
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * ranked['weight']).sum()

    by_prediction = ranked_with_weights('prediction')
    # ranking by the label itself gives the Gini used for normalisation
    by_target = ranked_with_weights('target')

    g = weighted_gini(by_prediction) / weighted_gini(by_target)
    d = top_four_percent_captured(by_prediction)

    return ('Amex_Metric', 0.5 * (g + d))

In [None]:
# # function for custom evaulation metric for the amex competition (generic)
# def amex_metric_generic(y_true: cupy.ndarray, y_pred: cupy.ndarray):
    
#     # convert to pandas dataframe
#     y_true = cudf.DataFrame(data = {'target' : y_true}).to_pandas()
#     y_pred = cudf.DataFrame(data = {'prediction' : y_pred}).to_pandas()

#     def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = pd.concat([y_true, y_pred], axis = 'columns').sort_values('prediction', ascending=False)
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         four_pct_cutoff = int(0.04 * df['weight'].sum())
#         df['weight_cumsum'] = df['weight'].cumsum()
#         df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
#         return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
#     def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         df = pd.concat([y_true, y_pred], axis = 'columns').sort_values('prediction', ascending=False)
#         df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
#         df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
#         total_pos = (df['target'] * df['weight']).sum()
#         df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
#         df['lorentz'] = df['cum_pos_found'] / total_pos
#         df['gini'] = (df['lorentz'] - df['random']) * df['weight']
#         return df['gini'].sum()

#     def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
#         y_true_pred = y_true.rename(columns={'target': 'prediction'})
#         return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

#     g = normalized_weighted_gini(y_true, y_pred)
#     d = top_four_percent_captured(y_true, y_pred)
    
#     amex_metric = 0.5 * (g + d)
    
#     return amex_metric

### 4.1 XGBoost

#### 4.1.1 Train single model using sklearn API

In [None]:
# initialize param grid
param_grid_xgb = {'n_estimators' : [100],
                  'max_depth' : [1],
                  'subsample' : [0.5],
                  'learning_rate' : [0.05],
                  'colsample_bytree' : [0.5]}

In [None]:
# model obect
model_xgb = xgb.XGBClassifier(objective = 'binary:logistic',
                              predictor = 'gpu_predictor',
                              tree_method = 'gpu_hist',
                              sampling_method = 'gradient_based',
                              verbosity = 2)

In [None]:
# grid search
grid_search_xgb = GridSearchCV(estimator = model_xgb,
                               param_grid = param_grid_xgb,
                               cv = 5,
                               verbose = 2)

In [None]:
grid_search_xgb.fit(X = X_train.drop(columns = 'customer_ID').to_pandas(),
                    y = cupy.asarray(y_train).get())

#### 4.1.1 Train single model using the native XGBoost API (`xgb.train`)

In [None]:
# convert data to DMatrix
dtrain = xgb.DMatrix(X_train.drop(columns = 'customer_ID'), 
                     label = y_train)

dval = xgb.DMatrix(X_val.drop(columns = 'customer_ID'), 
                   label = y_val)

In [None]:
# train the model
model_xgb = xgb.train(params = {'tree_method' : 'gpu_hist',
                                'objective': 'binary:logistic',
                                'verbosity' : 2,
                                'max_depth' : 3,
                                'subsample' : 0.5,
                                'eta' : 0.05,
                                'sampling_method' : 'gradient_based',
                                'colsample_bytree' : 0.5,
                                'predictor':'gpu_predictor',
                                'disable_default_eval_metric' : 1},
                      dtrain = dtrain,
                      num_boost_round = 7000,
                      evals = [(dtrain, 'train'),
                               (dval, 'validation')],
                      early_stopping_rounds = 100,
                      verbose_eval = 100,
                      custom_metric = amex_metric_xgboost,
                      maximize = True)

In [None]:
# check model attributes
model_xgb.attributes()

In [None]:
# save the model
model_xgb.save_model(f"model_xgb_{date.today()}.json")

#### 4.1.2 Train multiple models using K-fold strategy

In [None]:
# initialize the folds
kf = KFold(n_splits = FOLDS, shuffle = True, random_state = 42)
# materialise the splits: kf.split() returns a one-shot generator, so without
# list() a second pass over `kfolds` would silently iterate over nothing
kfolds = list(kf.split(df_train))

In [None]:
# train k models over k folds

# best validation score per fold, keyed by 1-based fold number
evals = {}

# one parameter set shared by every fold
xgb_fold_params = {'tree_method' : 'gpu_hist',
                   'objective': 'binary:logistic',
                   'verbosity' : 2,
                   'max_depth' : 3,
                   'subsample' : 0.5,
                   'eta' : 0.05,
                   'sampling_method' : 'gradient_based',
                   'colsample_bytree' : 0.5,
                   'predictor':'gpu_predictor',
                   'disable_default_eval_metric' : 1}

# kf.split() is called here rather than reusing a stored generator, so that
# this cell still trains every fold when it is run a second time
for k, (train_idx, val_idx) in enumerate(kf.split(df_train)):

    # KFold yields row positions, so select with iloc; loc would look the same
    # numbers up as index labels. Fold-local names keep the X_train / X_val /
    # dtrain / dval objects of the single-model cells intact.
    fold_train = df_train.iloc[train_idx]
    fold_val = df_train.iloc[val_idx]

    # convert datasets to DMatrix
    fold_dtrain = xgb.DMatrix(fold_train.drop(columns = ['customer_ID', 'target']),
                              label = fold_train['target'])

    fold_dval = xgb.DMatrix(fold_val.drop(columns = ['customer_ID', 'target']),
                            label = fold_val['target'])

    # print status
    clear_output(wait = True)
    print(f'###########  Training model {k+1}  ###########')

    # train the model
    model_xgb = xgb.train(params = xgb_fold_params,
                          dtrain = fold_dtrain,
                          num_boost_round = 2000,
                          evals = [(fold_dtrain, 'train'),
                                   (fold_dval, 'validation')],
                          early_stopping_rounds = 100,
                          verbose_eval = 100,
                          custom_metric = amex_metric_xgboost,
                          maximize = True)

    # save the model
    model_xgb.save_model(f"model_xgb_fold_{k+1}_{date.today()}.json")

    # update evals dict; best_score is the documented accessor for the score
    # recorded by early stopping
    evals[k+1] = float(model_xgb.best_score)

    # free up memory
    del fold_train, fold_val, fold_dtrain, fold_dval
    gc.collect()

In [None]:
# view results for each model
evals

## Section 5: Predictions on test data

In [None]:
# define chunk size
chunk_size = 50000


def score_test_data(full_df, chunk_size, model_obj, date_field, cat_features, num_features, fold):
    """Score `full_df` with `model_obj`, `chunk_size` customers at a time.

    Each chunk holds all rows of a group of customers. It is passed through
    `stringToDate` and `createFeatures`, wrapped in an `xgb.DMatrix` and
    scored with `model_obj.predict`.

    Parameters
    ----------
    full_df : cudf.DataFrame with a `customer_ID` column plus the raw columns.
    chunk_size : number of distinct customers per chunk.
    model_obj : object with a `predict(data=...)` method accepting a DMatrix.
    date_field : name of the date column handed to `stringToDate`.
    cat_features, num_features : column lists handed to `createFeatures`.
    fold : 0-based fold number; used in the progress message and in the name
        of the prediction column (`prediction_<fold+1>`).

    Returns
    -------
    cudf.DataFrame
        Columns `customer_ID` and `prediction_<fold+1>`, the per-chunk results
        concatenated in processing order.
    """

    # print
    clear_output(wait = True)
    print(f'######### Prediction using model {fold+1} #########')

    # unique customer ids, de-duplicated on the GPU instead of copying every
    # id into a Python list and set
    test_customer_ids = full_df['customer_ID'].drop_duplicates()

    # create list of prediction dfs
    chunk_pred_list = []

    # read data in chunks; chunk_count is the 1-based chunk number
    chunk_starts = range(0, len(test_customer_ids), chunk_size)
    for chunk_count, start in enumerate(chunk_starts, start = 1):

        # print chunk counter
        print(f'Processing chunk {chunk_count}')

        # get the customer id chunk
        customer_id_chunk = test_customer_ids.iloc[start : start + chunk_size]

        # get the data chunk
        chunk = full_df[full_df['customer_ID'].isin(customer_id_chunk)]

        # process the chunks
        # step 1: change date string to date
        chunk = stringToDate(chunk, date_field)

        # step 2: create features
        chunk = createFeatures(chunk, cat_features, num_features)

        # convert to DMatrix (for XGBoost only)
        dtest = xgb.DMatrix(chunk.drop(columns = 'customer_ID'))

        # make predictions
        chunk_pred = model_obj.predict(data = dtest)

        # merge with customer ids
        chunk_pred_df = cudf.DataFrame(data = {'customer_ID' : chunk['customer_ID'],
                                               f'prediction_{fold+1}' : chunk_pred})

        # add to chunk prediction list
        chunk_pred_list.append(chunk_pred_df)

        # drop the references to this chunk before the next one is built
        del chunk, dtest, chunk_pred, chunk_pred_df

    # print statement
    print('All chunks processed, merging individual chunks')

    # concatenate chunk predictions
    df_test_pred = cudf.concat(chunk_pred_list)

    # return prediction df
    return df_test_pred

In [None]:
# predictions on test data: score the test set with each saved fold model and
# collect the results side by side, one prediction column per fold

pred_df_list = []

# read the full df
full_df = cudf.read_parquet(test_X_path)

for fold in range(FOLDS):

    # load the saved model of this fold
    model_path = f'../input/amex-default-prediction-v1/model_xgb_fold_{fold+1}_2022-06-22.json'
    model_xgb_saved = xgb.Booster()
    model_xgb_saved.load_model(model_path)

    # predictions on test data, ordered by customer id
    df_test_pred = score_test_data(full_df, chunk_size, model_xgb_saved,
                                   date_field, cat_features, num_features,
                                   fold).sort_values(by = 'customer_ID')

    # the first fold starts the combined frame, later folds are joined onto it
    if fold > 0:
        df_test_pred_full = cudf.merge(df_test_pred_full, df_test_pred, on = 'customer_ID')
    else:
        df_test_pred_full = df_test_pred.copy()

In [None]:
# average only the per-fold columns, so that re-running this cell cannot fold
# an already computed 'prediction' column into the mean
fold_pred_cols = [f'prediction_{k+1}' for k in range(FOLDS)]
df_test_pred_full['prediction'] = df_test_pred_full[fold_pred_cols].mean(axis = 1)

In [None]:
# drop the per-fold columns; the names are derived from FOLDS instead of being
# hard-coded for exactly five folds
df_test_pred_full = df_test_pred_full.drop(columns = [f'prediction_{k+1}' for k in range(FOLDS)])

In [None]:
df_test_pred_full.head()

In [None]:
# output to csv
# format the timestamp explicitly: str(datetime.now()) contains a space and
# colons, which are awkward in file names and rejected by some file systems
df_test_pred_full.to_csv(f'Amex predictions_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv', index = False)