## Import Libraries

#### Change from version 1: this version uses the RAPIDS libraries (cuDF, cuML, CuPy) so that data processing runs on the GPU

In [1]:
# Basic
import numpy as np
import pandas as pd
import os
import pyarrow
from datetime import datetime
from datetime import datetime, date
import joblib
import csv
import math
import pickle
from scipy import stats
import numpy as np
import statistics as st

# ML scikit-learn
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, ParameterGrid

# XGBoost
import xgboost as xgb

In [2]:
# Additional GPU libraries
import cudf
import cupy
import cuml
from cuml.model_selection import GridSearchCV, train_test_split
cudf.set_allocator("managed")

## Section 1: Create functions and pipeline classes

In [3]:
def stringToDate(X, date_col):
    """Parse ``X[date_col]`` into datetimes with ``cudf.to_datetime``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    parsed_dates = cudf.to_datetime(X[date_col], infer_datetime_format = True)
    X[date_col] = parsed_dates
    return X

In [4]:
def createFeatures(X, cat_cols, num_cols):
    """Collapse statement-level rows into one feature row per customer.

    Parameters
    ----------
    X : cudf.DataFrame
        Must contain ``customer_ID``, ``S_2`` and every column named in
        ``cat_cols`` and ``num_cols``.
    cat_cols : list of str
        Columns aggregated with count / last / first / nunique.
    num_cols : list of str
        Columns aggregated with mean / median / std / min / max / last / first.

    Returns
    -------
    cudf.DataFrame
        One row per ``customer_ID``; feature columns are named
        ``<column>_<aggregation>``. Missing values are imputed (column mean
        for numeric features, most frequent value for categorical features).
    """
    # NOTE(review): rows are ordered newest-first within each customer, so if
    # the group-by keeps that order, `*_first` is the most recent statement and
    # `*_last` the oldest (the executed sample output agrees: P_2_last equals
    # the P_2 of the 2017-03-09 row). Left unchanged so feature values stay
    # comparable with models trained on this definition.
    X = X.sort_values(by = ['customer_ID', 'S_2'], ascending = False)

    ## create features from numeric cols
    X_num = X.groupby("customer_ID", as_index = False)[num_cols].agg(['mean', 'median', 'std', 'min', 'max', 'last', 'first'])
    X_num.columns = ['_'.join(x) for x in X_num.columns]
    X_num = X_num.reset_index()

    # fill na values with the column mean; the result is assigned back because
    # fillna(inplace=True) on a column selection is not guaranteed to write
    # through to the parent frame
    num_feature_cols = [col for col in X_num.columns if col not in ['customer_ID']]
    for col in num_feature_cols:
        X_num[col] = X_num[col].fillna(X_num[col].mean().astype(cupy.float32))

    ## create features from categorical cols
    X_cat = X.groupby("customer_ID", as_index = False)[cat_cols].agg(['count', 'last', 'first', 'nunique'])
    X_cat.columns = ['_'.join(x) for x in X_cat.columns]
    X_cat = X_cat.reset_index()

    # fill na values with the most frequent value; pandas' Series.mode ignores
    # missing values and returns the modes sorted, so the first entry is the
    # smallest mode
    cat_feature_cols = [col for col in X_cat.columns if col not in ['customer_ID']]
    for col in cat_feature_cols:
        modes = X_cat[col].to_pandas().mode()
        if len(modes) > 0:  # empty when the whole column is missing
            X_cat[col] = X_cat[col].fillna(modes.iloc[0])

    # merge the dataframes
    X_updated = cudf.merge(X_num, X_cat, on = 'customer_ID', how = 'outer')

    # fill na after merging
    # NOTE(review): ffill/bfill copy values from the neighbouring row, i.e.
    # from a different customer - confirm this is the intended last resort.
    X_updated = X_updated.fillna(method = 'ffill')
    X_updated = X_updated.fillna(method = 'bfill')

    # return updated df
    return X_updated

## Section 2: Read Data

In [5]:
# root directory of the project; its `input` sub-directory is walked in the next cell
project_dir = '/kaggle'

In [6]:
# print the full path of every file found below the input directory
input_root = os.path.join(project_dir, 'input')
for current_dir, _, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/model-xgb-20220616/model_xgb_20220616.json
/kaggle/input/amex-default-prediction-v1/__results__.html
/kaggle/input/amex-default-prediction-v1/__notebook_source__.ipynb
/kaggle/input/amex-default-prediction-v1/__notebook__.ipynb
/kaggle/input/amex-default-prediction-v1/__output__.json
/kaggle/input/amex-default-prediction-v1/custom.css
/kaggle/input/model-xgb-v2/model_xbg_v2.pkl
/kaggle/input/model-xgb-v3/model_xbg_v2.json
/kaggle/input/amex-prediction-model-xgb/model_xgb.json


In [7]:
## path to files, built from `project_dir` so the root is defined in one place
input_dir = os.path.join(project_dir, 'input')

# Train
train_X_path = os.path.join(input_dir, 'amex-data-integer-dtypes-parquet-format', 'train.parquet')
train_y_path = os.path.join(input_dir, 'amex-default-prediction', 'train_labels.csv')

# Test
test_X_path = os.path.join(input_dir, 'amex-data-integer-dtypes-parquet-format', 'test.parquet')

In [8]:
# read the training features from the parquet file into a cuDF DataFrame
df_train_X = cudf.read_parquet(train_X_path)

In [9]:
# view sample: print the shape, then display the first five rows
print(df_train_X.shape)
df_train_X.head(5)

(5531451, 190)


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


## Section 3: Feature engineering

In [10]:
# column groups used by the feature-engineering steps
date_field = 'S_2'

cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]

# every column except the customer key and the statement date
non_feature_cols = {'customer_ID', date_field}
all_cols = [c for c in list(df_train_X.columns) if c not in non_feature_cols]

# whatever is not categorical is treated as numeric
num_features = [col for col in all_cols if col not in cat_features]

In [11]:
# step 1: parse the date column (`date_field`) from string to datetime
df_train_X = stringToDate(df_train_X, date_field)

In [12]:
# step 2: create features (one aggregated row per customer)
# NOTE: this rebinds df_train_X to the aggregated frame, so re-running the cell
# would feed the aggregated frame back into createFeatures; re-read the raw
# data first if the cell has to be executed again
df_train_X = createFeatures(df_train_X, cat_features, num_features)

In [13]:
# sort the feature rows by customer id
df_train_X = df_train_X.sort_values(by = 'customer_ID')

In [14]:
# view sample of the engineered features: shape, then the first rows
print(df_train_X.shape)
df_train_X.head()

(458913, 1284)


Unnamed: 0,customer_ID,P_2_mean,P_2_median,P_2_std,P_2_min,P_2_max,P_2_last,P_2_first,D_39_mean,D_39_median,...,D_64_first,D_64_nunique,D_66_count,D_66_last,D_66_first,D_66_nunique,D_68_count,D_68_last,D_68_first,D_68_nunique
28576,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933824,0.938469,0.024194,0.86858,0.960384,0.938469,0.934745,0.230769,0.0,...,0,1,13,-1,-1,1,13,6,6,1
28580,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.89982,0.904814,0.022119,0.861109,0.929122,0.929122,0.880519,7.153846,7.0,...,0,1,13,-1,-1,1,13,6,6,1
28584,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878454,0.884522,0.028911,0.79767,0.904482,0.876615,0.880875,0.0,0.0,...,2,1,13,-1,-1,1,13,6,6,1
28588,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.598969,0.598278,0.020107,0.567442,0.623392,0.567442,0.621776,1.538462,0.0,...,0,1,13,-1,-1,1,13,2,3,3
28592,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891679,0.879238,0.042325,0.805045,0.940382,0.936842,0.8719,0.0,0.0,...,0,1,13,1,1,1,13,6,6,1


## Section 4: Train Models

### Read labels

In [15]:
# read the training labels from the csv file
df_train_y = cudf.read_csv(train_y_path)

In [16]:
# sort the labels by customer id
df_train_y = df_train_y.sort_values(by = 'customer_ID')

In [17]:
# shape and first rows of the labels
print(df_train_y.shape)
df_train_y.head()

(458913, 2)


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [18]:
# share of each target value in the labels (class balance)
df_train_y['target'].value_counts() / df_train_y.shape[0]

0    0.741066
1    0.258934
Name: target, dtype: float64

In [19]:
# merge with X: the left join keeps every feature row and adds its `target`
df_train = cudf.merge(df_train_X, df_train_y, on = 'customer_ID', how = 'left')
print(df_train.shape)

(458913, 1285)


In [20]:
# train validation split: 25% of the rows are held out, with a fixed seed
# (train_test_split is the cuml implementation imported at the top)
X_train, X_val, y_train, y_val = train_test_split(X = df_train.drop(columns = ['target']),
                                                  y = df_train['target'],
                                                  test_size = 0.25, random_state = 42)

In [21]:
# incidence rate (sum of targets over their count) in each split
train_rate = y_train.sum() / y_train.count()
val_rate = y_val.sum() / y_val.count()
print(train_rate, val_rate)

0.2589014628760696 0.25903005369221116


In [22]:
# function for custom evaluation metric for the amex competition (specifically for xgboost)
def amex_metric_xgboost(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Competition metric in the form expected by ``xgb.train(custom_metric=...)``.

    The score is the mean of two parts, both computed on rows ranked by
    descending prediction, where rows with target 0 carry weight 20 and all
    other rows weight 1:

    * the weighted Gini of the predictions divided by the weighted Gini of a
      perfect ranking (the targets ranked by themselves);
    * the share of positives found among the top rows whose cumulative weight
      stays within 4% of the total weight.

    Parameters
    ----------
    predt : np.ndarray
        Predicted scores, one per row of ``dtrain``.
    dtrain : xgb.DMatrix
        Matrix whose ``get_label()`` supplies the true targets.

    Returns
    -------
    tuple
        ``('Amex_Metric', score)``.
    """

    def ranked_frame(target, score) -> pd.DataFrame:
        # rows ordered by descending score, with the per-row weight attached;
        # the weight is computed vectorized because this metric runs on every
        # boosting round
        frame = pd.DataFrame(data = {'target' : target, 'prediction' : score})
        frame = frame.sort_values('prediction', ascending=False)
        frame['weight'] = np.where(frame['target'] == 0, 20, 1)
        return frame

    def top_four_percent_captured(frame: pd.DataFrame) -> float:
        four_pct_cutoff = int(0.04 * frame['weight'].sum())
        within_cutoff = frame['weight'].cumsum() <= four_pct_cutoff
        return (frame.loc[within_cutoff, 'target'] == 1).sum() / (frame['target'] == 1).sum()

    def weighted_gini(frame: pd.DataFrame) -> float:
        weight = frame['weight']
        random = (weight / weight.sum()).cumsum()
        weighted_pos = frame['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    target = dtrain.get_label()

    # the prediction-ranked frame is built once and shared by both parts
    by_prediction = ranked_frame(target, predt)
    by_target = ranked_frame(target, target)

    g = weighted_gini(by_prediction) / weighted_gini(by_target)
    d = top_four_percent_captured(by_prediction)

    return ('Amex_Metric', 0.5 * (g + d))

In [23]:
# function for custom evaluation metric for the amex competition (generic)
def amex_metric_generic(y_true: cupy.ndarray, y_pred: cupy.ndarray):
    """Competition metric for array inputs.

    Both arrays are wrapped in a cuDF DataFrame and converted to pandas; the
    score is then the mean of the normalized weighted Gini and the share of
    positives captured within the top 4% of cumulative weight, with rows
    ranked by descending prediction (target 0 has weight 20, everything else
    weight 1).

    Parameters
    ----------
    y_true : cupy.ndarray
        True targets.
    y_pred : cupy.ndarray
        Predicted scores, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``('Amex_Metric', score)``.
    """

    # convert to pandas dataframe
    y_true = cudf.DataFrame(data = {'target' : y_true}).to_pandas()
    y_pred = cudf.DataFrame(data = {'prediction' : y_pred}).to_pandas()

    def ranked_frame(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # rows ordered by descending score, with the per-row weight attached
        # (vectorized instead of a per-row apply)
        frame = pd.concat([truth, scores], axis = 'columns').sort_values('prediction', ascending=False)
        frame['weight'] = np.where(frame['target'] == 0, 20, 1)
        return frame

    def top_four_percent_captured(frame: pd.DataFrame) -> float:
        four_pct_cutoff = int(0.04 * frame['weight'].sum())
        within_cutoff = frame['weight'].cumsum() <= four_pct_cutoff
        return (frame.loc[within_cutoff, 'target'] == 1).sum() / (frame['target'] == 1).sum()

    def weighted_gini(frame: pd.DataFrame) -> float:
        weight = frame['weight']
        random = (weight / weight.sum()).cumsum()
        weighted_pos = frame['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    # the prediction-ranked frame is built once and shared by both parts; the
    # perfect ranking uses the targets themselves as the score
    by_prediction = ranked_frame(y_true, y_pred)
    by_target = ranked_frame(y_true, y_true.rename(columns={'target': 'prediction'}))

    g = weighted_gini(by_prediction) / weighted_gini(by_target)
    d = top_four_percent_captured(by_prediction)

    return ('Amex_Metric', 0.5 * (g + d))

### 4.1 XGBoost

In [24]:
# convert data to DMatrix; customer_ID is dropped so that only the feature
# columns are passed to XGBoost
dtrain = xgb.DMatrix(X_train.drop(columns = 'customer_ID'), 
                     label = y_train)

dval = xgb.DMatrix(X_val.drop(columns = 'customer_ID'), 
                   label = y_val)

In [25]:
# booster configuration: GPU histogram trees, logistic objective, and the
# built-in metric switched off so that only the custom metric is reported
xgb_params = {
    'tree_method' : 'gpu_hist',
    'objective': 'binary:logistic',
    'verbosity' : 2,
    'max_depth' : 3,
    'subsample' : 0.5,
    'eta' : 0.05,
    'sampling_method' : 'gradient_based',
    'colsample_bytree' : 0.5,
    'predictor':'gpu_predictor',
    'disable_default_eval_metric' : 1,
}

# both splits are evaluated every round; training stops once the custom
# metric (maximized) has not improved for 100 rounds
watchlist = [(dtrain, 'train'), (dval, 'validation')]

# train the model
model_xgb = xgb.train(
    params = xgb_params,
    dtrain = dtrain,
    num_boost_round = 7000,
    evals = watchlist,
    early_stopping_rounds = 100,
    verbose_eval = 100,
    custom_metric = amex_metric_xgboost,
    maximize = True,
)

[0]	train-Amex_Metric:0.62527	validation-Amex_Metric:0.62680
[100]	train-Amex_Metric:0.76068	validation-Amex_Metric:0.75872
[200]	train-Amex_Metric:0.77836	validation-Amex_Metric:0.77525
[300]	train-Amex_Metric:0.78577	validation-Amex_Metric:0.78082
[400]	train-Amex_Metric:0.79058	validation-Amex_Metric:0.78323
[500]	train-Amex_Metric:0.79496	validation-Amex_Metric:0.78583
[600]	train-Amex_Metric:0.79776	validation-Amex_Metric:0.78714
[700]	train-Amex_Metric:0.80072	validation-Amex_Metric:0.78848
[800]	train-Amex_Metric:0.80285	validation-Amex_Metric:0.78928
[900]	train-Amex_Metric:0.80459	validation-Amex_Metric:0.78975
[1000]	train-Amex_Metric:0.80653	validation-Amex_Metric:0.79025
[1100]	train-Amex_Metric:0.80868	validation-Amex_Metric:0.79126
[1200]	train-Amex_Metric:0.81008	validation-Amex_Metric:0.79120
[1300]	train-Amex_Metric:0.81198	validation-Amex_Metric:0.79188
[1400]	train-Amex_Metric:0.81373	validation-Amex_Metric:0.79187
[1465]	train-Amex_Metric:0.81460	validation-Amex_Met

In [29]:
# check model attributes stored on the trained booster
model_xgb.attributes()

{'best_iteration': '1366',
 'best_ntree_limit': '1367',
 'best_score': '0.792278'}

In [30]:
# save the model as JSON in the working directory; the file name embeds today's date
model_xgb.save_model(f"model_xgb_{date.today()}.json")

## Section 5: Predictions on test data

In [None]:
# load model
model_xgb_saved = xgb.Booster()

# Prefer the file written by the save cell (working directory); the dataset
# path is kept as a fallback.
# NOTE(review): both names embed date.today(), so they only resolve for a
# model saved on the same day, and the input listing above shows no such file
# under amex-default-prediction-v1 - confirm which model file is intended.
local_model_path = f'model_xgb_{date.today()}.json'
dataset_model_path = f'../input/amex-default-prediction-v1/model_xgb_{date.today()}.json'
model_path = local_model_path if os.path.exists(local_model_path) else dataset_model_path
model_xgb_saved.load_model(model_path)

In [None]:
# define chunk size: number of distinct customer ids scored per chunk
chunk_size = 50000

In [None]:
def score_test_data(file_path, chunk_size, model_obj, date_field, cat_features, num_features):
    """Score the test data in chunks of customers and return all predictions.

    Parameters
    ----------
    file_path : str
        Parquet file with the statement-level test data.
    chunk_size : int
        Number of distinct customer ids processed per chunk.
    model_obj : object
        Model whose ``predict(data=<xgb.DMatrix>)`` returns one score per
        feature row (e.g. an ``xgb.Booster``).
    date_field : str
        Name of the date column passed to ``stringToDate``.
    cat_features, num_features : list of str
        Column groups passed to ``createFeatures``.

    Returns
    -------
    cudf.DataFrame
        Columns ``customer_ID`` and ``prediction``, one row per customer,
        with a fresh 0..n-1 index.
    """
    # read the full df
    full_df = cudf.read_parquet(file_path)

    # Distinct customer ids, sorted so that chunk membership is the same on
    # every run: a plain set() of strings iterates in an order that changes
    # between kernel sessions, and because createFeatures imputes from
    # per-chunk statistics the predictions would change with it. unique() is
    # taken on the device so only distinct ids are copied to the host.
    test_customer_ids = sorted(full_df['customer_ID'].unique().to_arrow().to_pylist())

    # create list of prediction dfs
    chunk_pred_list = []

    # read data in chunks
    chunk_starts = range(0, len(test_customer_ids), chunk_size)
    for chunk_count, start in enumerate(chunk_starts, start = 1):

        # print chunk counter
        print(f'Processing chunk {chunk_count}')

        # get the customer id chunk and the matching rows
        customer_id_chunk = test_customer_ids[start : start + chunk_size]
        chunk = full_df[full_df['customer_ID'].isin(customer_id_chunk)]

        # step 1: change date string to date
        chunk = stringToDate(chunk, date_field)

        # step 2: create features
        chunk = createFeatures(chunk, cat_features, num_features)

        # convert to DMatrix (for XGBoost only)
        dtest = xgb.DMatrix(chunk.drop(columns = 'customer_ID'))

        # make predictions
        chunk_pred = model_obj.predict(data = dtest)

        # pair the predictions with their customer ids
        chunk_pred_list.append(cudf.DataFrame(data = {'customer_ID' : chunk['customer_ID'],
                                                      'prediction' : chunk_pred}))

        # drop the references before the next chunk is built
        del chunk, dtest, chunk_pred

    # print statement
    print('All chunks processed, merging individual chunks')

    # nothing to concatenate when the file holds no customers
    if not chunk_pred_list:
        return cudf.DataFrame({'customer_ID' : [], 'prediction' : []})

    # concatenate chunk predictions; ignore_index avoids repeated index labels
    return cudf.concat(chunk_pred_list, ignore_index = True)

In [None]:
# predictions on test data: score the test parquet file chunk by chunk with the loaded model
df_test_pred = score_test_data(test_X_path, chunk_size, model_xgb_saved, date_field, cat_features, num_features)

In [None]:
# preview the first rows of the predictions
df_test_pred.head()

In [None]:
# output to csv; the timestamp is formatted explicitly because str(datetime.now())
# contains a space and colons, and colons are not valid in file names on every filesystem
output_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
df_test_pred.to_csv(f'Amex predictions_{output_stamp}.csv', index = False)