In [1]:
%config IPCompleter.use_jedi=False

## Compressing the data files

In [128]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import os
import gc
import cupy
import cudf
from datetime import datetime
from tqdm import tqdm

In [3]:
# parameters for the preprocessing
DATA_DIR = "/home/silo1/mas322/amex-default-prediction/"
PROCESS_DATA = False
NUM_SPLITS = 10
PAD_CUSTOMER_TO_13_ROWS = True
COLS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68', 'D_63', 'D_64', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'D_108', 'D_109', 'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145']
DEV = torch.device('cuda')

## Formatting the data for use in RNN

We will be using [Raddar's denoised](https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format) dataset, which is stored on disk as `train.parquet` and `test.parquet` in `$DATA_DIR/derived/`. We also have the original raw data in `$DATA_DIR/raw/train_data.csv` and `$DATA_DIR/raw/test_data.csv`. The denoised dataset from Raddar has already mapped the different categorical entries to numerical integer indices.

In this section, we will process this data to split it into `NUM_SPLITS` and store seperate NumPy files on disk of shape `num_of_customers_in_split x 13 x 188`. These NumPy files will have the categorical values changes to numeric where each category is mapped to a distinct integer, even `NA`s. The `NA` in the numeric columns will be replaced by `-.5`. The preprocessing here is based on [this GRU starter blogpost](https://www.kaggle.com/code/cdeotte/tensorflow-gru-starter-0-790)

In [4]:
if PROCESS_DATA:
    for f in ['train_data', 'test_data', 'train_labels']:
        print(f"Converting {f}")
        df = pd.read_csv(os.path.join(dataset_dir, "raw", f"{f}.csv"))
        df.to_feather(os.path.join(dataset_dir, "derived", f"{f}.feather"))
        del df

In [3]:
if PROCESS_DATA:
    df = cudf.read_parquet(os.path.join(DATA_DIR, "derived", "train.parquet"))
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
df.head() if PROCESS_DATA else None

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-4532153018459703766,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,-4532153018459703766,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,-4532153018459703766,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,-4532153018459703766,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,-4532153018459703766,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [4]:
if PROCESS_DATA:
    # LOAD TARGETS
    targets = cudf.read_feather(os.path.join(DATA_DIR, 'derived', 'train_labels.feather'))
    targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    print(f'There are {targets.shape[0]} train targets')

    # GET TRAIN COLUMN NAMES
    train = cudf.read_csv(os.path.join(DATA_DIR, 'raw', 'train_data.csv'), nrows=1)
    T_COLS = train.columns
    print(f'There are {len(T_COLS)} train dataframe columns')

    customers = df.customer_ID.unique().values.flatten()
    print(f'There are {len(customers)} unique customers in train.')

    # extract the Y, M and D from the date column, then sort by time (after customer_ID)
    df.S_2 = cudf.to_datetime(df.S_2)
    df['year'] = (df.S_2.dt.year-2000).astype('int8')
    df['month'] = (df.S_2.dt.month).astype('int8')
    df['day'] = (df.S_2.dt.day).astype('int8')
    del df['S_2']

df if PROCESS_DATA else None



There are 458913 train targets
There are 190 train dataframe columns
There are 458913 unique customers in train.


Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_139,D_140,D_141,D_142,D_143,D_144,D_145,year,month,day
0,-4532153018459703766,0.938469,0,0.008724,1.006838,0.009228,0.124035157,0.0,0.004709,,...,0,0,0.0,,0,0.000610,0,17,3,9
1,-4532153018459703766,0.936665,0,0.004923,1.000653,0.006151,0.126749977,0.0,0.002714,,...,0,0,0.0,,0,0.005492,0,17,4,7
2,-4532153018459703766,0.954180,3,0.021655,1.009672,0.006815,0.123976685,0.0,0.009423,,...,0,0,0.0,,0,0.006986,0,17,5,28
3,-4532153018459703766,0.960384,0,0.013683,1.002700,0.001373,0.117169224,0.0,0.005531,,...,0,0,0.0,,0,0.006527,0,17,6,13
4,-4532153018459703766,0.947248,0,0.015193,1.000727,0.007605,0.117324777,0.0,0.009312,,...,0,0,0.0,,0,0.008126,0,17,7,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5531446,-8425848485496994175,0.979333,14,0.020818,0.828199,0.003487,0.090742894,0.0,0.025139,,...,0,0,0.0,,0,0.001498,0,17,11,5
5531447,-8425848485496994175,0.984907,10,0.007209,0.812610,0.005904,0.079886191,0.0,0.023691,,...,0,0,0.0,,0,0.008225,0,17,12,23
5531448,-8425848485496994175,0.983019,15,0.013151,0.815422,0.003457,0.100502573,0.0,0.012343,,...,0,0,0.0,,0,0.006773,0,18,1,6
5531449,-8425848485496994175,0.969861,15,0.009855,1.003541,0.005117,0.101802148,0.0,0.008578,,...,0,0,0.0,,0,0.001168,0,18,2,6


Numerical columns are padded with `-3`, as that is not used anywhere in the data. Categorical columns are padded with `-2`. Notes that this gives a gap for feature `D_63`, which seemingly is not missing any values:
```
Feature B_30: min=-1, max=2
	Nan count: 0
Feature B_38: min=-1, max=7
	Nan count: 0
Feature D_114: min=-1, max=1
	Nan count: 0
Feature D_116: min=-1, max=1
	Nan count: 0
Feature D_117: min=-1, max=7
	Nan count: 0
Feature D_120: min=-1, max=1
	Nan count: 0
Feature D_126: min=-1, max=2
	Nan count: 0
Feature D_66: min=-1, max=1
	Nan count: 0
Feature D_68: min=-1, max=6
	Nan count: 0
Feature D_63: min=0, max=5
	Nan count: 0
Feature D_64: min=-1, max=3
	Nan count: 0
```
Also, code for checking if any conflicts with padded values:
```
for c in T_COLS:
    if c == 'S_2': continue
    print(c, df[df[c] == -2].shape[0])
```

In [5]:
# Not all the customers have 13 rows, so pad all the ones with fewer rows
# df[['customer_ID']].groupby('customer_ID').customer_ID.agg('count').mean()

CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68'] + ['D_63','D_64']

if PAD_CUSTOMER_TO_13_ROWS and PROCESS_DATA:
    tmp = df[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
    more = cupy.array([], dtype='int64') 
    for j in range(1, 13):
        i = tmp.loc[tmp == j].index.values
        more = cupy.concatenate([more, cupy.repeat(i, 13-j)])
    df_pad = df.iloc[:len(more)].copy().fillna(0)
    df_pad = df_pad * 0 - 3 #pad numerical columns with -3
    df_pad[CATS] = (df_pad[CATS] * 0 - 2).astype('int8') #pad categorical columns with -2
    df_pad['customer_ID'] = more
    df = cudf.concat([df, df_pad], axis = 0, ignore_index=True)
    
    del tmp, df_pad
    gc.collect()
df if PAD_CUSTOMER_TO_13_ROWS and PROCESS_DATA else None

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_139,D_140,D_141,D_142,D_143,D_144,D_145,year,month,day
0,-4532153018459703766,0.938469,0,0.008724,1.006838,0.009228,0.124035157,0.0,0.004709,,...,0,0,0.0,,0,0.000610,0,17,3,9
1,-4532153018459703766,0.936665,0,0.004923,1.000653,0.006151,0.126749977,0.0,0.002714,,...,0,0,0.0,,0,0.005492,0,17,4,7
2,-4532153018459703766,0.954180,3,0.021655,1.009672,0.006815,0.123976685,0.0,0.009423,,...,0,0,0.0,,0,0.006986,0,17,5,28
3,-4532153018459703766,0.960384,0,0.013683,1.002700,0.001373,0.117169224,0.0,0.005531,,...,0,0,0.0,,0,0.006527,0,17,6,13
4,-4532153018459703766,0.947248,0,0.015193,1.000727,0.007605,0.117324777,0.0,0.009312,,...,0,0,0.0,,0,0.008126,0,17,7,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5965864,-55748515817554379,-3.000000,-3,-3.000000,-3.000000,-3.000000,-3.0,-3.0,-3.000000,-3.0,...,-3,-3,-3.0,-3.0,-3,-3.000000,-3,-3,-3,-3
5965865,-2688056353488173321,-3.000000,-3,-3.000000,-3.000000,-3.000000,-3.0,-3.0,-3.000000,-3.0,...,-3,-3,-3.0,-3.0,-3,-3.000000,-3,-3,-3,-3
5965866,8782972297529978422,-3.000000,-3,-3.000000,-3.000000,-3.000000,-3.0,-3.0,-3.000000,-3.0,...,-3,-3,-3.0,-3.0,-3,-3.000000,-3,-3,-3,-3
5965867,7318692200354110648,-3.000000,-3,-3.000000,-3.000000,-3.000000,-3.0,-3.0,-3.000000,-3.0,...,-3,-3,-3.0,-3.0,-3,-3.000000,-3,-3,-3,-3


In [6]:
# not all the time-series are of length 13
df[['customer_ID']].groupby('customer_ID').customer_ID.agg('count').mean() if PROCESS_DATA else None

13.0

In [7]:
# merge the targets
if PROCESS_DATA:
    df = df.merge(targets, on='customer_ID', how='left')
    df.target = df.target.astype('int8')

In [8]:
# sort by customer ID, then by data. Then rearrange columns with 11 cats first
if PROCESS_DATA:
    df = df.sort_values(['customer_ID', 'year', 'month', 'day']).reset_index(drop = True)
    df = df.drop(['year', 'month', 'day'], axis=1)
    
    COLS = list(df.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    df = df[COLS]
df if PROCESS_DATA else None

Unnamed: 0,customer_ID,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_66,D_68,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,-9223358381327749917,1,6,1,0,0,1,2,-1,2,...,-1,-1,0,0,0.0,,0,0.004787,0,1
1,-9223358381327749917,1,6,1,0,3,1,2,-1,2,...,-1,-1,0,0,0.0,,0,0.003442,0,1
2,-9223358381327749917,1,3,1,0,3,1,2,-1,2,...,-1,-1,0,0,0.0,,0,0.003340,0,1
3,-9223358381327749917,1,3,1,0,0,1,2,-1,2,...,-1,-1,0,0,0.0,,0,0.007556,0,1
4,-9223358381327749917,1,3,1,0,0,1,2,-1,3,...,-1,-1,0,0,0.0,,0,0.005299,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5965864,9223350112805974911,1,6,1,0,5,0,2,-1,6,...,-1,-1,0,0,0.0,,0,0.003152,0,1
5965865,9223350112805974911,1,6,1,0,5,0,2,-1,6,...,-1,-1,0,0,0.0,,0,0.002049,0,1
5965866,9223350112805974911,1,7,1,0,5,0,2,-1,6,...,-1,-1,0,0,0.0,,0,0.000250,0,1
5965867,9223350112805974911,1,7,1,0,5,0,2,-1,6,...,-1,-1,0,0,0.0,,0,0.007640,0,1


In [9]:
# fill remaining NaNs with -0.5
df = df.fillna(-0.5)

In [10]:
for i in range(NUM_SPLITS):
    lower = (len(customers) // NUM_SPLITS) * i
    upper = (len(customers) // NUM_SPLITS) * (i + 1)
    if i == NUM_SPLITS - 1:
        upper = len(customers)
        
    sub_df = df[df.customer_ID.isin(customers[lower:upper])]
    print(f"[{i + 1} / {NUM_SPLITS}] Saving data with {len(sub_df)} rows...")
    
    sub_targets = sub_df[['customer_ID', 'target']].drop_duplicates().sort_index()
    sub_targets.to_parquet(os.path.join(DATA_DIR, 'derived', 'processed-splits', f"train-targets_{i}.parquet"))
    
    # remove the customer ID and the target column 190 -> 188
    sub_data = sub_df.iloc[:, 1:-1].values.reshape((-1, 13, 188))
    cupy.save(os.path.join(DATA_DIR, "derived", "processed-splits", f"train-data_{i}.npy"), sub_data.astype('float32'))
    
    del sub_df, sub_targets, sub_data

[1 / 10] Saving data with 596583 rows...
[2 / 10] Saving data with 596583 rows...
[3 / 10] Saving data with 596583 rows...
[4 / 10] Saving data with 596583 rows...
[5 / 10] Saving data with 596583 rows...
[6 / 10] Saving data with 596583 rows...
[7 / 10] Saving data with 596583 rows...
[8 / 10] Saving data with 596583 rows...
[9 / 10] Saving data with 596583 rows...
[10 / 10] Saving data with 596622 rows...


In [11]:
# categorical values already converted to numeric integers, but what happened to the missing values? Were there none for D_63?
# cats = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68', 'D_63', 'D_64']
# for c in cats:
#     print(f"Feature {c}: min={df[c].min()}, max={df[c].max()}")
#     print(f"\tNan count: {df[c].isna().sum()}")

In [12]:
# clean up
del df

## Some EDA on the data splits

In [5]:
X_train = np.load(os.path.join(DATA_DIR, 'derived', 'processed-splits', 'train-data_0.npy'))
y_train = pd.read_parquet(os.path.join(DATA_DIR, 'derived', 'processed-splits', 'train-targets_0.parquet'))

In [6]:
X_train.shape, y_train.shape

((45891, 13, 188), (45891, 2))

In [69]:
len(COLS)

188

In [67]:
for i in range(188):
    print(f"Variable {COLS[i]} proportion of missing values: {X_train[X_train[:, 0, i] == -0.5].shape[0] / X_train.shape[0]:.4f}")

Variable B_30 proportion of missing values: 0.0000
Variable B_38 proportion of missing values: 0.0000
Variable D_114 proportion of missing values: 0.0000
Variable D_116 proportion of missing values: 0.0000
Variable D_117 proportion of missing values: 0.0000
Variable D_120 proportion of missing values: 0.0000
Variable D_126 proportion of missing values: 0.0000
Variable D_66 proportion of missing values: 0.0000
Variable D_68 proportion of missing values: 0.0000
Variable D_63 proportion of missing values: 0.0000
Variable D_64 proportion of missing values: 0.0000
Variable P_2 proportion of missing values: 0.0082
Variable D_39 proportion of missing values: 0.0000
Variable B_1 proportion of missing values: 0.0000
Variable B_2 proportion of missing values: 0.0004
Variable R_1 proportion of missing values: 0.0000
Variable S_3 proportion of missing values: 0.1561
Variable D_41 proportion of missing values: 0.0004
Variable B_3 proportion of missing values: 0.0004
Variable D_42 proportion of miss

In [7]:
X_train.shape

(45891, 13, 188)

In [12]:
np.array(X_train == -4).mean()

0.0

## Preparing the data for PyTorch

In [82]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

In [75]:
def load_numpy_data(val_idx : list, num_cat_columns = None, nan_fill = -0.5, pad_cat_fill : int = -2, pad_numeric_fill = -1, **kwargs) ->\
    (torch.utils.data.DataLoader, torch.utils.data.DataLoader):
    """
    val_idx is a list of integers denoting which of the [0, 1, ..., NUM_SPLITS-1] splits to use
    as validation data, and the rest will be used as training data
    
    if num_cat_columns is set to an integer, the first num_cat_columns columns will have their values shifted to all be non-negative,
    as these are categorical integer indices.
    """
    split_data_dir = os.path.join(DATA_DIR, "derived", "processed-splits")
    
    def load_aux(idx):
        Xs = []; ys = []
        for k in val_idx:
            Xs.append(np.load(os.path.join(split_data_dir, f"train-data_{k}.npy")))
            ys.append(pd.read_parquet(os.path.join(split_data_dir, f"train-targets_{k}.parquet")))
        
        Xs = np.concatenate(Xs, axis = 0)
        ys = pd.concat(ys).target.values
        
        # fill NAs and padded values with provided numerics
        # (See PAD_CUSTOMER_TO_13_ROWS code)
        na_mask = (Xs == -0.5)
        pad_cat_mask = (Xs == -2)
        pad_numeric_mask = (Xs == -3)
        
        Xs[na_mask] = nan_fill
        Xs[pad_cat_mask] = pad_cat_fill
        Xs[pad_numeric_mask] = pad_numeric_fill
        
        if num_cat_columns is not None:
            Xs[:, :, :num_cat_columns] = Xs[:, :, :num_cat_columns] - np.amin(Xs[:, :, :num_cat_columns], axis = 0, keepdims = True)
        
        data_loader = torch.utils.data.DataLoader(
            dataset = torch.utils.data.TensorDataset(torch.from_numpy(Xs).type(torch.float32), torch.from_numpy(ys).type(torch.float32)),
            **kwargs
        )
        
        return data_loader
    
    train_idx = [i for i in list(range(NUM_SPLITS)) if i not in val_idx]
    
    return load_aux(train_idx), load_aux(val_idx)

In [76]:
train_loader, val_loader = load_numpy_data([8, 9], num_cat_columns = 11, batch_size = 1024, drop_last = True, shuffle = True)

In [77]:
for x, y in train_loader:
    print(x.shape, y.shape)
    break

torch.Size([1024, 13, 188]) torch.Size([1024])


In [139]:
def train_one_epoch(model, loss_fn, training_loader, optimizer, epoch_number):
    running_loss = 0.
    running_metric = 0.
    # last_loss = 0.
    # last_metric = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(training_loader), desc = f"EPOCH {epoch_number}", total = len(training_loader)):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs, labels = inputs.to(DEV), labels.to(DEV)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        running_metric += amex_metric_mod(labels.detach().cpu().numpy(), outputs.detach().cpu().numpy())

    last_loss = running_loss / (i + 1) # loss per batch
    last_metric = running_metric / (i + 1) # metric per batch
    # print('  batch {} loss: {} metric: {}'.format(i + 1, last_loss, last_metric))
    # tb_x = epoch_index * len(training_loader) + i + 1
    # tb_writer.add_scalar('Loss/train', last_loss, tb_x)

    return last_loss, last_metric


def fit_model(model, loss_fn, train_loader, val_loader, optimizer, num_epochs):
    best_vloss = 1_000_000.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    for epoch in range(num_epochs):
        # print('EPOCH {}:'.format(epoch + 1))

        # Make sure gradient tracking is on, and do a pass over the data
        model.train(True)
        avg_loss, avg_metric = train_one_epoch(model, loss_fn, train_loader, optimizer, epoch + 1)

        running_vloss = 0.0
        running_vmetric = 0.0
        # Set the model to evaluation mode, disabling dropout and using population
        # statistics for batch normalization.
        model.eval()

        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for i, vdata in enumerate(val_loader):
                vinputs, vlabels = vdata
                vinputs, vlabels = vinputs.to(DEV), vlabels.to(DEV)
                voutputs = model(vinputs)
                vloss = loss_fn(voutputs, vlabels)
                vmetric = amex_metric_mod(vlabels.cpu().numpy(), voutputs.cpu().numpy())
                running_vloss += vloss
                running_vmetric += vmetric

        avg_vloss = running_vloss / (i + 1)
        avg_vmetric = running_vmetric / (i + 1)
        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
        print('AMEX metric train {} valid {}'.format(avg_metric, avg_vmetric))

        # Log the running loss averaged per batch
        # for both training and validation
        # writer.add_scalars('Training vs. Validation Loss',
        #                 { 'Training' : avg_loss, 'Validation' : avg_vloss },
        #                 epoch_number + 1)
        # writer.flush()

        # Track best performance, and save the model's state
        # if avg_vloss < best_vloss:
        #     best_vloss = avg_vloss
        #     model_path = 'model_{}_{}'.format(timestamp, epoch)
        #     torch.save(model.state_dict(), model_path)

We now define our model

In [140]:
class GRUNet(nn.Module):
    def __init__(self, num_features, hidden_dim, layer_dim, emb_dim, num_cat_columns = 11, dropout_prob = 0.2):
        super(GRUNet, self).__init__()
        
        # save the params
        self.layer_dim = layer_dim
        self.hidden_dim = hidden_dim
        self.num_cat_columns = num_cat_columns
        
        # the layers we need
        emb_layers = []
        for k in range(num_cat_columns):
            emb_layers.append(nn.Embedding(10, emb_dim))
        self.emb_layers = nn.ModuleList(emb_layers)
        
        self.gru = nn.GRU(
            input_size = num_features - num_cat_columns + num_cat_columns * emb_dim,
            hidden_size = hidden_dim,
            num_layers = layer_dim,
            batch_first = True,
            dropout = dropout_prob
        )
        
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # First 11 columns are categorical, next 177 are numerical
        embedding_outs = []
        for k in range(self.num_cat_columns):
            emb = self.emb_layers[k]
            col = x[:, :, k].type(torch.int32)
            embedding_outs.append(emb(col))
        
        x = torch.concat([x[:, :, self.num_cat_columns:]] + embedding_outs, dim = -1)
        
        # Initializing hidden state for first input with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device = DEV).requires_grad_()

        # Forward propagation by passing in the input and hidden state into the model
        out, _ = self.gru(x, h0.detach())

        # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
        # so that it can fit into the fully connected layer
        out = out[:, -1, :]
        
        out = self.relu1(self.fc1(out))
        out = self.relu2(self.fc2(out))
        out = self.sigmoid(self.fc3(out))
        
        return out.squeeze(1)

In [141]:
# create the model and test it on a sample input
model = GRUNet(188, 128, 2, 4).to(DEV)
for x, y in train_loader:
    out = model(x.to(DEV))
    print(f"Shape response {y.shape} out {out.shape}")
    print(out.detach()[:10])
    print(y[:10])
    break

loss_fn = F.binary_cross_entropy
optim = torch.optim.Adam(model.parameters(), lr = 0.001)

Shape response torch.Size([1024]) out torch.Size([1024])
tensor([0.4943, 0.5111, 0.5064, 0.5111, 0.5072, 0.5121, 0.5076, 0.5160, 0.5091,
        0.5013], device='cuda:0')
tensor([0., 0., 0., 0., 1., 0., 1., 0., 0., 0.])


In [142]:
# fit the model
fit_model(model, loss_fn, train_loader, val_loader, optim, 20)

EPOCH 1: 100%|███████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.19it/s]


LOSS train 0.37838248389490536 valid 0.27238842844963074
AMEX metric train 0.5622904316000089 valid 0.7186406691629785


EPOCH 2: 100%|███████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 48.69it/s]


LOSS train 0.2646306982535995 valid 0.2474704533815384
AMEX metric train 0.7281802878591545 valid 0.751721849213594


EPOCH 3: 100%|███████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 48.73it/s]


LOSS train 0.2514062706339225 valid 0.24342888593673706
AMEX metric train 0.7503392859573731 valid 0.7639981335004394


EPOCH 4: 100%|███████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 48.71it/s]


LOSS train 0.24448585577225418 valid 0.23735807836055756
AMEX metric train 0.7580422909912574 valid 0.769780667000673


EPOCH 5: 100%|███████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.28it/s]


LOSS train 0.24163941084668877 valid 0.23646026849746704
AMEX metric train 0.7606216692302461 valid 0.773709865663206


EPOCH 6: 100%|███████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.19it/s]


LOSS train 0.23908414047085838 valid 0.2332218438386917
AMEX metric train 0.7675756405329249 valid 0.7779207161009236


EPOCH 7: 100%|███████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.39it/s]


LOSS train 0.23588028667348154 valid 0.23453271389007568
AMEX metric train 0.7697534111600287 valid 0.7794573416431376


EPOCH 8: 100%|███████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 44.61it/s]


LOSS train 0.2333378666237499 valid 0.22462153434753418
AMEX metric train 0.7741466305477933 valid 0.7857406708001174


EPOCH 9: 100%|███████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 49.00it/s]


LOSS train 0.232428351982256 valid 0.22408828139305115
AMEX metric train 0.7765115699804226 valid 0.7881993894544327


EPOCH 10: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 48.90it/s]


LOSS train 0.2299778193570255 valid 0.22078512609004974
AMEX metric train 0.7809023289587772 valid 0.7917111402591096


EPOCH 11: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 49.04it/s]


LOSS train 0.22782183731539865 valid 0.22478976845741272
AMEX metric train 0.7836455971501474 valid 0.7981618180846662


EPOCH 12: 100%|██████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.31it/s]


LOSS train 0.22499398786700173 valid 0.21714532375335693
AMEX metric train 0.7880768311031928 valid 0.7996890601679875


EPOCH 13: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 44.88it/s]


LOSS train 0.2261721861831258 valid 0.21864835917949677
AMEX metric train 0.7871542984495714 valid 0.8014989168468776


EPOCH 14: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 44.58it/s]


LOSS train 0.2205199816923463 valid 0.21272417902946472
AMEX metric train 0.7925713041631075 valid 0.8016777275375853


EPOCH 15: 100%|██████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.11it/s]


LOSS train 0.21932380102323681 valid 0.20893004536628723
AMEX metric train 0.7960390862584771 valid 0.8131654012480037


EPOCH 16: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 45.18it/s]


LOSS train 0.21772775824150342 valid 0.2086150348186493
AMEX metric train 0.8001984408652408 valid 0.814034710079006


EPOCH 17: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 48.89it/s]


LOSS train 0.21386683656928246 valid 0.2045663595199585
AMEX metric train 0.8059786561042942 valid 0.8162932625840926


EPOCH 18: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 49.06it/s]


LOSS train 0.21135514689965196 valid 0.2025982141494751
AMEX metric train 0.8099177701256985 valid 0.8245440969043561


EPOCH 19: 100%|██████████████████████████████████████████████████████████| 89/89 [00:02<00:00, 44.38it/s]


LOSS train 0.20953080895241727 valid 0.19339419901371002
AMEX metric train 0.8150925614393286 valid 0.8354560274730982


EPOCH 20: 100%|██████████████████████████████████████████████████████████| 89/89 [00:01<00:00, 44.60it/s]


LOSS train 0.20401950582359615 valid 0.19352427124977112
AMEX metric train 0.8199797862989568 valid 0.8358212309964939


In [138]:
len(train_loader)

89

In [144]:
!du -sh /home/silo1/mas322/amex-default-prediction/derived/*

4.2G	/home/silo1/mas322/amex-default-prediction/derived/processed-splits
1.5G	/home/silo1/mas322/amex-default-prediction/derived/splits
14G	/home/silo1/mas322/amex-default-prediction/derived/test_data.feather
3.1G	/home/silo1/mas322/amex-default-prediction/derived/test.parquet
6.4G	/home/silo1/mas322/amex-default-prediction/derived/train_data.feather
31M	/home/silo1/mas322/amex-default-prediction/derived/train_labels.feather
1.6G	/home/silo1/mas322/amex-default-prediction/derived/train.parquet


## Example torch model

In [4]:
df2 = cudf.read_parquet(os.path.join(dataset_dir, 'derived', 'splits', 'train_0.parquet'))

# look at average values for last 3 months
df_train = df2.groupby('customer_ID', sort=False).tail(3).groupby('customer_ID').mean()
df_train

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-7948374999843709812,0.456345,5.000000,0.026282,1.004786,0.005375,0.104417205,0.055832,0.008278,,0.1532619,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.006185,0.000000,0.0
-8645856538965131241,0.756145,6.000000,0.721992,0.293841,0.006862,0.597289562,0.000000,0.012906,0.020493843,,...,-1.0,-1.0,-0.333333,0.0,0.000000,,-0.333333,0.007025,-0.333333,0.0
-8791650985376527784,0.902983,0.000000,0.015144,1.006431,0.005585,-0.04807209,0.000000,0.005553,,,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.004280,0.000000,0.0
-8454268923601939076,0.823955,10.000000,0.029923,1.006844,0.002948,0.080841521,0.000000,0.009003,,0.01998057,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.006056,0.000000,0.0
-7657899299564496661,0.704843,8.666667,0.010633,0.939805,0.006357,0.090517749,0.000000,0.010066,0.023059726,,...,-1.0,-1.0,1.000000,0.0,0.862662,0.057864944,1.000000,0.007062,4.333333,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-8713127797801704272,0.485146,6.333333,0.458467,0.024092,0.172573,0.283588727,0.000000,0.549716,,0.019031034,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.004491,0.000000,0.0
-7807490439052895278,0.815616,0.000000,0.004466,0.814304,0.004319,0.413618008,0.000000,0.005084,,0.01740142,...,-1.0,-1.0,1.000000,0.0,0.985106,0.573939919,1.000000,0.760147,2.000000,0.0
-8686894840230819858,0.385107,7.000000,0.024687,1.006793,0.003685,0.136240949,0.000000,0.005758,,0.058567395,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.005170,0.000000,0.0
-8354179577782462610,0.871444,1.000000,0.046527,0.148453,0.005374,0.17705514,0.000000,0.120511,,,...,-1.0,-1.0,0.000000,0.0,0.000000,,0.000000,0.004699,0.000000,0.0


In [5]:
X_train = torch.tensor(df_train.fillna(0.5).iloc[:, :(df_train.shape[1] - 1)].values, dtype = torch.float32)

In [6]:
y_train = torch.tensor(df_train.target, dtype=torch.float32)

In [7]:
class ExampleNet(nn.Module):
    
    def __init__(self):
        super(ExampleNet, self).__init__()
        
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc3 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = F.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.fc3(x)
        
        return x
    
mod = ExampleNet()
mod

ExampleNet(
  (fc1): Linear(in_features=188, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)

In [8]:
criterion = nn.BCEWithLogitsLoss()
train_dat = torch.utils.data.DataLoader(torch.hstack((X_train, y_train.unsqueeze(1))), batch_size = 64)
optim = torch.optim.Adam(mod.parameters(), lr = 0.001)


def train_one_epoch(epoch_index):
    running_loss = 0.
    num_batches = 0
    
    
    for i, data in enumerate(train_dat):
        inputs, labels = data[:, :-1], data[:, -1]
        inputs = inputs.to('cuda')
        labels = labels.unsqueeze(1).to('cuda')
        
        optim.zero_grad()
        
        out = mod(inputs)
        
        loss = criterion(out, labels)
        loss.backward()
        
        optim.step()
        
        # Gather data and report
        running_loss += loss.item()
        num_batches += 1
    
    return running_loss / num_batches

# move everything to the GPU
mod = mod.to('cuda')

In [9]:
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    mod.train(True)
    avg_loss = train_one_epoch(epoch_number)
    print(avg_loss)

    epoch_number += 1

EPOCH 1:
0.32613843656599023
EPOCH 2:
0.26247388131210897
EPOCH 3:
0.2575088815042401
EPOCH 4:
0.2529879827285834
EPOCH 5:
0.2500592030268318


In [11]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [18]:
mod.train(False)

y_target_all = None
y_pred_all = None

for dat in train_dat:
    X_train, y_train = dat[:, :-1], dat[:, -1].unsqueeze(1)
    X_train = X_train.to('cuda')
    y_train = y_train.to('cuda')
    
    y_pred = torch.sigmoid(mod(X_train))
    
    if y_target_all is None:
        y_target_all = y_train.to('cpu').squeeze(1)
        y_pred_all = y_pred.to('cpu').squeeze(1)
    else:
        y_target_all = torch.hstack([y_target_all, y_train.squeeze(1).to('cpu')])
        y_pred_all = torch.hstack([y_pred_all, y_pred.squeeze(1).to('cpu')])

In [24]:
y_true = pd.DataFrame({'target' : y_target_all.detach().numpy()})
y_pred = pd.DataFrame({'prediction' : y_pred_all.detach().numpy()})

amex_metric(y_true, y_pred)

0.7600681393020587

## Example preprocessing code

The below code is taken from the example processing of the training data, by Chris Deotte, on their [TensorFlow GRU Starter](https://www.kaggle.com/code/cdeotte/tensorflow-gru-starter-0-790) kaggle discussion post.

TODO: look at the below code and change accordingly...
also figure out how to do this withou `cudf`, the improved dataframe library...
Consider if need this library later when getting on to the preprocessing methods...

In [None]:
# CALCULATE SIZE OF EACH SEPARATE FILE
def get_rows(customers, train, NUM_FILES = 10, verbose = ''):
    chunk = len(customers)//NUM_FILES
    if verbose != '':
        print(f'We will split {verbose} data into {NUM_FILES} separate files.')
        print(f'There will be {chunk} customers in each file (except the last file).')
        print('Below are number of rows in each file:')
    rows = []

    for k in range(NUM_FILES):
        if k==NUM_FILES-1: cc = customers[k*chunk:]
        else: cc = customers[k*chunk:(k+1)*chunk]
        s = train.loc[train.customer_ID.isin(cc)].shape[0]
        rows.append(s)
    if verbose != '': print( rows )
    return rows

if PROCESS_DATA:
    NUM_FILES = 10
    rows = get_rows(customers, train, NUM_FILES = NUM_FILES, verbose = 'train')

In [None]:
def feature_engineer(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None):
        
    # REDUCE STRING COLUMNS 
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    train.S_2 = cudf.to_datetime( train.S_2 )
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']
        
    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_map).fillna(1).astype('int8')
    
    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(CATS,OFFSETS):
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')
    CATS += ['D_63','D_64']
    
    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feaure_190'] etc etc etc
    
    # REDUCE MEMORY DTYPE
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')
            
    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
    if PAD_CUSTOMER_TO_13_ROWS:
        tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
        more = cupy.array([],dtype='int64') 
        for j in range(1,13):
            i = tmp.loc[tmp==j].index.values
            more = cupy.concatenate([more,cupy.repeat(i,13-j)])
        df = train.iloc[:len(more)].copy().fillna(0)
        df = df * 0 - 1 #pad numerical columns with -1
        df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
        df['customer_ID'] = more
        train = cudf.concat([train,df],axis=0,ignore_index=True)
        
    # ADD TARGETS (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')
        
    # FILL NAN
    train = train.fillna(-0.5) #this applies to numerical columns
    
    # SORT BY CUSTOMER THEN DATE
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)
    
    # REARRANGE COLUMNS WITH 11 CATS FIRST
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]
    
    return train

In [None]:
if PROCESS_DATA:
    # CREATE PROCESSED TRAIN FILES AND SAVE TO DISK        
    for k in range(NUM_FILES):

        # READ CHUNK OF TRAIN CSV FILE
        skip = int(np.sum( rows[:k] ) + 1) #the plus one is for skipping header
        train = cudf.read_csv('../input/amex-default-prediction/train_data.csv', nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

        # FEATURE ENGINEER DATAFRAME
        train = feature_engineer(train, targets = targets)

        # SAVE FILES
        print(f'Train_File_{k+1} has {train.customer_ID.nunique()} customers and shape',train.shape)
        tar = train[['customer_ID','target']].drop_duplicates().sort_index()
        if not os.path.exists(PATH_TO_DATA): os.makedirs(PATH_TO_DATA)
        tar.to_parquet(f'{PATH_TO_DATA}targets_{k+1}.pqt',index=False)
        data = train.iloc[:,1:-1].values.reshape((-1,13,188))
        cupy.save(f'{PATH_TO_DATA}data_{k+1}',data.astype('float32'))

    # CLEAN MEMORY
    del train, tar, data
    del targets
    gc.collect()