Inspired by https://www.kaggle.com/code/cdeotte/tensorflow-gru-starter-0-790

In [1]:
import datetime
import gc
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K 
# GPU LIBRARIES, are these useful? Do they result in any meaningful speedup? I have replaced cupy with np and cudf with pd in the below code.
#import cupy, cudf 

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Paths

# Project root. Defaults to the original local checkout; set the AMEX_MAIN_PATH
# environment variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations below rely on.
MAIN_PATH = os.path.join(
    os.environ.get("AMEX_MAIN_PATH", "/home/mahesh/Desktop/ML/kaggle/amex/"), ""
)

# Data
PATH_TO_DATA                = MAIN_PATH + "data/"
PATH_TO_PROCESSED_DATA      = PATH_TO_DATA + "processed/"
FILENAME_TRAIN_DATA_CSV     = PATH_TO_DATA + "orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV   = PATH_TO_DATA + "orig/train_labels.csv"
FILENAME_CID_MAP            = PATH_TO_PROCESSED_DATA + "cid_map.csv"
FILENAME_TRAIN_DATA_FEATHER = PATH_TO_PROCESSED_DATA + "train_data.f"

# Models
PATH_TO_MODEL   = MAIN_PATH + "models/"

In [None]:
# Load the full raw training CSV; the timestamps printed before and after
# bracket the read so its duration can be seen in the output.
print(datetime.datetime.now())
# Disabled alternative: load a feather copy of the data instead of the CSV.
#train_df = pd.read_feather(FILENAME_TRAIN_DATA_FEATHER)
train_df = pd.read_csv(FILENAME_TRAIN_DATA_CSV)
# Keep the column names: the fold-processing cell re-reads the CSV in chunks
# with header=None and needs them for names=T_COLS.
T_COLS = train_df.columns
print(datetime.datetime.now())
# Per-column dtypes plus "deep" memory usage of the loaded frame.
train_df.info(memory_usage="deep")

2022-08-12 18:12:17.580682


In [31]:
# Load the training labels and replace each customer_ID string with the int64
# parsed from its last 16 hexadecimal characters, the same encoding that
# feature_engineer applies to the feature rows, so the two can be merged.
targets = pd.read_csv(FILENAME_TRAIN_LABELS_CSV).assign(
    customer_ID=lambda labels: labels['customer_ID']
    .str[-16:]
    .apply(lambda x: int(x, 16))
    .astype('int64')
)

#### Split train data into folds

For now we will just have 10 folds of non-stratified data.   
***todo:*** *Need to stratify the folds so that they have a similar distribution of data. One way to do this is to stratify on the distribution of the output labels.*

In [9]:
# Unique customer IDs as a flat numpy array, kept in order of first appearance
# in the training frame (drop_duplicates keeps the first row of each ID and
# sort_index restores the original row order).
customers = (
    train_df["customer_ID"]
    .drop_duplicates()
    .sort_index()
    .values
    .flatten()
)
len(customers)
type(customers)

numpy.ndarray

In [16]:
def get_rows(customers, train, NUM_FOLDS = 10, verbose = ''):
    """Count how many rows of `train` fall into each customer fold.

    `customers` is cut into NUM_FOLDS consecutive slices of
    len(customers)//NUM_FOLDS IDs each; the last slice also takes whatever
    remains. For every slice the number of rows in `train` whose customer_ID
    is in that slice is recorded.

    Parameters
    ----------
    customers : sequence of customer IDs (sliceable, e.g. a numpy array).
    train : DataFrame with a `customer_ID` column.
    NUM_FOLDS : number of folds to create.
    verbose : label used in the progress messages; '' disables printing.

    Returns
    -------
    list of int, one row count per fold.
    """
    chunk = len(customers) // NUM_FOLDS
    talkative = verbose != ''
    if talkative:
        print(f'We will split {verbose} data into {NUM_FOLDS} separate folds.')
        print(f'There will be {chunk} customers in each fold (except the last fold).')
        print('Below are number of rows in each fold:')

    last_fold = NUM_FOLDS - 1
    rows = []
    for fold in range(NUM_FOLDS):
        start = fold * chunk
        # The final fold is open-ended so it absorbs the remainder customers.
        stop = None if fold == last_fold else start + chunk
        in_fold = train.customer_ID.isin(customers[start:stop])
        rows.append(train.loc[in_fold].shape[0])

    if talkative:
        print( rows )
    return rows

# Number of customer folds the training data is split into.
NUM_FOLDS = 10
# rows[k] = number of rows of train_df that belong to fold k; the
# fold-processing cell uses these counts to read the CSV one fold at a time.
rows = get_rows(customers, train_df, NUM_FOLDS = NUM_FOLDS, verbose = 'train')

# The full frame is not needed once the row counts are known; release it.
del train_df
gc.collect()

We will split train data into 10 separate folds.
There will be 45891 customers in each fold (except the last fold).
Below are number of rows in each fold:
[553403, 552855, 554025, 554330, 552004, 552378, 552822, 553151, 553493, 552990]


0

#### Feature Engineering:

    1. Handle NaNs 
    2. Make each customer have 13 months(?) of data, add zeroed out months? 
    3. Save processed data to disk? 
    4. Train/Validation split?
    

In [24]:
def feature_engineer(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None):
    """Turn a raw chunk of statement rows into a model-ready frame.

    Processing steps, in order:
      1. customer_ID -> int64 parsed from its last 16 hex characters;
         S_2 -> temporary int8 `year`/`month`/`day` columns (used for sorting).
      2. Label-encode the 11 categorical columns as int8 with
         0 = padding, 1 = NaN, 2, 3, 4, ... = actual values.
      3. Downcast remaining int64/float64 columns to int32/float32.
      4. Optionally append padding rows (categoricals 0, everything else -1)
         so that every customer with fewer than 13 rows ends up with 13.
      5. Optionally left-merge `targets` on customer_ID (target -> int8).
      6. Fill remaining NaNs with -0.5, sort by customer and date, drop the
         date helper columns, and order columns as
         customer_ID, 11 categoricals, everything else.

    NOTE: `train` is modified in place during steps 1-3; use the returned
    frame and do not rely on the argument afterwards.

    Parameters
    ----------
    train : DataFrame with string `customer_ID`, date-like `S_2`, the
        categorical columns listed in CATS plus `D_63`/`D_64`, and numeric
        feature columns.
    PAD_CUSTOMER_TO_13_ROWS : whether to add padding rows (step 4).
    targets : optional DataFrame with int64 `customer_ID` and `target`.

    Returns
    -------
    DataFrame sorted by customer then date; padding rows sort before a
    customer's real rows because their date fields are -1.
    """
    # REDUCE STRING COLUMNS
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    # NOTE(review): 16 hex digits can exceed the int64 range; this relies on
    # the int64 cast wrapping such values the same way it does for `targets`.
    train['customer_ID'] = train['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
    train['S_2'] = pd.to_datetime( train['S_2'] )
    train['year'] = (train['S_2'].dt.year-2000).astype('int8')
    train['month'] = (train['S_2'].dt.month).astype('int8')
    train['day'] = (train['S_2'].dt.day).astype('int8')
    del train['S_2']

    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train['D_63'].map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train['D_64'].map(d_64_map).fillna(1).astype('int8')

    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(CATS,OFFSETS):
        train[c] = (train[c] + s).fillna(1).astype('int8')
    CATS += ['D_63','D_64']

    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feature_190'] etc etc etc

    # REDUCE MEMORY DTYPE
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        elif str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')

    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
    if PAD_CUSTOMER_TO_13_ROWS:
        n_rows = 13
        counts = train.groupby('customer_ID').size()
        # Customers with more than 13 rows are left untouched.
        short = counts[counts < n_rows]
        # One entry per missing row: each short customer's ID repeated
        # (13 - its row count) times.
        more = np.repeat(short.index.values, n_rows - short.values).astype('int64')
        if len(more) > 0:
            # Build the padding rows directly instead of recycling the first
            # len(more) rows of `train`: that template ran out of rows (and
            # the customer_ID assignment failed) whenever a chunk needed more
            # padding rows than it had real rows.
            pad = pd.DataFrame({
                c: np.full(len(more), 0 if c in CATS else -1, dtype=train[c].dtype)
                for c in train.columns
            })  # categorical columns padded with 0, all others with -1
            pad['customer_ID'] = more
            train = pd.concat([train,pad],axis=0,ignore_index=True)

    # ADD TARGETS (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train['target'] = train['target'].astype('int8')

    # FILL NAN
    train = train.fillna(-0.5) #this applies to numerical columns

    # SORT BY CUSTOMER THEN DATE
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)

    # REARRANGE COLUMNS WITH 11 CATS FIRST
    # (customer_ID is located by name rather than assumed to be column 0)
    others = [c for c in train.columns if c != 'customer_ID' and c not in CATS]
    train = train[['customer_ID'] + CATS + others]

    return train

In [32]:
# CREATE PROCESSED TRAIN FOLDS AND SAVE TO DISK
# Make sure the output directory exists; without it the first to_parquet /
# np.save call fails on a fresh checkout.
os.makedirs(PATH_TO_PROCESSED_DATA, exist_ok=True)

print(datetime.datetime.now())
for k in range(NUM_FOLDS):

    # READ CHUNK OF TRAIN CSV FILE
    # Skip the header line plus all rows of the previous folds, then read
    # exactly the rows that belong to fold k.
    skip = int(np.sum( rows[:k] ) + 1) #the plus one is for skipping header
    train = pd.read_csv(FILENAME_TRAIN_DATA_CSV, nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

    # FEATURE ENGINEER DATAFRAME
    train = feature_engineer(train, targets = targets)

    # SAVE FILES
    print(f'Train_File_{k+1} has {train.customer_ID.nunique()} customers and shape',train.shape)
    # One (customer_ID, target) row per customer, in the same customer order
    # as the sequences written to the .npy file below.
    tar = train[['customer_ID','target']].drop_duplicates().sort_index()
    tar.to_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k+1}.pqt',index=False)
    # Drop customer_ID (first column) and target (last column); the reshape
    # requires exactly 13 rows per customer and 188 feature columns.
    data = train.iloc[:,1:-1].values.reshape((-1,13,188))
    np.save(f'{PATH_TO_PROCESSED_DATA}data_{k+1}',data.astype('float32'))

    # CLEAN MEMORY
    del train, tar, data
    gc.collect()
# NOTE: `targets` is released here, so the cell that loads it must be re-run
# before this cell can be executed again.
del targets
gc.collect()
print(datetime.datetime.now())

Train_File_1 has 45891 customers and shape (596583, 190)
Train_File_2 has 45891 customers and shape (596583, 190)
Train_File_3 has 45891 customers and shape (596583, 190)
Train_File_4 has 45891 customers and shape (596583, 190)
Train_File_5 has 45891 customers and shape (596583, 190)
Train_File_6 has 45891 customers and shape (596583, 190)
Train_File_7 has 45891 customers and shape (596583, 190)
Train_File_8 has 45891 customers and shape (596583, 190)
Train_File_9 has 45891 customers and shape (596583, 190)
Train_File_10 has 45894 customers and shape (596622, 190)


0

#### Model building:

In [3]:
def build_model():
    """Build and compile the GRU classifier.

    Input is a (13, 188) sequence per sample. The first 11 features are
    categorical codes, each sent through its own Embedding(10, 4) layer; the
    remaining features are used as-is. Embeddings and numeric features are
    concatenated and fed to a GRU(128) followed by Dense(64, relu),
    Dense(32, relu) and a single sigmoid output. The model is compiled with
    Adam (learning rate 0.001) and binary cross-entropy.

    Returns
    -------
    Compiled tf.keras.Model.
    """
    n_cat = 11

    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    inp = tf.keras.Input(shape=(13,188))
    cat_vectors = [
        tf.keras.layers.Embedding(10,4)(inp[:,:,k])
        for k in range(n_cat)
    ]
    features = tf.keras.layers.Concatenate()([inp[:,:,n_cat:]] + cat_vectors)

    # SIMPLE RNN BACKBONE
    hidden = tf.keras.layers.GRU(units=128, return_sequences=False)(features)
    for width in (64, 32):
        hidden = tf.keras.layers.Dense(width,activation='relu')(hidden)

    # OUTPUT
    prob = tf.keras.layers.Dense(1,activation='sigmoid')(hidden)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=inp, outputs=prob)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss_fn, optimizer = optimizer)

    return model

In [4]:
# CUSTOM LEARNING SCHEDULE
def lrfn(epoch):
    """Return the learning rate for the given 0-based epoch.

    Schedule: 1e-3 for epochs 0-4, 1e-4 for epochs 5-6, 1e-5 from epoch 7 on.
    Epochs beyond the end of the table keep the final rate instead of raising
    IndexError, so training with more than 8 epochs does not crash.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    return lr[min(epoch, len(lr) - 1)]
# Keras callback wrapping lrfn; it is passed to model.fit via callbacks=[LR]
# so the learning rate follows the schedule above.
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)

In [5]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Competition metric: mean of the normalized weighted Gini and the
    positive capture rate in the top 4% (by weight) of predictions.

    Rows with label 0 get weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : 1-D array-like of labels.
    y_pred : 1-D array-like of predicted scores, same length as y_true.

    Returns
    -------
    0.5 * (gini(sorted by prediction) / gini(sorted by label) + top_four)
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _ranked(col):
        # Rows ordered by column `col`, largest first, plus their weights.
        ordered = pairs[pairs[:, col].argsort()[::-1]]
        return ordered, np.where(ordered[:, 0] == 0, 20, 1)

    def _gini(col):
        ordered, weight = _ranked(col)
        weight_random = np.cumsum(weight / np.sum(weight))
        weighted_pos = ordered[:, 0] * weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - weight_random) * weight)

    # Share of all positives found within the top 4% of cumulative weight.
    by_pred, weights = _ranked(1)
    kept = by_pred[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(kept[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the prediction ordering, normalized by the Gini of the ideal
    # (label) ordering.
    gini_pred = _gini(1)
    gini_true = _gini(0)

    return 0.5 * (gini_pred / gini_true + top_four)

#### Model Training:

In [6]:
if True:
    # Make sure the weights directory exists; save_weights below fails on a
    # fresh checkout otherwise.
    os.makedirs(PATH_TO_MODEL, exist_ok=True)

    # SAVE TRUE AND OOF
    # Labels and out-of-fold predictions accumulated over all folds.
    true = np.array([])
    oof = np.array([])
    VERBOSE = 2 # use 1 for interactive 

    # 5-fold CV over the 10 preprocessed files: each fold validates on two
    # consecutive files and trains on the other eight.
    for fold in range(5):

        # INDICES OF TRAIN AND VALID FOLDS
        valid_idx = [2*fold+1, 2*fold+2]
        train_idx = [x for x in [1,2,3,4,5,6,7,8,9,10] if x not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train = []; y_train = []
        for k in train_idx:
            X_train.append( np.load(f'{PATH_TO_PROCESSED_DATA}data_{k}.npy'))
            y_train.append( pd.read_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k}.pqt') )
        X_train = np.concatenate(X_train,axis=0)
        y_train = pd.concat(y_train).target.values
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid = []; y_valid = []
        for k in valid_idx:
            X_valid.append( np.load(f'{PATH_TO_PROCESSED_DATA}data_{k}.npy'))
            y_valid.append( pd.read_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k}.pqt') )
        X_valid = np.concatenate(X_valid,axis=0)
        y_valid = pd.concat(y_valid).target.values
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        # BUILD AND TRAIN MODEL
        # Reset Keras state through tf.keras, the same API the model is
        # built with, so each fold starts from a clean graph.
        tf.keras.backend.clear_session()
        model = build_model()
        h = model.fit(X_train,y_train, 
                      validation_data = (X_valid,y_valid),
                      batch_size=512, epochs=8, verbose=VERBOSE,
                      callbacks = [LR])
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold+1}.h5')

        # INFER VALID DATA
        print('Inferring validation data...')
        p = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()

        print()
        print(f'Fold {fold+1} CV=', amex_metric_mod(y_valid, p) )
        print()
        true = np.concatenate([true, y_valid])
        oof = np.concatenate([oof, p])
        
        # CLEAN MEMORY
        del model, X_train, y_train, X_valid, y_valid, p
        gc.collect()

    # PRINT OVERALL RESULTS
    # Metric over the concatenated out-of-fold predictions of all folds.
    print('#'*25)
    print(f'Overall CV =', amex_metric_mod(true, oof) )
    tf.keras.backend.clear_session()

#########################
### Fold 1 with valid files [1, 2]
### Training data shapes (367131, 13, 188) (367131,)
### Validation data shapes (91782, 13, 188) (91782,)
#########################
Epoch 1/8
718/718 - 23s - loss: 0.2378 - val_loss: 0.2373 - lr: 0.0010 - 23s/epoch - 32ms/step
Epoch 2/8
718/718 - 21s - loss: 0.2268 - val_loss: 0.2288 - lr: 0.0010 - 21s/epoch - 29ms/step
Epoch 3/8
718/718 - 21s - loss: 0.2231 - val_loss: 0.2262 - lr: 0.0010 - 21s/epoch - 29ms/step
Epoch 4/8
718/718 - 21s - loss: 0.2207 - val_loss: 0.2263 - lr: 0.0010 - 21s/epoch - 29ms/step
Epoch 5/8
718/718 - 21s - loss: 0.2184 - val_loss: 0.2281 - lr: 0.0010 - 21s/epoch - 29ms/step
Epoch 6/8
718/718 - 21s - loss: 0.2120 - val_loss: 0.2233 - lr: 1.0000e-04 - 21s/epoch - 29ms/step
Epoch 7/8
718/718 - 21s - loss: 0.2107 - val_loss: 0.2231 - lr: 1.0000e-04 - 21s/epoch - 30ms/step
Epoch 8/8
718/718 - 21s - loss: 0.2094 - val_loss: 0.2231 - lr: 1.0000e-05 - 21s/epoch - 29ms/step
Inferring validation data...
180/18

#### Model evaluation: