In [1]:
import datetime
import gc
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.regularizers import L2
# GPU LIBRARIES, are these useful? Do they result in any meaningful speedup? I have replaced cupy with np and cudf with pd in the below code.
#import cupy, cudf 

from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

### High level steps

**EDA:** See what data we have.  
**Data wrangling:** Handle N/A values, handle categorical data, and normalize real-valued data. Not all customers have 13 time steps of data (what should we do for these? Is the missing data non-contiguous?).  
**Feature engineering:** Understand which features are important (how? Run a random forest model and see what the key features are?). Dimensionality reduction? Drop any features?  
**Train, val, test split:** Split data.  
**Models:** Start with a simple model. Ensembles? Have a mix of simple and advanced algorithms. Use embeddings to express users along latent dimensions with different models and then build an ensemble? What would the embeddings be used for here?  


**Flow:**  
feature engineering pipeline -> save/restore data to/from disk (optional depending on how long the feature engineering takes) -> Build models -> evaluate models -> iterate


In [2]:
# Report how many GPUs TensorFlow can see and confirm the build has CUDA support.
# NOTE: is_built_with_cuda() only describes the TensorFlow binary; the device
# list is what tells us whether a GPU is actually available for training.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if not gpus:
    # Warn instead of failing so the notebook still runs on CPU-only machines.
    print("WARNING: no GPU is visible to TensorFlow; it will fall back to the CPU.")
assert tf.test.is_built_with_cuda(), "TensorFlow was not built with CUDA support"

Num GPUs Available:  0


In [3]:
# Paths

# Project root on the local machine; every other path hangs off this one.
MAIN_PATH = "/home/mahesh/Desktop/ML/kaggle/amex/"

# Data: raw competition files live under data/orig/, derived files under data/processed/
PATH_TO_DATA = f"{MAIN_PATH}data/"
PATH_TO_PROCESSED_DATA = f"{PATH_TO_DATA}processed/"
FILENAME_TRAIN_DATA_CSV = f"{PATH_TO_DATA}orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV = f"{PATH_TO_DATA}orig/train_labels.csv"
FILENAME_CID_MAP = f"{PATH_TO_PROCESSED_DATA}cid_map.csv"
FILENAME_TRAIN_DATA_FEATHER = f"{PATH_TO_PROCESSED_DATA}train_data.f"

# Models: directory the per-fold weights are written to
PATH_TO_MODEL = f"{MAIN_PATH}models/"

#### EDA

#### Model dev

In [4]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric: mean of normalized Gini and top-4% capture.

    Rows whose true label is 0 are given weight 20, all other rows weight 1.

    Args:
        y_true: 1-D sequence of true labels.
        y_pred: 1-D sequence of predicted scores, same length as ``y_true``.

    Returns:
        ``0.5 * (G + D)`` where ``D`` is the share of all positives found among
        the highest-scored rows whose cumulative weight stays within 4% of the
        total weight, and ``G`` is the weighted Gini of the prediction ordering
        divided by the weighted Gini of the true-label ordering.
    """
    # Column 0 holds the true label, column 1 the predicted score.
    pairs = np.transpose(np.array([y_true, y_pred]))

    # --- top-4% capture rate ---
    by_score   = pairs[pairs[:, 1].argsort()[::-1]]
    row_weight = np.where(by_score[:, 0] == 0, 20, 1)
    budget     = int(0.04 * np.sum(row_weight))
    captured   = by_score[np.cumsum(row_weight) <= budget]
    top_four   = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini after ordering rows by the given column, descending.
        ordered       = pairs[pairs[:, sort_col].argsort()[::-1]]
        weight        = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos     = np.sum(ordered[:, 0] * weight)
        cum_pos_found = np.cumsum(ordered[:, 0] * weight)
        lorentz       = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Same evaluation order as before: prediction ordering first, then label ordering.
    gini_pred  = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)

    return 0.5 * (gini_pred / gini_ideal + top_four)

In [5]:
def build_model():
    """Build and compile the LSTM classifier trained on each CV fold.

    The model takes one (13, 188) array per sample. Following the layout noted
    in this notebook, the first 11 feature columns are treated as categorical
    and the remaining 177 as numeric. Each categorical column gets its own
    ``Embedding(10, 4)``; the embedded columns are concatenated after the
    numeric columns and passed through
    ``LSTM(512) -> Dense(128, relu) -> Dense(1, sigmoid)``.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 0.001) and
        binary cross-entropy loss.
    """
    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    seq_input = tf.keras.Input(shape=(13,188))

    # One independent embedding table per categorical column.
    cat_embedded = [
        tf.keras.layers.Embedding(10,4)(seq_input[:,:,col])
        for col in range(11)
    ]
    features = tf.keras.layers.Concatenate()([seq_input[:,:,11:]] + cat_embedded)

    # RECURRENT BACKBONE - only the final hidden state is passed on
    hidden = tf.keras.layers.LSTM(units=512, return_sequences=False)(features)
    hidden = tf.keras.layers.Dense(128, activation='relu')(hidden)

    # OUTPUT
    prob = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=seq_input, outputs=prob)
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    bce = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=bce, optimizer=adam)

    return model

In [6]:
import math
TOTAL_EPOCHS = 12

# CUSTOM LEARNING SCHEDULE
def lrfn(epoch):
    """Return the learning rate for a zero-based epoch index.

    An 8-step schedule (5 x 1e-3, 2 x 1e-4, 1 x 1e-5) is stretched over
    TOTAL_EPOCHS by mapping ``epoch / TOTAL_EPOCHS`` onto a position in it.

    Args:
        epoch: zero-based epoch index.

    Returns:
        The learning rate for that epoch. Epochs at or beyond TOTAL_EPOCHS
        reuse the final (smallest) rate.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    i = math.floor(len(lr) * (epoch/TOTAL_EPOCHS))
    # Clamp so epoch >= TOTAL_EPOCHS (e.g. fitting for extra epochs) does not
    # index past the end of the schedule.
    return lr[min(i, len(lr) - 1)]
# Keras callback that applies lrfn at the start of every epoch, without per-epoch log lines.
LR = tf.keras.callbacks.LearningRateScheduler(schedule=lrfn, verbose=0)

In [7]:
def _load_shards(shard_ids):
    """Load and concatenate pre-processed feature/target shards.

    Args:
        shard_ids: iterable of shard numbers; shard ``k`` is read from
            ``data_{k}.npy`` and ``targets_{k}.pqt`` under PATH_TO_PROCESSED_DATA.

    Returns:
        ``(X, y)``: the feature arrays concatenated along axis 0 and the
        values of the ``target`` column of the concatenated target frames.
    """
    features = [np.load(f'{PATH_TO_PROCESSED_DATA}data_{k}.npy') for k in shard_ids]
    targets = [pd.read_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k}.pqt') for k in shard_ids]
    return np.concatenate(features, axis=0), pd.concat(targets).target.values


# 5-fold cross-validation over the 10 shard files (two validation shards per fold).
# For each fold: train a fresh model, save its weights, score the held-out shards,
# and accumulate the out-of-fold predictions for an overall score at the end.
if True:
    # Create the weights directory up front so a missing folder fails
    # immediately rather than after the first fold has finished training.
    os.makedirs(PATH_TO_MODEL, exist_ok=True)

    # SAVE TRUE AND OOF
    true = np.array([])
    oof = np.array([])
    VERBOSE = 2 # use 1 for interactive 
    BATCH_SIZE = 2048

    for fold in range(5):
        
        print(datetime.datetime.now())

        # INDICES OF TRAIN AND VALID FOLDS
        valid_idx = [2*fold+1, 2*fold+2]
        train_idx = [x for x in range(1, 11) if x not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train, y_train = _load_shards(train_idx)
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid, y_valid = _load_shards(valid_idx)
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        # BUILD AND TRAIN MODEL
        # Reset Keras global state so each fold starts from a fresh graph.
        K.clear_session()
        model = build_model()
        h = model.fit(X_train,y_train, 
                      validation_data = (X_valid,y_valid),
                      batch_size=BATCH_SIZE, epochs=TOTAL_EPOCHS, verbose=VERBOSE,
                      callbacks = [LR])
        model.save_weights(f'{PATH_TO_MODEL}LSTM_fold_{fold+1}.h5')

        # INFER VALID DATA
        print('Inferring validation data...')
        p = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()

        print()
        print(f'Fold {fold+1} CV=', amex_metric_mod(y_valid, p) )
        print()
        true = np.concatenate([true, y_valid])
        oof = np.concatenate([oof, p])
        
        # CLEAN MEMORY
        del model, X_train, y_train, X_valid, y_valid, p
        gc.collect()
        
        print(datetime.datetime.now())

    # PRINT OVERALL RESULTS
    print('#'*25)
    print('Overall CV =', amex_metric_mod(true, oof) )
    K.clear_session()

2022-08-17 14:30:51.452530
#########################
### Fold 1 with valid files [1, 2]
### Training data shapes (367131, 13, 188) (367131,)
### Validation data shapes (91782, 13, 188) (91782,)
#########################
Epoch 1/12
180/180 - 121s - loss: 0.2449 - val_loss: 0.2348 - lr: 0.0010 - 121s/epoch - 672ms/step
Epoch 2/12
180/180 - 119s - loss: 0.2278 - val_loss: 0.2288 - lr: 0.0010 - 119s/epoch - 661ms/step
Epoch 3/12
180/180 - 119s - loss: 0.2235 - val_loss: 0.2267 - lr: 0.0010 - 119s/epoch - 660ms/step
Epoch 4/12
180/180 - 119s - loss: 0.2211 - val_loss: 0.2281 - lr: 0.0010 - 119s/epoch - 660ms/step
Epoch 5/12
180/180 - 119s - loss: 0.2186 - val_loss: 0.2271 - lr: 0.0010 - 119s/epoch - 661ms/step
Epoch 6/12
180/180 - 119s - loss: 0.2164 - val_loss: 0.2264 - lr: 0.0010 - 119s/epoch - 661ms/step
Epoch 7/12
180/180 - 119s - loss: 0.2139 - val_loss: 0.2264 - lr: 0.0010 - 119s/epoch - 660ms/step
Epoch 8/12
180/180 - 119s - loss: 0.2108 - val_loss: 0.2264 - lr: 0.0010 - 119s/epoch -