In [1]:
import utils

utils.widen_ipython_window()

import pandas as pd
import datetime
import numpy as np
import gc
import tensorflow as tf
from keras import backend as K 



In [2]:
#
# Paths
#
# Every location below is derived from MAIN_PATH. The project root can be
# overridden without editing the notebook by setting the AMEX_MAIN_PATH
# environment variable; when it is unset the original default is used.
#

import os

# os.path.join(..., "") guarantees a trailing separator, which the string
# concatenations below rely on.
MAIN_PATH = os.path.join(
    os.environ.get("AMEX_MAIN_PATH", "/home/mahesh/Desktop/ML/kaggle/amex/"), "")

# Data
PATH_TO_DATA                = MAIN_PATH + "data/"
PATH_TO_PROCESSED_DATA      = PATH_TO_DATA + "processed/"
PATH_TO_PROCESSED2_DATA     = PATH_TO_DATA + "processed2/"
PATH_TO_PROCESSED4_DATA     = PATH_TO_DATA + "processed4/"

# Original competition CSV files
FILENAME_TRAIN_DATA_CSV     = PATH_TO_DATA + "orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV   = PATH_TO_DATA + "orig/train_labels.csv"
FILENAME_TEST_DATA_CSV      = PATH_TO_DATA + "orig/test_data.csv"
FILENAME_SAMPLE_SUBMISSION_CSV = PATH_TO_DATA + "orig/sample_submission.csv"

# Intermediate feather files
FILENAME_TRAIN_DATA_FEATHER = PATH_TO_PROCESSED_DATA + "train_data.f"
FILENAME_TRAIN_PROCESSED2_DATA_FEATHER   = PATH_TO_PROCESSED2_DATA + "train_data.f"
FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER = PATH_TO_PROCESSED2_DATA + "train_labels.f"
FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER   = PATH_TO_PROCESSED2_DATA + "train_data_cat_nochange.f"

# Test-set artefacts; FILENAME_TEST_HASH_DATA has no extension because
# np.save appends ".npy" and the loader adds it back explicitly.
FILENAME_TEST_CUSTOMER_HASHES  = PATH_TO_PROCESSED2_DATA + "test_customer_hashes_data.pq"
FILENAME_TEST_HASH_DATA        = PATH_TO_PROCESSED2_DATA + "test_hashes_data"
FILENAME_GRU_SUBMISSION        = PATH_TO_PROCESSED2_DATA + "submission_gru.csv"

# Feature-engineered training frame consumed by the GRU training cell
FILENAME_TRAIN_PROCESSED4_FE_DATA_RNN_FEATHER = PATH_TO_PROCESSED4_DATA + "train_FE_data_RNN.f"

# Models
PATH_TO_MODEL   = MAIN_PATH + "models/"

    1. Feature engineer the data
    2. Iterate over the folds
        a. Load the customer IDs corresponding to the train and val data
        b. Extract the data and labels corresponding to the customer IDs

In [25]:
# Switch for the guarded call at the bottom of this cell: set to 1 to rebuild
# the feature-engineered training file, leave at 0 to skip that step.
RUN_FEATURE_ENGINEERING = 0

def feature_engineer(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None, edit_cid_time = False):
    """Convert raw statement rows into the fixed-layout frame used by the GRU model.

    Steps: split the statement date ``S_2`` into year/month/day sort keys,
    label-encode the 11 categorical columns (0 = padding, 1 = NaN, 2+ = values),
    downcast 64-bit columns, optionally pad every customer to 13 rows,
    optionally merge targets, fill remaining NaNs with -0.5, sort by customer
    and date, and move the categorical columns to the front.

    Args:
        train: DataFrame whose first column is ``customer_ID`` and which
            contains ``S_2``, ``D_63``, ``D_64`` and the nine columns listed in
            ``CATS``. NOTE: this frame is modified in place (date columns are
            added, ``S_2`` is removed, categoricals are re-encoded).
        PAD_CUSTOMER_TO_13_ROWS: when true, customers with fewer than 13 rows
            receive padding rows (numeric columns -1, categorical columns 0).
        targets: optional DataFrame with ``customer_ID`` and ``target``
            columns, left-merged onto the result.
        edit_cid_time: when true, ``customer_ID`` is converted from a hex
            string (last 16 characters) to int64 and ``S_2`` is parsed as a
            datetime; leave false when the input already has those types.

    Returns:
        A new DataFrame sorted by customer then date with columns
        ``customer_ID``, the 11 categorical columns, then all remaining columns.
    """

    # REDUCE STRING COLUMNS
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    if edit_cid_time:
        train['customer_ID'] = train['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
        train.S_2 = pd.to_datetime( train.S_2 )
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']

    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_map).fillna(1).astype('int8')

    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(CATS,OFFSETS):
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')
    CATS += ['D_63','D_64']

    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feature_190'] etc etc etc

    # REDUCE MEMORY DTYPE
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')

    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
    if PAD_CUSTOMER_TO_13_ROWS:
        tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
        # `more` holds one customer id per padding row that has to be created
        more = np.array([],dtype='int64')
        for j in range(1,13):
            i = tmp.loc[tmp==j].index.values
            more = np.concatenate([more,np.repeat(i,13-j)])
        # Existing rows only serve as a dtype/column template: every value is
        # overwritten below. Recycle them (modulo) so the template always has
        # exactly len(more) rows, even when more padding rows are needed than
        # there are rows in `train` (slicing the head would come up short and
        # the customer_ID assignment would fail with a length mismatch).
        template_pos = np.arange(len(more)) % max(len(train), 1)
        df = train.iloc[template_pos].reset_index(drop=True).fillna(0)
        # pad numerical columns with -1; year/month/day also become -1, which
        # makes padding rows sort before a customer's real rows
        df = df * 0 - 1
        df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
        df['customer_ID'] = more
        train = pd.concat([train,df],axis=0,ignore_index=True)

    # ADD TARGETS (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')

    # FILL NAN
    train = train.fillna(-0.5) #this applies to numerical columns

    # SORT BY CUSTOMER THEN DATE
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)

    # REARRANGE COLUMNS WITH 11 CATS FIRST
    # (columns[1:] skips the first column, which is expected to be customer_ID)
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]

    return train

def read_train_data():
    """Read the pre-processed training data and labels from their feather files.

    Logs a progress message and prints the data frame's deep memory usage.

    Returns:
        Tuple ``(data, labels)`` of DataFrames.
    """
    data_frame = pd.read_feather(FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER)
    label_frame = pd.read_feather(FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER)

    utils.pt("Reading raw data")
    data_frame.info(memory_usage="deep")

    return (data_frame, label_frame)

def feature_engineer_full_data_and_save_to_file():
    """Run feature engineering on the full training set and write it to feather.

    Loads data and labels, feature-engineers them with padding and targets
    attached, prints the resulting memory footprint, saves the frame to
    FILENAME_TRAIN_PROCESSED4_FE_DATA_RNN_FEATHER and hands the large frames
    to ``utils.gc_l``.
    """
    raw_data, raw_labels = read_train_data()

    # Feature engineer
    utils.pt("Starting feature engineering")
    engineered = feature_engineer(
        raw_data,
        PAD_CUSTOMER_TO_13_ROWS = True,
        targets = raw_labels,
    )
    utils.pt("Completed feature engineering")

    engineered.info(memory_usage="deep")

    utils.pt("Writing feature engineered data to disk")
    engineered.to_feather(FILENAME_TRAIN_PROCESSED4_FE_DATA_RNN_FEATHER)

    utils.gc_l([raw_data, raw_labels, engineered])
    
# Rebuild the feature-engineered training file only when the flag is enabled.
if RUN_FEATURE_ENGINEERING:
    feature_engineer_full_data_and_save_to_file()

In [None]:
# Switch for the guarded train_gru_models() call at the bottom of this cell.
TRAIN_MODELS = 1
#
# CONFIGS
#
VERBOSE   = 2    # passed as `verbose` to model.fit / model.predict
# NOTE(review): SEED is not referenced by any of the visible cells, so runs
# are not seeded from it — confirm whether it is used elsewhere.
SEED      = 42
NUM_FOLDS = 5    # number of train/val folds iterated by train_gru_models

# NOTE: lrfn's learning-rate schedule has 8 entries, matching EPOCHS.
EPOCHS     = 8
BATCH_SIZE = 512

TARGET_LABEL      = 'target'
CUSTOMER_ID_LABEL = "customer_ID"  # also embedded in the fold file names



def build_gru_model():
    """Build and compile the GRU classifier.

    Input is a (13, 188) sequence per sample. The first 11 features are each
    passed through their own Embedding(10, 4) layer; the remaining features
    are used as-is. The concatenation feeds a 128-unit GRU followed by dense
    layers of 64 and 32 units and a single sigmoid output. The model is
    compiled with Adam (learning rate 0.001) and binary cross-entropy.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    layers = tf.keras.layers

    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    inp = tf.keras.Input(shape=(13,188))

    # one independent embedding table per categorical column
    cat_embeddings = [layers.Embedding(10,4)(inp[:,:,k]) for k in range(11)]
    x = layers.Concatenate()([inp[:,:,11:]] + cat_embeddings)

    # SIMPLE RNN BACKBONE: only the final GRU state is kept
    x = layers.GRU(units=128, return_sequences=False)(x)
    for width in (64, 32):
        x = layers.Dense(width,activation='relu')(x)

    # OUTPUT
    x = layers.Dense(1,activation='sigmoid')(x)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=inp, outputs=x)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    )

    return model

# CUSTOM LEARNING SCHEDULE
def lrfn(epoch):
    """Return the learning rate for a zero-based epoch index.

    The schedule is 1e-3 for five epochs, 1e-4 for two, then 1e-5. Epochs
    beyond the end of the schedule reuse the final rate instead of raising
    IndexError, so EPOCHS can be increased without editing this table.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    return lr[min(epoch, len(lr) - 1)]
# Keras callback that applies lrfn; passed to model.fit in train_gru_models.
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)


def extract_X_Y(FE_data, cids):
    """Select the rows of the given customers and shape them for the model.

    Args:
        FE_data: feature-engineered frame containing ``customer_ID`` and
            ``target`` columns; the reshape requires 13 rows per customer and
            188 feature columns between ``customer_ID`` and the last column
            (presumably ``target`` — verify against the saved frame).
        cids: DataFrame with a ``customer_ID`` column naming the customers
            to keep.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (n_customers, 13, 188) and ``Y``
        holds one target per distinct (customer_ID, target) pair, in row order.
    """
    wanted_ids = cids.customer_ID.values
    keep_mask = FE_data.customer_ID.isin(wanted_ids)

    # reset_index() without drop=True prepends the old index as column 0
    data = FE_data.loc[keep_mask].reset_index()

    label_rows = data[['customer_ID','target']].drop_duplicates().sort_index()
    Y = label_rows.target.values

    # columns 0 ("index") and 1 (customer_ID) are skipped, the last is excluded
    feature_values = data.iloc[:,2:-1].values
    X = feature_values.reshape((-1,13,188))

    return (X,Y)

def extract_val_train_data(fold, train_FE_data):
    """Load one fold's customer-id files and build its train/validation arrays.

    Args:
        fold: fold number used in the ``train_…_fold_{fold}.f`` and
            ``val_…_fold_{fold}.f`` file names.
        train_FE_data: feature-engineered frame passed on to extract_X_Y.

    Returns:
        Tuple ``(X_train, Y_train, X_val, Y_val)``.
    """
    fold_cids = {
        'train': pd.read_feather(f'{PATH_TO_PROCESSED4_DATA}/train_{CUSTOMER_ID_LABEL}_fold_{fold}.f'),
        'val': pd.read_feather(f'{PATH_TO_PROCESSED4_DATA}/val_{CUSTOMER_ID_LABEL}_fold_{fold}.f'),
    }

    X_train, Y_train = extract_X_Y(train_FE_data, fold_cids['train'])
    X_val, Y_val = extract_X_Y(train_FE_data, fold_cids['val'])

    return (X_train, Y_train, X_val, Y_val)

def extract_test_data(train_FE_data):
    """Build the (X, Y) arrays for the customers listed in the test-id file.

    The ids are read from ``test_{CUSTOMER_ID_LABEL}.f`` and looked up in the
    feature-engineered training frame passed in.
    """
    holdout_cids = pd.read_feather(f'{PATH_TO_PROCESSED4_DATA}/test_{CUSTOMER_ID_LABEL}.f')
    X_holdout, Y_holdout = extract_X_Y(train_FE_data, holdout_cids)
    return (X_holdout, Y_holdout)
    
    
def train_gru_models():
    """Train one GRU model per fold, save its weights, and log its scores.

    The feature-engineered frame is read once. The held-out test arrays are
    extracted once up front; for every fold the train/validation arrays are
    extracted, a fresh model is built and fitted with the LR schedule
    callback, the weights are written to ``gru_fold_{fold}.h5`` under
    PATH_TO_MODEL, and ``utils.amex_metric_mod`` is logged for both the
    validation and the held-out test predictions.
    """

    utils.pt('Reading feature engineered data.')
    train_FE_data = pd.read_feather(FILENAME_TRAIN_PROCESSED4_FE_DATA_RNN_FEATHER)

    utils.pt('Extracting test data.')

    (X_test, Y_test) = extract_test_data(train_FE_data)

    utils.pt(f'### Test data shapes   {X_test.shape} , {Y_test.shape}')

    for fold in range(0,NUM_FOLDS):

        utils.pt(f'#### Fold -{fold} ####')

        utils.pt('Extracting train and val data.')
        (X_train, Y_train, X_val, Y_val) = extract_val_train_data(fold, train_FE_data)

        utils.pt(f'### Training data shapes   {X_train.shape} , {Y_train.shape}')
        utils.pt(f'### Validation data shapes {X_val.shape}   , {Y_val.shape}  ')

        utils.pt('Starting model training.')
        # BUILD AND TRAIN MODEL
        # clear_session() drops the previous fold's Keras state first
        K.clear_session()
        model = build_gru_model()
        h = model.fit(X_train,Y_train,
                      validation_data = (X_val,Y_val),
                      batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=VERBOSE,
                      callbacks = [LR])
        utils.pt('Completed model training.')

        utils.pt('Saving model to file.')
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold}.h5')

        # INFER VALID DATA
        utils.pt('Inferring validation data...')
        preds = model.predict(X_val, batch_size=512, verbose=VERBOSE).flatten()
        amm = utils.amex_metric_mod(Y_val, preds)
        utils.pt(f'Fold {fold} CV = {amm}')


        # INFER TEST DATA
        utils.pt('Inferring test data...')
        preds = model.predict(X_test, batch_size=512, verbose=VERBOSE).flatten()
        amm = utils.amex_metric_mod(Y_test, preds)
        # Labelled "Test" (not "CV") so the held-out score cannot be confused
        # with the validation score logged just above.
        utils.pt(f'Fold {fold} Test = {amm}')

        print()

        utils.gc_l([model, X_train, Y_train, X_val, Y_val, preds])

    print()
    utils.pt('*** Completed model training for all folds ***')

# Train all fold models only when the flag at the top of this cell is enabled.
if TRAIN_MODELS :
    train_gru_models()

2022-08-29 17:34:33.773624 : Reading feature engineered data.
2022-08-29 17:34:35.990073 : Extracting test data.
2022-08-29 17:34:36.330665 : ### Test data shapes   (22946, 13, 188) , (22946,)
2022-08-29 17:34:36.330736 : #### Fold -0 ####
2022-08-29 17:34:36.330750 : Extracting train and val data.
2022-08-29 17:34:41.595207 : ### Training data shapes   (348773, 13, 188) , (348773,)
2022-08-29 17:34:41.595355 : ### Validation data shapes (87194, 13, 188)   , (87194,)  
2022-08-29 17:34:41.595368 : Starting model training.
Epoch 1/8
682/682 - 10s - loss: 0.2443 - val_loss: 0.2274 - lr: 0.0010 - 10s/epoch - 14ms/step
Epoch 2/8
682/682 - 6s - loss: 0.2291 - val_loss: 0.2234 - lr: 0.0010 - 6s/epoch - 8ms/step
Epoch 3/8
682/682 - 5s - loss: 0.2263 - val_loss: 0.2209 - lr: 0.0010 - 5s/epoch - 8ms/step
Epoch 4/8
682/682 - 6s - loss: 0.2235 - val_loss: 0.2219 - lr: 0.0010 - 6s/epoch - 8ms/step
Epoch 5/8
682/682 - 6s - loss: 0.2210 - val_loss: 0.2200 - lr: 0.0010 - 6s/epoch - 8ms/step
Epoch 6/8

In [30]:
def get_rows(customers, train, NUM_FOLDS = 10, verbose = ''):
    """Count how many rows of ``train`` belong to each consecutive customer chunk.

    ``customers`` is cut into ``NUM_FOLDS`` consecutive slices of
    ``len(customers)//NUM_FOLDS`` ids each; the last slice also takes any
    remainder.

    Args:
        customers: sliceable sequence of customer ids.
        train: DataFrame with a ``customer_ID`` column.
        NUM_FOLDS: number of chunks.
        verbose: label used in the progress messages; '' disables them.

    Returns:
        List of ``NUM_FOLDS`` row counts, one per chunk.
    """
    chunk = len(customers)//NUM_FOLDS
    talkative = verbose != ''

    if talkative:
        utils.pt(f'We will split {verbose} data into {NUM_FOLDS} separate folds.')
        utils.pt(f'There will be {chunk} customers in each fold (except the last fold).')
        utils.pt('Below are number of rows in each fold:')

    final_fold = NUM_FOLDS - 1
    rows = []
    for k in range(NUM_FOLDS):
        begin = k*chunk
        # an open-ended slice lets the final fold absorb the remainder
        finish = None if k == final_fold else begin + chunk
        members = customers[begin:finish]
        rows.append(int(train.customer_ID.isin(members).sum()))

    if talkative:
        utils.pt( str(rows) )
    return rows

def get_column_names_cids(read_customer_hashes):
    """Return the test CSV's column names and its customer ids as int64 hashes.

    Args:
        read_customer_hashes: when truthy, the customer-id column is read from
            the cached parquet file; otherwise it is read from the test CSV
            and the parquet cache is (re)written first.

    Returns:
        Tuple ``(T_COLS, cids, test_cids)``: the CSV column index, the
        de-duplicated customer hashes as a flat array, and the per-row frame
        of customer hashes.
    """
    # GET TEST COLUMN NAMES (a single data row is enough to get the header)
    header_sample = pd.read_csv(FILENAME_TEST_DATA_CSV, nrows=1)
    T_COLS = header_sample.columns
    utils.pt(f'There are {len(T_COLS)} test dataframe columns')

    if not read_customer_hashes:
        test_cids = pd.read_csv(FILENAME_TEST_DATA_CSV, usecols=['customer_ID'])
        test_cids.to_parquet(FILENAME_TEST_CUSTOMER_HASHES)
    else:
        test_cids = pd.read_parquet(FILENAME_TEST_CUSTOMER_HASHES)

    # keep the last 16 hex characters of each id and parse them as an integer
    hex_tail = test_cids['customer_ID'].str[-16:]
    test_cids['customer_ID'] = hex_tail.apply(lambda x:int(x,16)).astype('int64')

    unique_ids = test_cids.drop_duplicates().sort_index()
    cids = unique_ids.values.flatten()

    utils.pt(f'There are {len(cids)} unique customers in test.')

    return (T_COLS, cids, test_cids)

def break_up_test_data(T_COLS, NUM_FILES, rows):
    """Split the test CSV into feature-engineered .npy chunks on disk.

    For each of the ``NUM_FILES`` chunks, ``rows[k]`` CSV rows are read,
    feature-engineered, reshaped to (-1, 13, 188) and saved as float32 to
    ``test_data_{k+1}`` under PATH_TO_PROCESSED2_DATA. The customer hashes of
    all chunks, in processing order, are saved to FILENAME_TEST_HASH_DATA.

    Args:
        T_COLS: column names to assign to the header-less chunk reads.
        NUM_FILES: number of chunks to write.
        rows: integer row count of every chunk, in file order.
    """
    # SAVE TEST CUSTOMERS INDEX (seeded with an empty int64 array)
    hash_parts = [np.array([],dtype='int64')]

    # number of data rows consumed by the chunks already written
    rows_done = 0

    # CREATE PROCESSED TEST FILES AND SAVE TO DISK
    for k in range(NUM_FILES):
        file_no = k + 1

        # READ CHUNK OF TEST CSV FILE
        skip = int(rows_done + 1) #the plus one is for skipping header
        test = pd.read_csv(
            FILENAME_TEST_DATA_CSV,
            nrows=rows[k],
            skiprows=skip,
            header=None,
            names=T_COLS,
        )
        rows_done = rows_done + rows[k]

        # FEATURE ENGINEER DATAFRAME
        test = feature_engineer(test, targets = None, edit_cid_time = True)

        # SAVE TEST CUSTOMERS INDEX
        chunk_ids = test[['customer_ID']].drop_duplicates().sort_index()
        hash_parts.append(chunk_ids.values.flatten())

        # SAVE FILES
        utils.pt(f'Test_File_{k+1} has {test.customer_ID.nunique()} customers and shape {test.shape}')
        # column 0 (customer_ID) is dropped before reshaping
        data = test.iloc[:,1:].values.reshape((-1,13,188))
        np.save(f'{PATH_TO_PROCESSED2_DATA}test_data_{file_no}',data.astype('float32'))

        # CLEAN MEMORY
        utils.gc_l([test, data])

    # SAVE CUSTOMER INDEX OF ALL TEST FILES
    test_customer_hashes = np.concatenate(hash_parts)
    np.save(FILENAME_TEST_HASH_DATA, test_customer_hashes)

In [31]:
# Set to 1 to (re)build the chunked test .npy files; 0 skips the block below.
PROCESS_TEST_DATA = 0
# Number of test chunks; also read by infer_test_data when loading them back.
NUM_TEST_FILES = 20
# 1: load customer ids from the cached parquet file; 0: re-read them from the CSV.
READ_CUSTOMER_HASHES = 1

if PROCESS_TEST_DATA:
    (T_COLS,cids, test_cids) = get_column_names_cids(READ_CUSTOMER_HASHES)
    
    # per-chunk row counts, so each chunk can be read from the CSV by offset
    rows = get_rows(cids, test_cids, NUM_FOLDS = NUM_TEST_FILES, verbose = 'test')
    
    break_up_test_data(T_COLS, NUM_TEST_FILES, rows)

2022-08-24 13:53:07.632611 : There are 190 test dataframe columns
2022-08-24 13:53:15.475086 : There are 924621 unique customers in test.
2022-08-24 13:53:15.475487 : We will split test data into 20 separate folds.
2022-08-24 13:53:15.475511 : There will be 46231 customers in each fold (except the last fold).
2022-08-24 13:53:15.475523 : Below are number of rows in each fold:
2022-08-24 13:53:16.997456 : [567933, 568482, 569369, 567886, 567539, 568041, 568138, 567596, 568543, 567539, 568421, 568745, 568279, 568333, 568327, 568901, 568300, 568001, 567372, 568017]
2022-08-24 13:53:40.155512 : Test_File_1 has 46231 customers and shape (601003, 189)
2022-08-24 13:54:11.279151 : Test_File_2 has 46231 customers and shape (601003, 189)
2022-08-24 13:54:47.048248 : Test_File_3 has 46231 customers and shape (601003, 189)
2022-08-24 13:55:24.140497 : Test_File_4 has 46231 customers and shape (601003, 189)
2022-08-24 13:56:05.133124 : Test_File_5 has 46231 customers and shape (601003, 189)
2022-0

In [34]:
# Switch for the guarded infer_test_data() call at the bottom of this cell.
INFER_TEST_DATA = 1

def infer_test_data():
    """Predict every test chunk with the fold ensemble and write the submission.

    The sample submission is re-ordered to match the customer order of the
    saved test chunks. For each of the NUM_TEST_FILES chunks the predictions
    of all NUM_FOLDS saved fold models are averaged and written into the
    matching slice of the ``prediction`` column. The result is saved to
    FILENAME_GRU_SUBMISSION and its shape and head are displayed.
    """
    # INFER TEST DATA
    start = 0; end = 0
    sub = pd.read_csv(FILENAME_SAMPLE_SUBMISSION_CSV)

    # REARRANGE SUB ROWS TO MATCH PROCESSED TEST FILES
    sub['hash'] = sub['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
    test_hash_index = np.load(f'{FILENAME_TEST_HASH_DATA}.npy')
    sub = sub.set_index('hash').loc[test_hash_index].reset_index(drop=True)

    for k in range(NUM_TEST_FILES):
        # BUILD MODEL
        K.clear_session()
        model = build_gru_model()

        # LOAD TEST DATA
        utils.pt(f'Inferring Test_File_{k+1}')
        X_test = np.load(f'{PATH_TO_PROCESSED2_DATA}test_data_{k+1}.npy')
        end = start + X_test.shape[0]

        # INFER ALL FOLD MODELS
        # The ensemble size follows NUM_FOLDS (the number of models trained)
        # rather than a hard-coded 5, so the two cannot drift apart.
        model.load_weights(f'{PATH_TO_MODEL}gru_fold_0.h5')
        p = model.predict(X_test, batch_size=512, verbose=0).flatten()
        for j in range(1,NUM_FOLDS):
            model.load_weights(f'{PATH_TO_MODEL}gru_fold_{j}.h5')
            p += model.predict(X_test, batch_size=512, verbose=0).flatten()
        p /= float(NUM_FOLDS)

        # SAVE TEST PREDICTIONS
        # .loc slicing is end-inclusive, hence end-1
        sub.loc[start:end-1,'prediction'] = p
        start = end

        # CLEAN MEMORY
        utils.gc_l([ model, X_test, p])

    sub.to_csv(FILENAME_GRU_SUBMISSION,index=False)
    print('Submission file shape is', sub.shape )
    display( sub.head() )

    
# Run test inference only when the flag at the top of this cell is enabled.
if INFER_TEST_DATA:
    infer_test_data()

2022-08-24 14:18:31.977190 : Inferring Test_File_1
2022-08-24 14:18:38.203911 : Inferring Test_File_2
2022-08-24 14:18:42.979605 : Inferring Test_File_3
2022-08-24 14:18:47.433188 : Inferring Test_File_4
2022-08-24 14:18:52.029295 : Inferring Test_File_5
2022-08-24 14:18:56.513967 : Inferring Test_File_6
2022-08-24 14:19:01.364393 : Inferring Test_File_7
2022-08-24 14:19:05.891329 : Inferring Test_File_8
2022-08-24 14:19:10.600180 : Inferring Test_File_9
2022-08-24 14:19:15.565886 : Inferring Test_File_10
2022-08-24 14:19:20.203828 : Inferring Test_File_11
2022-08-24 14:19:24.918371 : Inferring Test_File_12
2022-08-24 14:19:29.641204 : Inferring Test_File_13
2022-08-24 14:19:34.306264 : Inferring Test_File_14
2022-08-24 14:19:39.226580 : Inferring Test_File_15
2022-08-24 14:19:44.010206 : Inferring Test_File_16
2022-08-24 14:19:48.542276 : Inferring Test_File_17
2022-08-24 14:19:53.236500 : Inferring Test_File_18
2022-08-24 14:19:57.942190 : Inferring Test_File_19
2022-08-24 14:20:02.5

Unnamed: 0,customer_ID,prediction
0,038be0571bd6b3776cb8512731968f4de302c811030124...,0.003534
1,0074a0233ef766b52884608cc8cf9098f59d885b5d59fc...,0.000186
2,060b8b7f30f795a0e93995d45b29461ffa6ece0eeb5c3d...,0.103047
3,03a1d125bdd776000bf0b28238d0bea240ad581d332e70...,0.129862
4,0290f245dd35ba899af52316ccc62b2627e7ae18cd76a2...,0.315075
