In [1]:
import utils

utils.widen_ipython_window()

import pandas as pd
import datetime
import numpy as np
import gc
import tensorflow as tf
from keras import backend as K 



In [2]:
#
# Paths
#

# Root of the project tree; every other location below is derived from it.
MAIN_PATH = "/home/mahesh/Desktop/ML/kaggle/amex/"

# --- Data directories ---
PATH_TO_DATA = f"{MAIN_PATH}data/"
PATH_TO_PROCESSED_DATA = f"{PATH_TO_DATA}processed/"
PATH_TO_PROCESSED2_DATA = f"{PATH_TO_DATA}processed2/"
PATH_TO_PROCESSED4_DATA = f"{PATH_TO_DATA}processed4/"
PATH_TO_GRU_NAN_EMBEDDINGS_DATA = f"{PATH_TO_DATA}gru_nan_embeddings_full/"
PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA = f"{PATH_TO_DATA}rnn_nn/"

# --- Original competition CSV files ---
FILENAME_TRAIN_DATA_CSV = f"{PATH_TO_DATA}orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV = f"{PATH_TO_DATA}orig/train_labels.csv"
FILENAME_TEST_DATA_CSV = f"{PATH_TO_DATA}orig/test_data.csv"
FILENAME_SAMPLE_SUBMISSION_CSV = f"{PATH_TO_DATA}orig/sample_submission.csv"

# --- Files under processed/ and processed2/ ---
FILENAME_TRAIN_DATA_FEATHER = f"{PATH_TO_PROCESSED_DATA}train_data.f"
FILENAME_TRAIN_PROCESSED2_DATA_FEATHER = f"{PATH_TO_PROCESSED2_DATA}train_data.f"
FILENAME_TRAIN_PROCESSED2_LABELS_FEATHER = f"{PATH_TO_PROCESSED2_DATA}train_labels.f"
FILENAME_TRAIN_PROCESSED2_DATA_CAT_NOCHANGE_FEATHER = f"{PATH_TO_PROCESSED2_DATA}train_data_cat_nochange.f"
FILENAME_TEST_CUSTOMER_HASHES = f"{PATH_TO_PROCESSED2_DATA}test_customer_hashes_data.pq"
FILENAME_TEST_HASH_DATA = f"{PATH_TO_PROCESSED2_DATA}test_hashes_data"
FILENAME_GRU_SUBMISSION = f"{PATH_TO_PROCESSED2_DATA}submission_gru.csv"

# --- Files under gru_nan_embeddings_full/ and processed4/ ---
FILENAME_TRAIN_PROCESSED_GRU_NAN_EMBEDDINGS_FEATHER = f"{PATH_TO_GRU_NAN_EMBEDDINGS_DATA}train_data.f"
FILENAME_TRAIN_PROCESSED4_FE_DATA_RNN_FEATHER = f"{PATH_TO_PROCESSED4_DATA}train_FE_data_RNN.f"

# --- Inputs read by train_gru_models(): per-customer and per-statement frames ---
FILENAME_TRAIN_RNN_NN_DATA_FEATHER = f"{PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA}train_nn_data.f"
FILENAME_TRAIN_RNN_RNN_DATA_FEATHER = f"{PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA}train_rnn_data.f"

# --- Models: weights are written next to the rnn_nn/ data files ---
PATH_TO_MODEL = PATH_TO_RNN_NAN_EMBEDDINGS_NN_STATS_DATA

# Base names of the categorical columns; get_cat_num_features_count() treats
# any column whose name contains one of these as categorical.
CAT_FEATURES = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

In [None]:
# Truthy value => train_gru_models() is invoked at the bottom of this cell.
TRAIN_MODELS = 1

#
# CONFIGS
#

# Keras verbosity for fit/predict, random seed, and number of CV folds.
VERBOSE, SEED, NUM_FOLDS = 2, 42, 5

# Training length and mini-batch size handed to model.fit.
EPOCHS, BATCH_SIZE = 8, 512

# Column names for the label and the customer key.
TARGET_LABEL, CUSTOMER_ID_LABEL = 'target', "customer_ID"

# Feature-count bookkeeping: categorical columns, numeric columns without and
# with a companion column (presumably the "_exists" flag -- TODO confirm).
CATS, NUMS_NO_E, NUMS_WITH_E = 11, 0, 354
NUM_FEATURES = sum((CATS, NUMS_NO_E, NUMS_WITH_E))

# Earlier setting, kept for reference:
#NUM_FEATURES = 226



def build_gru_model(seq_count, nonseq_count, seq_len=13):
    """Build and compile the two-input GRU binary classifier.

    Parameters
    ----------
    seq_count : tuple (cat_count, num_count)
        Number of categorical and numeric columns in the sequence input.
        The model assumes the categorical columns are the FIRST ``cat_count``
        entries along the last axis, followed by the numeric columns.
    nonseq_count : tuple (cat_count, num_count)
        Number of categorical and numeric columns in the flat (per-customer)
        input. Only their sum is used; these columns are fed to the dense head
        unchanged.
    seq_len : int, default 13
        Number of time steps per sequence.

    Returns
    -------
    tf.keras.Model
        Model taking ``[seq_input, nonseq_input]`` and producing one sigmoid
        output, compiled with binary cross-entropy and Adam (lr=1e-3).
    """
    (seq_cat_count, seq_num_count) = seq_count
    (nonseq_cat_count, nonseq_num_count) = nonseq_count

    seq_total_features    = seq_cat_count + seq_num_count
    nonseq_total_features = nonseq_cat_count + nonseq_num_count

    # INPUTS - sequence tensor is (batch, seq_len, features); flat tensor is (batch, features).
    seq_inp = tf.keras.Input(shape=(seq_len, seq_total_features))

    # The shape must be a tuple: "(n)" is just the int n, which newer Keras
    # versions refuse to interpret as a shape.
    nonseq_inp = tf.keras.Input(shape=(nonseq_total_features,))

    # Categorical embeddings: one independent Embedding per leading column.
    # NOTE(review): Embedding(10, 4) requires every category code to lie in
    # [0, 10) -- confirm against the encoding used when the data was built.
    cat_embeddings = []
    for k in range(seq_cat_count):
        emb = tf.keras.layers.Embedding(10, 4)
        cat_embeddings.append(emb(seq_inp[:, :, k]))

    # NaN embeddings: the remaining columns are consumed two at a time and each
    # pair is squeezed to one value by its own Dense(1, relu) layer.
    # Presumably each pair is (value, "<name>_exists" flag), as in the column
    # listing printed by extract_X_Y -- TODO confirm.
    num_embeddings = []
    for k in range(seq_cat_count, seq_total_features, 2):
        pair_dense = tf.keras.layers.Dense(1, activation='relu')
        num_embeddings.append(pair_dense(seq_inp[:, :, k:k+2]))

    x1 = tf.keras.layers.Concatenate()(cat_embeddings + num_embeddings)

    # SIMPLE RNN BACKBONE: only the final GRU state is kept, then joined with
    # the flat per-customer features before the dense head.
    x1 = tf.keras.layers.GRU(units=256, return_sequences=False)(x1)
    x = tf.keras.layers.Concatenate()([x1, nonseq_inp])
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)

    # OUTPUT
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=[seq_inp, nonseq_inp], outputs=x)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss, optimizer=opt)

    return model

# CUSTOM LEARNING SCHEDULE
import math
# Number of epochs over which lrfn() spreads its learning-rate table;
# tied to EPOCHS, the value passed to model.fit.
TOTAL_EPOCHS = EPOCHS

# CUSTOM LEARNING SCHEDULE
def lrfn(epoch):
    """Return the learning rate for a (0-based) epoch index.

    An 8-entry table (5 x 1e-3, 2 x 1e-4, 1 x 1e-5) is stretched over
    TOTAL_EPOCHS: epoch ``e`` maps to entry ``floor(8 * e / TOTAL_EPOCHS)``.

    The index is clamped to the table bounds so that an epoch at or beyond
    TOTAL_EPOCHS (e.g. fit() called with more epochs than TOTAL_EPOCHS)
    gets the final learning rate instead of raising IndexError.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    i = math.floor(len(lr) * (epoch/TOTAL_EPOCHS))
    # Clamp into [0, len(lr) - 1]; for 0 <= epoch < TOTAL_EPOCHS this is a no-op.
    i = max(0, min(i, len(lr) - 1))
    return lr[i]
# Callback wrapping lrfn; train_gru_models() passes it to model.fit.
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)


def extract_X_Y(FE_data, cids, rnn, seq_len=13):
    """Select the rows of ``FE_data`` belonging to ``cids`` and split them into X / Y.

    Parameters
    ----------
    FE_data : pandas.DataFrame
        Frame with a ``customer_ID`` column; when ``rnn`` is true it must also
        contain a ``target`` column.
    cids : pandas.DataFrame
        Frame whose ``customer_ID`` column lists the customers to keep.
    rnn : bool
        True  -> X is a numpy array reshaped to (-1, seq_len, n_features) and
                 Y holds one target per distinct (customer_ID, target) pair.
        False -> X is the DataFrame of feature columns and Y is None.
    seq_len : int, default 13
        Number of consecutive rows forming one sequence in the rnn case.

    Returns
    -------
    tuple (X, Y)

    Notes
    -----
    Prints ``DataFrame.info`` for the selected rows and logs the feature names.
    The rnn reshape assumes every customer contributes exactly ``seq_len``
    contiguous rows -- TODO confirm against the data-preparation step.
    """
    data = FE_data.loc[(FE_data.customer_ID.isin(cids.customer_ID.values))]
    data = data.reset_index(drop=True)

    data.info(memory_usage='deep')

    non_features = ['customer_ID','target']
    features = [col for col in data if col not in non_features]
    utils.pt(str(features))
    if rnn:
        num_features = len(features)
        # One label per customer, in first-appearance order.
        Y = data[non_features].drop_duplicates().sort_index().target.values
        utils.pt('Reshaping data')
        # Select by name rather than by position (iloc[:, 1:-1]) so the tensor
        # always matches `features`, whatever the column order of the frame.
        X = data[features].values.reshape((-1, seq_len, num_features))
    else:
        X = data[features]
        Y = None

    return (X,Y)

def extract_val_train_data(fold, train_FE_data, rnn):
    """Return (X_train, Y_train, X_val, Y_val) for one cross-validation fold.

    The customer ids of the fold's training and validation splits are read
    from feather files under PATH_TO_PROCESSED4_DATA, then each split is cut
    out of ``train_FE_data`` with extract_X_Y (``rnn`` is forwarded as-is).
    """
    train_ids_file = f'{PATH_TO_PROCESSED4_DATA}/train_{CUSTOMER_ID_LABEL}_fold_{fold}.f'
    val_ids_file = f'{PATH_TO_PROCESSED4_DATA}/val_{CUSTOMER_ID_LABEL}_fold_{fold}.f'

    # Load both id lists first, then extract train before validation.
    fold_train_ids = pd.read_feather(train_ids_file)
    fold_val_ids = pd.read_feather(val_ids_file)

    train_xy = extract_X_Y(train_FE_data, fold_train_ids, rnn)
    val_xy = extract_X_Y(train_FE_data, fold_val_ids, rnn)

    return (train_xy[0], train_xy[1], val_xy[0], val_xy[1])

def extract_test_data(train_FE_data, rnn):
    """Return the (X, Y) pair for the held-out test customers.

    The customer ids are read from the ``test_<CUSTOMER_ID_LABEL>.f`` feather
    file under PATH_TO_PROCESSED4_DATA and handed to extract_X_Y.
    """
    test_ids_file = f'{PATH_TO_PROCESSED4_DATA}/test_{CUSTOMER_ID_LABEL}.f'
    holdout_ids = pd.read_feather(test_ids_file)
    holdout_xy = extract_X_Y(train_FE_data, holdout_ids, rnn)
    return holdout_xy


def get_cat_num_features_count(cols, cat_features=None):
    """Count categorical and numeric feature columns in ``cols``.

    Parameters
    ----------
    cols : iterable of str
        Column names. 'customer_ID' and 'target' are skipped.
    cat_features : iterable of str, optional
        Base names of categorical features. Defaults to the module-level
        CAT_FEATURES list.

    Returns
    -------
    tuple (cat_count, num_count)

    Notes
    -----
    Matching is by substring: a column counts as categorical when ANY base
    name occurs anywhere inside it, so derived columns such as "B_30_last"
    are counted as categorical too.
    """
    if cat_features is None:
        cat_features = CAT_FEATURES

    ignored = ('customer_ID', 'target')
    cat_count = 0
    num_count = 0
    for c in cols:
        if c in ignored:
            continue
        if any(base_name in c for base_name in cat_features):
            cat_count += 1
        else:
            num_count += 1

    return (cat_count, num_count)
    
def train_gru_models():
    """Train, save and validate one GRU model per cross-validation fold.

    For each fold in ``range(NUM_FOLDS)`` this:
      1. reads the sequence and non-sequence feature frames from feather,
      2. counts categorical / numeric columns in each frame,
      3. extracts the fold's train and validation splits
         (labels come from the sequence frame only),
      4. builds a fresh model with build_gru_model and fits it for EPOCHS
         epochs with the LR scheduler callback,
      5. saves the weights to ``{PATH_TO_MODEL}gru_fold_{fold}.h5``,
      6. predicts on the validation split and logs utils.amex_metric_mod.

    No test-set inference is performed here.

    NOTE(review): the sequence and non-sequence inputs are paired row by row,
    which assumes both feather files list customers in the same order --
    confirm against the data-preparation step.
    """
    utils.pt(f'Reading feature engineered data.')

    for fold in range(0,NUM_FOLDS):

        # Frames are re-read on every fold; they are handed to utils.gc_l at
        # the end of the iteration together with everything derived from them.
        train_seq_data    = pd.read_feather(FILENAME_TRAIN_RNN_RNN_DATA_FEATHER)
        train_nonseq_data = pd.read_feather(FILENAME_TRAIN_RNN_NN_DATA_FEATHER)

        (seq_cat_count   , seq_num_count)    = get_cat_num_features_count(list(train_seq_data.columns))
        (nonseq_cat_count, nonseq_num_count) = get_cat_num_features_count(list(train_nonseq_data.columns))

        utils.pt(f'#### Fold : {fold} ####')

        utils.pt(f'Extracting train and val data.')
        (X_train_seq   , Y_train  , X_val_seq   , Y_val    ) = extract_val_train_data(fold, train_seq_data   , rnn = True)
        # The non-sequence extraction returns Y=None; labels come from the call above.
        (X_train_nonseq, Y_ignore1, X_val_nonseq, Y_ignore2) = extract_val_train_data(fold, train_nonseq_data, rnn = False)

        utils.pt(f'### Training data shapes:   {X_train_seq.shape} + {X_train_nonseq.shape} , {Y_train.shape}')
        utils.pt(f'### Validation data shapes: {X_val_seq.shape} + {X_val_nonseq.shape} , {Y_val.shape}  ')

        utils.pt(f'Starting model training.')
        # BUILD AND TRAIN MODEL
        K.clear_session()
        model = build_gru_model((seq_cat_count, seq_num_count), (nonseq_cat_count , nonseq_num_count))
        h = model.fit([X_train_seq, X_train_nonseq],Y_train,
                      validation_data = ([X_val_seq,X_val_nonseq],Y_val),
                      batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=VERBOSE,
                      callbacks = [LR])
        utils.pt(f'Completed model training.')

        utils.pt(f'Saving model to file.')
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold}.h5')

        # INFER VALID DATA
        utils.pt('Inferring validation data...')
        preds = model.predict([X_val_seq, X_val_nonseq], batch_size=BATCH_SIZE, verbose=VERBOSE).flatten()
        amm = utils.amex_metric_mod(Y_val, preds)
        utils.pt(f'Fold {fold} CV = {amm}')

        print()

        # Release this fold's objects. The original list named `train_FE_data`,
        # which is never defined in this function (NameError on a fresh
        # kernel); the frames actually loaded above are passed instead.
        utils.gc_l([model, X_train_seq, X_train_nonseq, Y_train, X_val_seq, X_val_nonseq, Y_val, preds,
                    train_seq_data, train_nonseq_data, amm, h])
        K.clear_session()

    print()
    utils.pt(f'*** Completed model training for all folds ***')

# Run the full cross-validated training only when the TRAIN_MODELS switch is truthy.
if TRAIN_MODELS :
    train_gru_models()

2022-09-07 13:43:15.316849 : Reading feature engineered data.
2022-09-07 13:43:16.964454 : #### Fold : 0 ####
2022-09-07 13:43:16.964527 : Extracting train and val data.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4534049 entries, 0 to 4534048
Columns: 248 entries, customer_ID to target
dtypes: float32(121), int32(1), int64(1), int8(125)
memory usage: 2.6 GB
2022-09-07 13:43:18.643738 : ['B_30', 'B_38', 'P_2', 'P_2_exists', 'D_39', 'D_39_exists', 'B_1', 'B_1_exists', 'B_2', 'B_2_exists', 'R_1', 'R_1_exists', 'S_3', 'S_3_exists', 'D_41', 'D_41_exists', 'B_3', 'B_3_exists', 'D_42', 'D_42_exists', 'D_43', 'D_43_exists', 'D_44', 'D_44_exists', 'B_4', 'B_4_exists', 'D_45', 'D_45_exists', 'B_5', 'B_5_exists', 'R_2', 'R_2_exists', 'D_46', 'D_46_exists', 'D_47', 'D_47_exists', 'D_48', 'D_48_exists', 'D_49', 'D_49_exists', 'B_7', 'B_7_exists', 'B_8', 'B_8_exists', 'D_50', 'D_50_exists', 'D_51', 'D_51_exists', 'B_9', 'B_9_exists', 'R_3', 'R_3_exists', 'D_52', 'D_52_exists', 'P_3', 'P_3_exi

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87194 entries, 0 to 87193
Columns: 536 entries, customer_ID to D_126_last_round2
dtypes: float32(426), float64(84), int32(3), int64(21), int8(2)
memory usage: 212.7 MB
2022-09-07 13:43:23.143541 : ['B_6_mean', 'B_6_std', 'B_6_min', 'B_6_max', 'B_6_last', 'S_6_mean', 'S_6_std', 'S_6_min', 'S_6_max', 'S_6_last', 'B_13_mean', 'B_13_std', 'B_13_min', 'B_13_max', 'B_13_last', 'D_58_mean', 'D_58_std', 'D_58_min', 'D_58_max', 'D_58_last', 'D_60_mean', 'D_60_std', 'D_60_min', 'D_60_max', 'D_60_last', 'B_15_mean', 'B_15_std', 'B_15_min', 'B_15_max', 'B_15_last', 'B_16_mean', 'B_16_std', 'B_16_min', 'B_16_max', 'B_16_last', 'B_19_mean', 'B_19_std', 'B_19_min', 'B_19_max', 'B_19_last', 'D_69_mean', 'D_69_std', 'D_69_min', 'D_69_max', 'D_69_last', 'D_71_mean', 'D_71_std', 'D_71_min', 'D_71_max', 'D_71_last', 'D_73_mean', 'D_73_std', 'D_73_min', 'D_73_max', 'D_73_last', 'P_4_mean', 'P_4_std', 'P_4_min', 'P_4_max', 'P_4_last', 'D_76_mean', 'D_76_std'