In [1]:
import pandas as pd
import datetime
import numpy as np
import gc
import tensorflow as tf
from keras import backend as K 
# GPU LIBRARIES, are these useful? Do they result in any meaningful speedup? I have replaced cupy with np and cudf with pd in the below code.
#import cupy, cudf 

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Paths

MAIN_PATH = "/home/mahesh/Desktop/ML/kaggle/amex/"

# Data
PATH_TO_DATA                = MAIN_PATH + "data/"
PATH_TO_PROCESSED2_DATA     = PATH_TO_DATA + "processed2/"
#FILENAME_TRAIN_DATA_CSV     = PATH_TO_DATA + "orig/train_data.csv"
FILENAME_TRAIN_LABELS_CSV   = PATH_TO_DATA + "orig/train_labels.csv"
#FILENAME_TRAIN_DATA_FEATHER = PATH_TO_DATA + "orig/train_data.f"     

# Processed data
#FILENAME_CID_MAP                      = PATH_TO_PROCESSED_DATA + "cid_map.csv"
FILENAME_TRAIN_PROCESSED_DATA_FEATHER = PATH_TO_PROCESSED2_DATA + "train_data.f"

# Models
PATH_TO_MODEL   = MAIN_PATH + "models/"

In [3]:
print(datetime.datetime.now())
# Read from Feather
train_df = pd.read_feather(FILENAME_TRAIN_PROCESSED_DATA_FEATHER)
# Read from CSV
#train_df = pd.read_csv(FILENAME_TRAIN_DATA_CSV)
print(datetime.datetime.now())
train_df.info(memory_usage="deep")

2022-07-30 18:57:55.338327
2022-07-30 18:57:57.337160
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float32(176), int32(1), int64(1)
memory usage: 3.8 GB


In [4]:
targets =  pd.read_csv(FILENAME_TRAIN_LABELS_CSV)
#targets['customer_ID'] = targets['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')

In [5]:
# All features
T_COLS = set(train_df.columns)
# Features to be dropped that are irrelevant signals
id_time_cols = set(['customer_ID', 'S_2'])
# Categorical features
cat_cols = set(['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'])
# Numerical features
num_cols = T_COLS - id_time_cols - cat_cols

#print(len(num_cols))

#### Flow
    1. Read the low-memory representation DF from Feather Format.  
    2. Do a stratified split of data into train+val and test data. Which feature to do the stratified split on?  
    3. Create feature engineering pipeline.  
    4. Use decision tree to create a base model. 
    5. Idenitify important features, drop unimportant features, drop features with large number of NaNs?  

#### Feature Engineering pipeline:

    1. Make each customer have 13 months(?) of data, add zeroed out months?  
    2. Split features into numerical and categorical.  
    3. Numerical features:  
        1. Handle NaNs : Fill with Median, Mean?
        2. Feature scaling (only useful for NN models?) : Numerical data should be standardized to aid easier training.  
    4. Categorical data :  
        1. Handle NaNs : Fill with special value.  
        2. One hot encoding ? Embeddings?  

In [37]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self._attribute_names = attribute_names
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X[self._attribute_names].values

class DataFrameCreator(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self._attribute_names = attribute_names
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X[self._attribute_names].values    

class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, create_t, debug = False):
        self._col_transformers = []
        self._create_t = create_t
        self._debug = debug
        
    def fit(self,X,y=None):
        if self._debug:
            print(datetime.datetime.now())
        for col in range(X.shape[1]):
            col_data_npa = (X[:,col]).reshape(-1,1)
            t = self._create_t()
            self._col_transformers.append(t.fit(col_data_npa))
    
        if self._debug:
            print(datetime.datetime.now())

        return self
    
    def transform(self,X):
        ctl = []
        if self._debug:
            print(datetime.datetime.now())
            
        for col in range(X.shape[1]):
            col_data_npa = (X[:,col]).reshape(-1,1)
            ctl.append(self._col_transformers.transform(col_data_npa))

        res = np.concatenate(ctl,axis=1)
        
        if self._debug:
            print(datetime.datetime.now())

        return res

    
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


def apply_transformer(X_df, cols, create_t, debug = False):
    col_transformers = []
    ctl = []
    if debug:
        print(datetime.datetime.now())
    for c in cols:
        col_data_npa = X_df[[c]].values
        #t = OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1)
        t = create_t()
        col_transformers.append(t)
        col_data_trans_npa = t.fit_transform(col_data_npa)
        ctl.append(cat_data_trans_npa)
    
    if debug:
        print(datetime.datetime.now())

    cols_transformed_npa = np.concatenate(ctl,axis=1)
    
    if debug:
        print(datetime.datetime.now())
        
    return (cols_transformed_npa, col_transformers)

def create_simple_imputer_strategy_median():
    return SimpleImputer(strategy="median")


def create_ordinal_encoder():
    return OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1)


num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_cols)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

from sklearn.preprocessing import OrdinalEncoder
    
# cat_pipeline = Pipeline([
#     ('selector', DataFrameSelector(cat_cols)),
#     ('encoder', OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1))
# ])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_cols)),
    ('encoder', CustomTransformer(create_ordinal_encoder, debug = True))
])


In [39]:
#cat_pipeline.fit_transform(train_df)

t = apply_transformer(train_df, cat_cols, create_ordinal_encoder, True)

2022-08-01 16:59:51.403635
2022-08-01 16:59:55.818751
2022-08-01 16:59:55.878931


In [None]:
num_data_npa = train_df[(list(num_cols))[0:10]].values
#num_data_npa = train_df[num_cols].values
imputer = SimpleImputer(strategy="median",copy = False)
#print(num_data_npa[1])
t = imputer.fit_transform(num_data_npa)
imputer.statistics_

In [None]:
from sklearn.preprocessing import OrdinalEncoder
le = OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1)
#print(cat_data_npa[0:9])
#le.fit_transform(cat_data_npa[0:9])

In [11]:
from sklearn.compose import ColumnTransformer

col_transform = []
for i in range(0, 2):
    col_transform.append((f'categorical_transformer_{i}',OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1),slice(i,i+1)))

print(col_transform)
ct = ColumnTransformer(col_transform)

[('categorical_transformer_0', OrdinalEncoder(dtype=<class 'numpy.int8'>, encoded_missing_value=-1), slice(0, 1, None)), ('categorical_transformer_1', OrdinalEncoder(dtype=<class 'numpy.int8'>, encoded_missing_value=-1), slice(1, 2, None))]


In [12]:
cat_data_npa = train_df[cat_cols].values
print(datetime.datetime.now())
ct.fit_transform(cat_data_npa)
print(datetime.datetime.now())

2022-07-30 19:56:24.296774


KeyboardInterrupt: 

In [23]:
cat_col_transformers = []
ctl = []
print(datetime.datetime.now())
for e in cat_cols:
    cat_data_npa = train_df[[e]].values
    t = OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1)
    cat_col_transformers.append(t)
    cat_data_trans_npa = t.fit_transform(cat_data_npa)
    ctl.append(cat_data_trans_npa)

print(datetime.datetime.now())

print(datetime.datetime.now())
cat_cols_transformed_npa = np.concatenate(ctl,axis=1)
print(datetime.datetime.now())

2022-08-01 16:08:45.726226
2022-08-01 16:08:50.066941
2022-08-01 16:08:50.067023
2022-08-01 16:08:50.128861


In [22]:
print(cat_data_npa_transformed[800:810])

[[ 2  1  1  1  0 -1  5  0  6  1  0]
 [ 2  1  1  1  0 -1  5  0  6  1  0]
 [ 2  2  1  1  0 -1  4  1  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]
 [ 2  2  1  2  0 -1  4  0  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]
 [ 2  2  1  1  0 -1  4  0  6  1  0]]


In [16]:
print(cat_data_npa_transformed.shape)
for e in col_transformers:
    print(e.categories)

(5531451, 11)
auto
auto
auto
auto
auto
auto
auto
auto
auto
auto
auto


In [None]:
num_col_transformers = []
ctl = []
print(datetime.datetime.now())
for e in num_cols:
    num_data_npa = train_df[[e]].values
    t = OrdinalEncoder(dtype = np.int8, encoded_missing_value = -1)
    cat_col_transformers.append(t)
    cat_data_trans_npa = t.fit_transform(cat_data_npa)
    ctl.append(cat_data_trans_npa)

print(datetime.datetime.now())

print(datetime.datetime.now())
cat_cols_transformed_npa = np.concatenate(ctl,axis=1)
print(datetime.datetime.now())

In [None]:
def feature_engineer(train, PAD_CUSTOMER_TO_13_ROWS = True, targets = None):
        
    # REDUCE STRING COLUMNS 
    # from 64 bytes to 8 bytes, and 10 bytes to 3 bytes respectively
    #train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    train['customer_ID'] = train['customer_ID'].str[-16:].apply(lambda x:int(x,16)).astype('int64')
    train.S_2 = pd.to_datetime( train.S_2 )
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    del train['S_2']
        
    # LABEL ENCODE CAT COLUMNS (and reduce to 1 byte)
    # with 0: padding, 1: nan, 2,3,4,etc: values
    d_63_map = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_map).fillna(1).astype('int8')

    d_64_map = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_map).fillna(1).astype('int8')
    
    CATS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68']
    OFFSETS = [2,1,2,2,3,2,3,2,2] #2 minus minimal value in full train csv
    # then 0 will be padding, 1 will be NAN, 2,3,4,etc will be values
    for c,s in zip(CATS,OFFSETS):
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')
    CATS += ['D_63','D_64']
    
    # ADD NEW FEATURES HERE
    # EXAMPLE: train['feature_189'] = etc etc etc
    # EXAMPLE: train['feature_190'] = etc etc etc
    # IF CATEGORICAL, THEN ADD TO CATS WITH: CATS += ['feaure_190'] etc etc etc
    
    # REDUCE MEMORY DTYPE
    SKIP = ['customer_ID','year','month','day']
    for c in train.columns:
        if c in SKIP: continue
        if str( train[c].dtype )=='int64':
            train[c] = train[c].astype('int32')
        if str( train[c].dtype )=='float64':
            train[c] = train[c].astype('float32')
            
    # PAD ROWS SO EACH CUSTOMER HAS 13 ROWS
    if PAD_CUSTOMER_TO_13_ROWS:
        tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
        more = np.array([],dtype='int64') 
        for j in range(1,13):
            i = tmp.loc[tmp==j].index.values
            more = np.concatenate([more,np.repeat(i,13-j)])
        df = train.iloc[:len(more)].copy().fillna(0)
        df = df * 0 - 1 #pad numerical columns with -1
        df[CATS] = (df[CATS] * 0).astype('int8') #pad categorical columns with 0
        df['customer_ID'] = more
        train = pd.concat([train,df],axis=0,ignore_index=True)
        
    # ADD TARGETS (and reduce to 1 byte)
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')
        
    # FILL NAN
    train = train.fillna(-0.5) #this applies to numerical columns
    
    # SORT BY CUSTOMER THEN DATE
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)
    train = train.drop(['year','month','day'],axis=1)
    
    # REARRANGE COLUMNS WITH 11 CATS FIRST
    COLS = list(train.columns[1:])
    COLS = ['customer_ID'] + CATS + [c for c in COLS if c not in CATS]
    train = train[COLS]
    
    return train

In [None]:
# CREATE PROCESSED TRAIN FOLDS AND SAVE TO DISK        
print(datetime.datetime.now())
for k in range(NUM_FOLDS):

    # READ CHUNK OF TRAIN CSV FILE
    skip = int(np.sum( rows[:k] ) + 1) #the plus one is for skipping header
    train = pd.read_csv(FILENAME_TRAIN_DATA_CSV, nrows=rows[k], 
                              skiprows=skip, header=None, names=T_COLS)

    # FEATURE ENGINEER DATAFRAME
    train = feature_engineer(train, targets = targets)

    # SAVE FILES
    print(f'Train_File_{k+1} has {train.customer_ID.nunique()} customers and shape',train.shape)
    tar = train[['customer_ID','target']].drop_duplicates().sort_index()
    #if not os.path.exists(PATH_TO_PROCESSED_DATA): os.makedirs(PATH_TO_DATA)
    tar.to_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k+1}.pqt',index=False)
    data = train.iloc[:,1:-1].values.reshape((-1,13,188))
    np.save(f'{PATH_TO_PROCESSED_DATA}data_{k+1}',data.astype('float32'))

    # CLEAN MEMORY
    del train, tar, data
    gc.collect()
del targets
gc.collect()
print(datetime.datetime.now())

#### Model building:

In [None]:
def build_model():
    
    # INPUT - FIRST 11 COLUMNS ARE CAT, NEXT 177 ARE NUMERIC
    inp = tf.keras.Input(shape=(13,188))
    embeddings = []
    for k in range(11):
        emb = tf.keras.layers.Embedding(10,4)
        embeddings.append( emb(inp[:,:,k]) )
    x = tf.keras.layers.Concatenate()([inp[:,:,11:]]+embeddings)
    
    # SIMPLE RNN BACKBONE
    x = tf.keras.layers.GRU(units=128, return_sequences=False)(x)
    x = tf.keras.layers.Dense(64,activation='relu')(x)
    x = tf.keras.layers.Dense(32,activation='relu')(x)
    
    # OUTPUT
    x = tf.keras.layers.Dense(1,activation='sigmoid')(x)
    
    # COMPILE MODEL
    model = tf.keras.Model(inputs=inp, outputs=x)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss, optimizer = opt)
    
    return model

In [None]:
# CUSTOM LEARNING SCHEUDLE
def lrfn(epoch):
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    return lr[epoch]
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)

In [None]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

#### Model Training:

In [None]:
if True:
    # SAVE TRUE AND OOF
    true = np.array([])
    oof = np.array([])
    VERBOSE = 2 # use 1 for interactive 

    for fold in range(5):

        # INDICES OF TRAIN AND VALID FOLDS
        valid_idx = [2*fold+1, 2*fold+2]
        train_idx = [x for x in [1,2,3,4,5,6,7,8,9,10] if x not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train = []; y_train = []
        for k in train_idx:
            X_train.append( np.load(f'{PATH_TO_PROCESSED_DATA}data_{k}.npy'))
            y_train.append( pd.read_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k}.pqt') )
        X_train = np.concatenate(X_train,axis=0)
        y_train = pd.concat(y_train).target.values
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid = []; y_valid = []
        for k in valid_idx:
            X_valid.append( np.load(f'{PATH_TO_PROCESSED_DATA}data_{k}.npy'))
            y_valid.append( pd.read_parquet(f'{PATH_TO_PROCESSED_DATA}targets_{k}.pqt') )
        X_valid = np.concatenate(X_valid,axis=0)
        y_valid = pd.concat(y_valid).target.values
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        # BUILD AND TRAIN MODEL
        K.clear_session()
        model = build_model()
        h = model.fit(X_train,y_train, 
                      validation_data = (X_valid,y_valid),
                      batch_size=512, epochs=8, verbose=VERBOSE,
                      callbacks = [LR])
        #if not os.path.exists(PATH_TO_MODEL): os.makedirs(PATH_TO_MODEL)
        model.save_weights(f'{PATH_TO_MODEL}gru_fold_{fold+1}.h5')

        # INFER VALID DATA
        print('Inferring validation data...')
        p = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()

        print()
        print(f'Fold {fold+1} CV=', amex_metric_mod(y_valid, p) )
        print()
        true = np.concatenate([true, y_valid])
        oof = np.concatenate([oof, p])
        
        # CLEAN MEMORY
        del model, X_train, y_train, X_valid, y_valid, p
        gc.collect()

    # PRINT OVERALL RESULTS
    print('#'*25)
    print(f'Overall CV =', amex_metric_mod(true, oof) )
    K.clear_session()

#### Model evaluation: