In [17]:
class Config:
    """Notebook-wide settings, accessed as plain class attributes (e.g. ``Config.name``)."""

    # --- Experiment ---------------------------------------------------------
    # Relative experiment path; it is joined onto the output directory to form
    # this experiment's own folder.
    name: str = "EDAs/Lag/EDA001"

    n_splits: int = 5
    seed: int = 2022
    # Name of the label column.
    target: str = "target"

    # --- Colab Env ----------------------------------------------------------
    upload_from_colab: bool = True
    api_path: str = "/content/drive/MyDrive/workspace/kaggle.json"
    drive_path: str = "/content/drive/MyDrive/workspace/kaggle-amex"

    # --- Kaggle Env ---------------------------------------------------------
    # Unset by default.
    kaggle_dataset_path = None

    # --- Reka Env -----------------------------------------------------------
    # Base directory under which the input/output/submissions folders live.
    dir_path: str = "/home/abe/kaggle/kaggle-amex"

In [18]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-pastel')
import seaborn as sns
sns.set_palette("winter_r")

from tqdm.auto import tqdm
tqdm.pandas()
warnings.filterwarnings('ignore')

## Environment Settings

In [19]:
# Top-level project folders, all rooted at Config.dir_path.
INPUT, OUTPUT, SUBMISSION = (
    os.path.join(Config.dir_path, sub) for sub in ('input', 'output', 'submissions')
)

# Folder for this experiment (named after Config.name) and its artifact subfolders.
OUTPUT_EXP = os.path.join(OUTPUT, Config.name)
EXP_MODEL, EXP_FIG, EXP_PREDS = (
    os.path.join(OUTPUT_EXP, sub) for sub in ("model", "fig", "preds")
)

# Create whatever is missing. OUTPUT and OUTPUT_EXP are not listed explicitly:
# they are created as parents of the EXP_* folders.
for d in (INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS):
    os.makedirs(d, exist_ok=True)

In [20]:
# Read the labels first so the training frame can be loaded and labelled in a
# single expression. The left merge keeps every training row and attaches the
# label columns by customer_ID.
target = pd.read_csv(os.path.join(INPUT, 'train_labels.csv'))
train = pd.read_parquet(os.path.join(INPUT, 'train_small.parquet')).merge(
    target, on='customer_ID', how='left'
)
test = pd.read_parquet(os.path.join(INPUT, 'test_small.parquet'))

In [24]:
# Print the row/column counts, dtype breakdown and memory usage of the merged training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603128 entries, 0 to 603127
Columns: 191 entries, customer_ID to target
dtypes: float32(93), int16(9), int64(1), int8(86), object(2)
memory usage: 292.2+ MB


In [25]:
# Preview the first five rows of the merged training frame.
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-03-08,0.986959,12,0.164017,0.083942,0.504688,0.096874,0.0,0.425046,...,-1,-1,0,0,0.0,,0,0.009946,0,0
1,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-04-25,1.001113,0,0.024842,1.009856,0.000869,0.093035,0.0,0.22911,...,-1,-1,0,0,0.0,,0,0.003058,0,0
2,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-05-13,1.008732,16,0.038798,1.000989,0.004768,0.089419,0.0,0.171368,...,-1,-1,0,0,0.0,,0,0.00955,0,0
3,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-06-29,1.007229,2,0.030237,1.004904,0.005655,0.068247,0.0,0.075452,...,-1,-1,0,0,0.0,,0,0.00793,0,0
4,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-07-06,1.007854,9,0.053108,1.000425,0.006688,0.076103,0.0,0.079313,...,-1,-1,0,0,0.0,,0,0.003996,0,0


## Let's look at lag features

1. Difference between the first record and the last record  
2. Shifted records for 3 steps  
3. Mean of the last 3 records  

In [21]:
# Columns treated as categorical.
cat_features = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# Everything that is not categorical, the label, the S_2 date column or the
# customer key counts as a continuous feature (original column order is kept).
_non_cont = set(cat_features) | {Config.target, 'S_2', 'customer_ID'}
cont_features = [col for col in train.columns if col not in _non_cont]

In [37]:
def add_diff_features(df, features=None, periods=1):
    """Return, for each customer, the latest ``periods``-step difference of every feature.

    Within each ``customer_ID`` group, every feature column is differenced
    against the row ``periods`` positions earlier, and only the last row of
    each group is kept. Rows are used in the order they appear in ``df``; no
    sorting happens here, so the caller must pass them in the intended
    (e.g. chronological) order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID`` and every column named in ``features``.
        The frame is only read, never modified.
    features : iterable of str, optional
        Columns to difference. When omitted, the notebook-level
        ``cont_features`` list is used (the original behavior).
    periods : int, default 1
        Row offset handed to ``diff``; it is also used as the suffix of the
        output column names.

    Returns
    -------
    pandas.DataFrame
        One row per customer with ``customer_ID`` plus one
        ``{feature}_diff{periods}`` column per feature, on a fresh 0..n-1
        index. Customers with too few rows for the requested offset get NaN.
    """
    # Fall back to the global list only when the caller did not choose columns.
    features = cont_features if features is None else list(features)

    diffs = (
        df.groupby('customer_ID')[features]
        .diff(periods)
        .rename(columns={f: f"{f}_diff{periods}" for f in features})
    )
    # diff() drops the grouping key; put it back so rows can be regrouped.
    keyed = pd.concat([df['customer_ID'], diffs], axis=1)
    # Keep only each customer's final row, i.e. the most recent difference.
    return keyed.groupby('customer_ID').tail(1).reset_index(drop=True)
# add_diff_features only reads its input (every step returns a new object), so
# pass `train` directly rather than duplicating the whole frame with .copy().
df_diff1 = add_diff_features(train)

In [38]:
# Display the diff-feature frame (one row per customer_ID).
df_diff1

Unnamed: 0,customer_ID,P_2_diff1,D_39_diff1,B_1_diff1,B_2_diff1,R_1_diff1,S_3_diff1,D_41_diff1,B_3_diff1,D_42_diff1,...,D_136_diff1,D_137_diff1,D_138_diff1,D_139_diff1,D_140_diff1,D_141_diff1,D_142_diff1,D_143_diff1,D_144_diff1,D_145_diff1
0,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,0.001705,-6.0,0.020633,-0.190721,0.000823,-0.008284,0.000000,0.025717,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,0.000061,0.0
1,7ae43cfccf1d74d4aa50c12ee12411c4ad264eb7e7232e...,0.059731,1.0,0.012054,0.189584,-0.503416,0.167608,-0.094975,-0.004269,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,-0.003491,0.0
2,161f768eeb1e1ea84b83693ce04a920a9f110b689e0bba...,0.007374,24.0,0.003360,0.003720,0.003660,0.026764,0.000000,0.026753,-0.018083,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,0.001775,0.0
3,3abf7456317aae86edcfb1829918f9e3f0cbafdab057a6...,-0.020413,0.0,0.089147,0.015177,0.002882,-0.007481,0.000000,0.073336,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,0.005985,0.0
4,9a130c90c414dbc6fe4be822e73b143a9af6d9829d41b1...,-0.012118,-30.0,-0.022378,-0.020270,-0.502947,,0.434397,-0.181881,,...,0.0,0.0,0.0,0.0,0.0,0.003372,0.002110,0.0,0.006705,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47331,e4ada6d775158e5111c5d4753d48ed9dfd899b7d9c0bfa...,0.003154,-7.0,0.002286,0.009483,0.006075,0.019104,0.000000,-0.065522,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,-0.002018,0.0
47332,165bd39372f37e4c299daf60b4caae70988d920d2b10fd...,0.003652,7.0,0.004022,0.005214,0.003417,-0.023826,0.000000,0.002238,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,0.000994,0.0
47333,2f000e866fddd4e1a77b1a545bd97b6afcf6584dc18814...,-0.043740,-10.0,-0.012441,-0.127893,-0.000916,0.075494,0.000000,0.030580,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,-0.000585,0.0
47334,9f508ccc29282519cf2e40f3b95d48996f4ae96fbe375b...,,-13.0,-0.023427,0.006866,-0.005195,0.189153,0.000000,-0.000491,,...,0.0,0.0,0.0,0.0,0.0,0.000000,,0.0,-0.000740,0.0
