In [1]:
class Config:
    name = "EDAs/Lag/EmbededLasso"

    n_splits = 5
    seed = 2022
    target = "target"

    # Colab Env
    upload_from_colab = True
    api_path = "/content/drive/MyDrive/workspace/kaggle.json"
    drive_path = "/content/drive/MyDrive/workspace/kaggle-amex"

    # Kaggle Env
    kaggle_dataset_path = None

    # Reka Env
    dir_path = '/home/abe/kaggle/kaggle-amex'

In [2]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-pastel')
import seaborn as sns
sns.set_palette("winter_r")

from tqdm.auto import tqdm
tqdm.pandas()
warnings.filterwarnings('ignore')

## Environment Settings

In [3]:
INPUT = os.path.join(Config.dir_path, 'input')
OUTPUT = os.path.join(Config.dir_path, 'output')
SUBMISSION = os.path.join(Config.dir_path, 'submissions')
OUTPUT_EXP = os.path.join(OUTPUT, Config.name)
EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

# make dirs
for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
    os.makedirs(d, exist_ok=True)

In [4]:
train = pd.read_parquet(os.path.join(INPUT, 'train_small.parquet'))
target = pd.read_csv(os.path.join(INPUT, 'train_labels.csv'))
train = train.merge(target, how='left', on='customer_ID')
test = pd.read_parquet(os.path.join(INPUT, 'test_small.parquet'))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603128 entries, 0 to 603127
Columns: 191 entries, customer_ID to target
dtypes: float32(93), int16(9), int64(1), int8(86), object(2)
memory usage: 292.2+ MB


In [6]:
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-03-08,0.986959,12,0.164017,0.083942,0.504688,0.096874,0.0,0.425046,...,-1,-1,0,0,0.0,,0,0.009946,0,0
1,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-04-25,1.001113,0,0.024842,1.009856,0.000869,0.093035,0.0,0.22911,...,-1,-1,0,0,0.0,,0,0.003058,0,0
2,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-05-13,1.008732,16,0.038798,1.000989,0.004768,0.089419,0.0,0.171368,...,-1,-1,0,0,0.0,,0,0.00955,0,0
3,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-06-29,1.007229,2,0.030237,1.004904,0.005655,0.068247,0.0,0.075452,...,-1,-1,0,0,0.0,,0,0.00793,0,0
4,f767e9a1c77a72815a2d61bd4ef1de3dcb2f659f86c4c5...,2017-07-06,1.007854,9,0.053108,1.000425,0.006688,0.076103,0.0,0.079313,...,-1,-1,0,0,0.0,,0,0.003996,0,0


## Select important lag features with Lasso

In [7]:
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cont_features = [col for col in train.columns if col not in cat_features + [Config.target, 'S_2', 'customer_ID']]

In [8]:
def _add_diff_features(args, step=3):
    customer_id, df = args
    dfs = []
    for i in range(step):
        shift = i+1
        df_diff = df[cont_features].diff(shift).rename(columns={f: f"{f}_diff{shift}" for f in cont_features})
        df_diff = df_diff.tail(1).reset_index(drop=True)
        dfs.append(df_diff)
    df = pd.concat(dfs, axis=1)
    df['customer_ID'] = customer_id
    return df


def add_diff_features(df : pd.DataFrame, processes=32):
    with multiprocessing.Pool(processes=processes) as pool:
        dfs = pool.imap_unordered(_add_diff_features, df.groupby('customer_ID'))
        dfs = tqdm(dfs)
        dfs = list(dfs)
    df = pd.concat(dfs)
    return df.reset_index(drop=True).sort_index(axis=1)

train_diff = add_diff_features(train.copy()).merge(target, how='left', on='customer_ID')
test_dfff = add_diff_features(test.copy())

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [45]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

def select_diff_features(df : pd.DataFrame, features, target, max_features=150):
    diff_features_list = []
    train_y = df[target]
    train_X = df[features]
    
    train_X = train_X.fillna(-999)
    
    # select features with L1 norm
    scaler = StandardScaler()
    scaler.fit(train_X)
    for alpha in [3e-3, 5e-3, 7e-3, 1e-2, 3e-2]:
        selector = SelectFromModel(Lasso(alpha=alpha), max_features=max_features)
        selector.fit(scaler.transform(train_X), train_y)
        selected_features = train_X.columns.values[selector.get_support()]
        diff_features_list.append(selected_features)
        print('number of selected features : {}'.format(len(selected_features)))
    
    return diff_features_list
    
        
features = [col for col in train_diff.columns if col not in ['customer_ID', Config.target]]
diff_features_list = select_diff_features(train_diff, features, Config.target)

number of selected features : 65
number of selected features : 54
number of selected features : 47
number of selected features : 41
number of selected features : 19


In [16]:
def _add_shift_features(args, step=3):
    customer_id, df = args
    dfs = []
    for i in range(step):
        shift = i+1
        df_shift = df[cont_features + cat_features].shift(shift).rename(columns={f: f"{f}_shift{shift}" for f in cont_features + cat_features})
        df_shift = df_shift.tail(1).reset_index(drop=True)
        dfs.append(df_shift)
    df = pd.concat(dfs, axis=1)
    df['customer_ID'] = customer_id
    return df

def add_shift_features(df : pd.DataFrame, processes=32):
    with multiprocessing.Pool(processes=processes) as pool:
        dfs = pool.imap_unordered(_add_shift_features, df.groupby('customer_ID'))
        dfs = tqdm(dfs)
        dfs = list(dfs)
    df = pd.concat(dfs)
    return df.reset_index(drop=True).sort_index(axis=1)

train_shift = add_shift_features(train.copy()).merge(target, how='left', on='customer_ID')
test_shift = add_shift_features(test.copy())

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [48]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

def select_shift_features(df : pd.DataFrame, features, target, max_features=150):
    shift_features_list = []
    train_y = df[target]
    train_X = df[features]
    
    cont_shift_features = []
    for shift in [1, 2, 3]:
        for col in cont_features:
            cont_shift_features.append(f"{col}_shift{shift}")
    cat_shift_features = []
    for shift in [1, 2, 3]:
        for col in cat_features:
            cat_shift_features.append(f"{col}_shift{shift}")
    
    # number of categorical features are small
    # simply select continuous features
    train_X = train_X[cont_shift_features].fillna(-999)
    
    # select features with L1 norm
    scaler = StandardScaler()
    scaler.fit(train_X)
    for alpha in [7e-3, 1e-2, 3e-2]:
        selector = SelectFromModel(Lasso(alpha=alpha), max_features=max_features)
        selector.fit(scaler.transform(train_X), train_y)
        selected_features = train_X.columns.values[selector.get_support()]
        selected_features = np.hstack((selected_features, cat_shift_features))
        shift_features_list.append(selected_features)
        print('number of selected features : {}'.format(len(selected_features)))
    
    return shift_features_list
    
        
features = [col for col in train_shift.columns if col not in ['customer_ID', Config.target]]
shift_features_list = select_shift_features(train_shift, features, Config.target)

number of selected features : 86
number of selected features : 80
number of selected features : 58
number of selected features : 35


In [50]:
def _add_pct_features(args, step=3):
    customer_id, df = args
    dfs = []
    for i in range(step):
        shift = i+1
        df_pct = df[cont_features].pct_change(shift).rename(columns={f: f"{f}_pct{shift}" for f in cont_features})
        df_pct = df_pct.tail(1).reset_index(drop=True)
        dfs.append(df_pct)
    df = pd.concat(dfs, axis=1)
    df['customer_ID'] = customer_id
    return df

def add_pct_features(df : pd.DataFrame, processes=32):
    with multiprocessing.Pool(processes=processes) as pool:
        dfs = pool.imap_unordered(_add_pct_features, df.groupby('customer_ID'))
        dfs = tqdm(dfs)
        dfs = list(dfs)
    df = pd.concat(dfs)
    return df.reset_index(drop=True).sort_index(axis=1)

train_pct = add_pct_features(train.copy()).merge(target, how='left', on='customer_ID')
test_pct = add_pct_features(test.copy())

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [83]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

def select_pct_features(df : pd.DataFrame, features, target, max_features=150):
    pct_features_list = []
    train_y = df[target]
    train_X = df[features]
    
    train_X = train_X.replace([np.inf, -np.inf], np.nan)
    train_X = train_X.fillna(-999)
    
    # select features with L1 norm
    scaler = StandardScaler()
    scaler.fit(train_X)
    for alpha in [3e-3, 5e-3, 7e-3, 1e-2, 3e-2]:
        selector = SelectFromModel(Lasso(alpha=alpha), max_features=max_features)
        selector.fit(scaler.transform(train_X), train_y)
        selected_features = train_X.columns.values[selector.get_support()]
        pct_features_list.append(selected_features)
        print('number of selected features : {}'.format(len(selected_features)))
    
    return pct_features_list
    
features = [col for col in train_pct.columns if col not in ['customer_ID', Config.target]]
pct_features_list = select_pct_features(train_pct, features, Config.target)

number of selected features : 118
number of selected features : 90
number of selected features : 76
number of selected features : 64
number of selected features : 34


In [56]:
def _add_avg_features(args, step=3):
    customer_id, df = args
    dfs = []
    for window in [3, 6, 9, 12]:
        df_avg = df[cont_features].rolling(window).mean(skipna=True).rename(columns={f: f"{f}_avg{window}" for f in cont_features})
        df_avg = df_avg.tail(1).reset_index(drop=True)
        dfs.append(df_avg)
    df = pd.concat(dfs, axis=1)
    df['customer_ID'] = customer_id
    return df

def add_avg_features(df : pd.DataFrame, processes=32):
    with multiprocessing.Pool(processes=processes) as pool:
        dfs = pool.imap_unordered(_add_avg_features, df.groupby('customer_ID'))
        dfs = tqdm(dfs)
        dfs = list(dfs)
    df = pd.concat(dfs)
    return df.reset_index(drop=True).sort_index(axis=1)

train_avg = add_avg_features(train.copy()).merge(target, how='left', on='customer_ID')
test_avg = add_avg_features(test.copy())

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [63]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

def select_avg_features(df : pd.DataFrame, features, target, max_features=150):
    avg_features_list = []
    train_y = df[target]
    train_X = df[features]
    
    train_X = train_X.fillna(-999)
    
    # select features with L1 norm
    scaler = StandardScaler()
    scaler.fit(train_X)
    for alpha in [3e-3, 1e-2, 3e-2, 7e-2]:
        selector = SelectFromModel(Lasso(alpha=alpha), max_features=max_features)
        selector.fit(scaler.transform(train_X), train_y)
        selected_features = train_X.columns.values[selector.get_support()]
        avg_features_list.append(selected_features)
        print('number of selected features : {}'.format(len(selected_features)))
    
    return avg_features_list
    
        
features = [col for col in train_avg.columns if col not in ['customer_ID', Config.target]]
avg_features_list = select_avg_features(train_avg, features, Config.target)

number of selected features : 60
number of selected features : 38
number of selected features : 24
number of selected features : 9


In [25]:
train_diff.set_index('customer_ID').join(
    train_shift.set_index('customer_ID'),
    how='left').join(
        train_pct.set_index('customer_ID'),
        how='left').join(
            train_avg.set_index('customer_ID'),
    how='left').reset_index(drop=False)


Unnamed: 0,customer_ID,B_10_diff1,B_10_diff2,B_10_diff3,B_11_diff1,B_11_diff2,B_11_diff3,B_12_diff1,B_12_diff2,B_12_diff3,...,S_7_avg6,S_7_avg9,S_8_avg12,S_8_avg3,S_8_avg6,S_8_avg9,S_9_avg12,S_9_avg3,S_9_avg6,S_9_avg9
0,00031e8be98bc3411f6037cbd4d3eeaf24b3ae221682b7...,19.927452,19.923395,19.904968,0.004417,0.007120,-0.027982,-0.002339,0.010416,0.004515,...,0.152841,0.183534,969.083333,664.000000,636.666667,845.666667,0.015174,0.024189,0.017479,0.015676
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,-0.004486,-0.004114,0.000559,-0.000448,0.008205,0.014154,-0.003547,-0.006914,-0.010948,...,0.122295,0.113279,1237.000000,1100.333333,909.166667,986.444444,,,,
2,000333075fb8ec6d504539852eeeb762643562e701ac79...,0.005244,0.009799,0.011105,-0.009249,0.003968,-0.007162,-0.001556,-0.003359,-0.045376,...,0.056963,0.058759,2904.000000,2642.000000,2642.000000,2816.666667,0.048090,0.028336,0.031513,0.044035
3,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,0.000184,0.001711,0.002984,0.023236,0.050187,0.057631,-0.000864,0.001148,-0.004457,...,,,212.333333,849.333333,424.666667,283.111111,,,,
4,0002e335892f7998f0feb3a59f32d652f0da7c85e535b9...,0.005607,-0.019749,-0.020489,0.028396,0.050682,0.117197,-0.001075,0.005746,-0.001984,...,,,383.166667,793.333333,766.333333,510.888889,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47331,fff82c674170f98c87408aa5552011105ca20f120d13d0...,-0.001397,0.002399,0.001322,-0.007368,-0.005935,-0.005865,-0.011332,-0.003345,-0.007193,...,,,0.000000,0.000000,0.000000,0.000000,,,,
47332,fffdc0cf228085b4275b38ebe6eb915766af3fecb2ae28...,0.054355,0.112287,0.075323,-0.033441,-0.019170,-0.015189,0.037052,0.038169,0.050114,...,0.031230,0.032574,2773.000000,2642.000000,2642.000000,2642.000000,0.015850,0.017013,0.013213,0.014130
47333,fffcae1af226772f35143cd002489b2c99a0f4acdd5932...,-0.022983,-0.017761,-0.007625,0.010993,-0.016954,0.000151,0.006223,0.006662,0.013123,...,0.160421,0.142370,1591.333333,1727.666667,1530.500000,1630.777778,0.009667,0.008686,0.009100,0.009717
47334,fffe13e28dc3ceadf28249b596ba25df93e38ec53d38cf...,0.061160,0.061242,,-0.024740,-0.037190,,0.040789,0.044289,,...,,,,1708.666667,,,,,,


In [22]:
train_diff.set_index('customer_ID').join(train_shift.set_index('customer_ID'), how='left')

Unnamed: 0_level_0,B_10_diff1,B_10_diff2,B_10_diff3,B_11_diff1,B_11_diff2,B_11_diff3,B_12_diff1,B_12_diff2,B_12_diff3,B_13_diff1,...,S_6_shift3,S_7_shift1,S_7_shift2,S_7_shift3,S_8_shift1,S_8_shift2,S_8_shift3,S_9_shift1,S_9_shift2,S_9_shift3
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00031e8be98bc3411f6037cbd4d3eeaf24b3ae221682b77900533c9bb36ec41f,19.927452,19.923395,19.904968,0.004417,0.007120,-0.027982,-0.002339,0.010416,0.004515,-0.068877,...,0.0,0.124173,0.110130,0.116545,996.0,0.0,528.0,0.014504,0.012081,0.014437
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,-0.004486,-0.004114,0.000559,-0.000448,0.008205,0.014154,-0.003547,-0.006914,-0.010948,0.009185,...,0.0,0.136180,0.099288,0.095556,996.0,1021.0,1021.0,,,
000333075fb8ec6d504539852eeeb762643562e701ac79b2101ab0f9471eeb5a,0.005244,0.009799,0.011105,-0.009249,0.003968,-0.007162,-0.001556,-0.003359,-0.045376,-0.100237,...,0.0,0.045073,0.063592,0.068701,2380.0,2380.0,2380.0,0.028105,0.031975,0.035209
00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e4063bdf6b0f7d51eb6,0.000184,0.001711,0.002984,0.023236,0.050187,0.057631,-0.000864,0.001148,-0.004457,0.016553,...,1.0,0.237897,0.242219,,1454.0,322.0,0.0,,,
0002e335892f7998f0feb3a59f32d652f0da7c85e535b99ea6f87fd317ed47f4,0.005607,-0.019749,-0.020489,0.028396,0.050682,0.117197,-0.001075,0.005746,-0.001984,-0.002032,...,1.0,0.097007,0.096326,0.107518,0.0,2380.0,2218.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff82c674170f98c87408aa5552011105ca20f120d13d0b547fb226be9c68e84,-0.001397,0.002399,0.001322,-0.007368,-0.005935,-0.005865,-0.011332,-0.003345,-0.007193,0.006558,...,1.0,,,,0.0,0.0,0.0,,,
fffdc0cf228085b4275b38ebe6eb915766af3fecb2ae28f6ffa70f4e22d7029f,0.054355,0.112287,0.075323,-0.033441,-0.019170,-0.015189,0.037052,0.038169,0.050114,-0.114548,...,0.0,0.020554,0.034710,0.027200,2380.0,2380.0,3166.0,0.013795,0.014503,0.010120
fffcae1af226772f35143cd002489b2c99a0f4acdd593210b0e6f32dae1074d9,-0.022983,-0.017761,-0.007625,0.010993,-0.016954,0.000151,0.006223,0.006662,0.013123,0.008752,...,0.0,0.151570,0.129886,0.154913,1454.0,1511.0,1511.0,0.014643,0.005613,0.008198
fffe13e28dc3ceadf28249b596ba25df93e38ec53d38cf28ee13d50d4b7b1a22,0.061160,0.061242,,-0.024740,-0.037190,,0.040789,0.044289,,0.019153,...,,0.241101,0.166198,,1454.0,1454.0,,,,
