In [1]:
import gc
import numba
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
from joblib import Parallel,delayed
import time

# Lift pandas' display limits: no cap on the number of columns shown and no
# truncation of cell contents when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

This dataset aggregates the features over the time dimension.

- Base dataset: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
- Feature engineering adapted from: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
- Lag-feature idea taken from: https://www.kaggle.com/code/thedevastator/lag-features-are-all-you-need/

In [2]:
@numba.njit()
def compute_slope(x, y):
    """Return the least-squares slope of ``y`` regressed on ``x``.

    Computed as ``sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)``.
    The denominator is zero when every element of ``x`` is identical
    (including a single-element ``x``); callers are expected to avoid that case.
    """
    dx = x - x.mean()
    dy = y - y.mean()
    covariance_sum = np.sum(dx * dy)
    variance_sum = np.sum(dx ** 2)
    return covariance_sum / variance_sum

def compute_slope_cols(df, customer_ID, num_features):
    """Compute, for one customer, the slope of each numeric feature against row position.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single customer. The row position (0..n-1) is used as the
        x-axis, so rows are presumably expected in chronological order
        (TODO confirm against the caller).
    customer_ID : hashable
        Identifier stored in the result under the ``"customer_ID"`` key.
    num_features : list of str
        Columns of ``df`` for which a slope is computed.

    Returns
    -------
    dict
        ``{feature: slope, ..., "customer_ID": customer_ID}``. When the
        customer has two rows or fewer, every slope is reported as ``0``.
    """
    n = len(df)
    if n > 2:
        x = np.arange(n)
        # Fill gaps forward, then backward for leading gaps, so the regression
        # sees no NaN unless a column is entirely missing.
        # `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
        filled = df[num_features].ffill(axis=0).bfill(axis=0)
        r = filled.apply(lambda y: compute_slope(x, y.values)).to_dict()
    else:
        # Too few observations for a trend: use 0 for every feature.
        r = df[num_features].apply(lambda y: 0).to_dict()
    r["customer_ID"] = customer_ID
    return r

def mode_1st(x):
    """Return the most frequent value of the Series ``x``.

    ``value_counts`` orders values by descending frequency, so the first index
    label is the mode. Raises ``IndexError`` when there is nothing to count.
    """
    frequencies = x.value_counts()
    most_common = frequencies.index[0]
    return most_common

def mode_2nd(x):
    """Return the second most frequent value of the Series ``x``.

    Returns ``-1`` when ``x`` has fewer than two distinct counted values.
    The case is detected with an explicit length check rather than a bare
    ``except``, so unrelated errors (e.g. ``x`` not being a Series) are no
    longer silently turned into ``-1``.
    """
    frequencies = x.value_counts()
    if len(frequencies) < 2:
        return -1
    return frequencies.index[1]

In [3]:
# references: 
# https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
# after pay feats: https://www.kaggle.com/code/jiweiliu/rapids-cudf-feature-engineering-xgb

def build_features(df):
    """Aggregate the per-statement rows of ``df`` into one feature row per customer.

    The input frame is NOT modified, so the function can safely be called
    more than once on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID``, ``S_2`` and the categorical columns
        listed in ``cat_features``. Every other column is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID``, with columns in this order:

        * ``{col}_{first,mean,std,min,max,last}`` for each numeric column,
          including the derived "after pay" differences ``{bcol}-{pcol}``;
        * ``{col}_lag_sub`` (last - first) and ``{col}_lag_div``
          (last / (first + 1e-15)) for each numeric column;
        * ``{col}_{first,last,nunique}`` for each categorical column;
        * ``S_2_steps``, the number of non-null ``S_2`` rows per customer.
    """
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [
        c for c in df.columns
        if c not in ('customer_ID', 'S_2') and c not in cat_features
    ]
    num_aggs = ['first', 'mean', 'std', 'min', 'max', 'last']

    print("Computing 'after pay' features")
    tic = time.time()
    # Collect the differences in a separate frame instead of writing them
    # into the caller's DataFrame.
    after_pay = {}
    for bcol in [f'B_{i}' for i in [11,14,17]]+['D_39','D_131']+[f'S_{i}' for i in [16,23]]:
        for pcol in ['P_2','P_3']:
            name = f'{bcol}-{pcol}'
            # Both operands must exist. A column that already carries this
            # name is already in num_features and must not be added twice.
            if bcol in df.columns and pcol in df.columns and name not in df.columns:
                after_pay[name] = df[bcol] - df[pcol]
    after_pay = pd.DataFrame(after_pay, index=df.index)
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing numerical aggregations")
    tic = time.time()
    num_parts = [df.groupby("customer_ID")[num_features].agg(num_aggs)]
    if len(after_pay.columns) > 0:
        # Group the derived columns by the same key; both results share the
        # same customer_ID index, so they line up when concatenated.
        num_parts.append(after_pay.groupby(df["customer_ID"]).agg(num_aggs))
        num_features = num_features + list(after_pay.columns)
    df_num_agg = pd.concat(num_parts, axis=1)
    df_num_agg.columns = ['_'.join(x) for x in df_num_agg.columns]
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing lag features")
    # Build all lag columns first and attach them in a single concat;
    # inserting them one by one fragments the wide frame.
    lag_cols = {}
    for col in num_features:
        first = df_num_agg[f"{col}_first"]
        last = df_num_agg[f"{col}_last"]
        lag_cols[f"{col}_lag_sub"] = last - first
        # The small epsilon avoids an exact division by zero.
        lag_cols[f"{col}_lag_div"] = last / (first+1e-15)
    df_num_agg = pd.concat(
        [df_num_agg, pd.DataFrame(lag_cols, index=df_num_agg.index)], axis=1
    )

    print("Computing categorical aggregations")
    tic = time.time()
    df_cat_agg = df.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
    df_cat_agg.columns = ['_'.join(x) for x in df_cat_agg.columns]
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    # NOTE: slope features are currently disabled; the code is kept for reference.
    #print("Computing slope features")
    #tic = time.time()
    #with Parallel(n_jobs=8) as parallel:
    #        delayed_func = delayed(compute_slope_cols)
    #        results = parallel(
    #            delayed_func(_df, customer_ID, num_features) 
    #            for customer_ID,_df in tqdm(df.groupby("customer_ID"))
    #        )
    #slopes_df = pd.DataFrame(results).fillna(0).set_index("customer_ID")
    #slopes_df.columns = [f"{col}_slope" for col in slopes_df.columns]
    #tac = time.time()
    #print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Building some other features")
    df_count = df.groupby("customer_ID")["S_2"].count().rename("S_2_steps").to_frame()

    # Use a new name for the result rather than rebinding the `df` parameter.
    features = pd.concat([df_num_agg, df_cat_agg, df_count], axis=1)
    del df_num_agg, df_cat_agg, df_count
    gc.collect()

    print('shape after engineering', features.shape )

    return features

***
## preproc on train

In [4]:
# Load the training split of the integer-dtype parquet dataset.
train = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet")

In [5]:
%%time
# Build the per-customer aggregated features for train and persist them.
train_agg = build_features(train)
train_agg.to_parquet("../data/processed/dsv02/train.parquet")

# Drop both frames and collect garbage to release memory before the next step.
del train,train_agg
gc.collect()

Computing 'after pay' features
Elapsed time: 0.002382922172546387 min

Computing numerical aggregations
Elapsed time: 1.184737765789032 min

Computing lag features




Computing categorical aggregations
Elapsed time: 0.10005617141723633 min

Building some other features
shape after engineering (458913, 1562)
CPU times: user 1min 32s, sys: 19.7 s, total: 1min 52s
Wall time: 2min 1s


0

***
## preproc on test

In [6]:
# Load the test split of the integer-dtype parquet dataset.
test = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet")

In [7]:
%%time
# Build the per-customer aggregated features for test and persist them.
test_agg = build_features(test)
test_agg.to_parquet("../data/processed/dsv02/test.parquet")

# Drop both frames and collect garbage to release memory.
del test,test_agg
gc.collect()

Computing 'after pay' features
Elapsed time: 0.0032119949658711753 min

Computing numerical aggregations
Elapsed time: 2.7465876301129657 min

Computing lag features




Computing categorical aggregations
Elapsed time: 0.2289643128712972 min

Building some other features
shape after engineering (924621, 1562)
CPU times: user 3min 1s, sys: 56 s, total: 3min 57s
Wall time: 4min 3s


0

***