In [1]:
import gc
import numpy as np
import pandas as pd

- Uses this dataset as its base: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
- Feature engineering is taken from: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

In [2]:
# taken from: https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793

def build_features(df, cat_features=None):
    """Collapse a multi-row-per-customer frame into one feature row per customer.

    Rows are grouped by ``customer_ID``. Every column other than
    ``customer_ID`` and ``S_2`` is aggregated:

    * numerical columns -> ``mean``, ``std``, ``min``, ``max``, ``last``
    * categorical columns -> ``last``, ``nunique``

    Output columns are named ``<column>_<aggregation>``; numerical
    aggregates come first, followed by the categorical ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``customer_ID`` column. No sorting is
        done here, so ``last`` follows the row order of ``df``
        (presumably chronological per customer -- verify against the data).
    cat_features : list of str, optional
        Names of the columns to treat as categorical. Defaults to the
        categorical column list used by the referenced notebook. Names
        that are not columns of ``df`` are ignored instead of raising
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Aggregated features indexed by ``customer_ID``.
    """
    # FEATURE ENGINEERING FROM 
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    if cat_features is None:
        cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]

    all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
    present = set(all_cols)
    # Keep only the categorical columns that actually exist in this frame so
    # that a subset/sample of the data does not fail with a KeyError.
    cat_features = [c for c in cat_features if c in present]
    cat_set = set(cat_features)
    num_features = [c for c in all_cols if c not in cat_set]

    grouped = df.groupby("customer_ID")

    def _flat_agg(columns, funcs):
        # Aggregate the selected columns and flatten the (column, func)
        # MultiIndex into "<column>_<func>" names.
        agg = grouped[columns].agg(funcs)
        agg.columns = ['_'.join(x) for x in agg.columns]
        return agg

    parts = []
    if num_features:
        parts.append(_flat_agg(num_features, ['mean', 'std', 'min', 'max', 'last']))
    if cat_features:
        parts.append(_flat_agg(cat_features, ['last', 'nunique']))

    if parts:
        features = pd.concat(parts, axis=1)
    else:
        # Nothing to aggregate: return an empty frame that still has one row
        # per customer rather than letting pd.concat fail on an empty list.
        features = pd.DataFrame(index=grouped.size().index)

    # Release the intermediate aggregates before returning.
    del parts, grouped
    gc.collect()

    print('shape after engineering', features.shape )

    return features

***
## preproc on train

In [3]:
# Read the training statements from the parquet dataset.
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train.parquet")

# Swap the string customer_ID for the integer value of its last 16 hex
# characters (the column is popped and re-added, so it ends up last).
raw_ids = train.pop("customer_ID")
short_hex = raw_ids.str[-16:]
train["customer_ID"] = short_hex.apply(lambda h: int(h, 16))

# Drop the temporary id Series and reclaim memory.
del raw_ids, short_hex
gc.collect()

16

In [4]:
%%time
train_agg = build_features(train)
train_agg.to_parquet("train_agg.parquet")

del train,train_agg
gc.collect()

shape after engineering (458913, 907)
CPU times: user 2min 2s, sys: 16 s, total: 2min 18s
Wall time: 2min 16s


0

***
## preproc on test

In [5]:
# Read the test statements from the parquet dataset.
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test.parquet")

# Swap the string customer_ID for the integer value of its last 16 hex
# characters (the column is popped and re-added, so it ends up last).
raw_ids = test.pop("customer_ID")
short_hex = raw_ids.str[-16:]
test["customer_ID"] = short_hex.apply(lambda h: int(h, 16))

# Drop the temporary id Series and reclaim memory.
del raw_ids, short_hex
gc.collect()

16

In [None]:
%%time
test_agg = build_features(test)

del test,test_agg
gc.collect()

***