In [1]:
import gc
import numba
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
from joblib import Parallel,delayed
import time
import re

#from pandarallel import pandarallel
#pandarallel.initialize(progress_bar=True, use_memory_fs=True)

import sys
sys.path.append("../utils")
from memory import reduce_mem_usage

# Lift pandas' display limits: show every column and the full content of each cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

This dataset aggregates the features over the time dimension.

- Base dataset: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
- Feature engineering adapted from: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
- Lag-feature idea taken from: https://www.kaggle.com/code/thedevastator/lag-features-are-all-you-need/

In [2]:
@numba.njit()
def compute_slope(x, y):
    """Return the least-squares slope of ``y`` regressed on ``x``.

    The value is ``sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)``.
    Both arguments must provide ``.mean()`` and elementwise arithmetic; the
    caller in this file passes NumPy arrays of equal length.
    """
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    return np.sum(x_centered * y_centered) / np.sum(x_centered ** 2)

def compute_slope_cols(df, customer_ID, num_features):
    """Compute the per-column trend slope for one customer's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to a single customer, in the order they should be
        regressed against (row position is used as the x axis).
    customer_ID : hashable
        Identifier stored in the result under the ``"customer_ID"`` key.
    num_features : list of str
        Columns of ``df`` to compute a slope for.

    Returns
    -------
    dict
        ``{column: slope, ..., "customer_ID": customer_ID}``. When ``df`` has
        two rows or fewer every slope is reported as ``0``.
    """
    n_rows = len(df)
    if n_rows > 2:
        steps = np.arange(n_rows)
        # Fill gaps forward, then backward so leading NaNs are covered too.
        # `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
        filled = df[num_features].ffill().bfill()
        slopes = filled.apply(lambda col: compute_slope(steps, col.values)).to_dict()
    else:
        # Too few points: report a flat trend for every feature.
        slopes = dict.fromkeys(num_features, 0)
    slopes["customer_ID"] = customer_ID
    return slopes

def mode_1st(x):
    """Return the most frequent value of ``x``, i.e. the first label of ``x.value_counts()``."""
    counts = x.value_counts()
    most_common = counts.index[0]
    return most_common

def mode_2nd(x):
    """Return the second most frequent value of ``x``, or ``-1`` if there is none.

    ``x`` must provide ``value_counts()`` (e.g. a pandas Series). The fallback
    ``-1`` is returned only when fewer than two distinct values were counted;
    any other problem (such as a wrong input type) is raised instead of being
    silently turned into ``-1``.
    """
    counts = x.value_counts()
    if len(counts) < 2:
        return -1
    return counts.index[1]

# NOTE: a bare `numba.njit()` statement used to sit above this function. Without
# the leading `@` it was a no-op, so this has always executed as plain Python;
# the dead statement is removed to make that explicit.
def compute_last_diff(array):
    """Return ``array[-1] - array[-2]``, or ``np.nan`` when there are fewer than two elements.

    NumPy integer scalars are widened to Python ints before subtracting so that
    narrow dtypes (e.g. int8) cannot silently wrap around on overflow. All other
    element types are subtracted as-is.
    """
    if len(array) <= 1:
        return np.nan
    last, previous = array[-1], array[-2]
    if isinstance(last, np.integer) and isinstance(previous, np.integer):
        # Fixed-width integer arithmetic wraps on overflow; Python ints do not.
        return int(last) - int(previous)
    return last - previous
    
def compute_last_diff_series(df, col):
    """Apply ``compute_last_diff`` to column ``col`` within each ``customer_ID`` group.

    Returns a Series indexed by ``customer_ID`` whose name is the grouped
    column's name with ``"_diff"`` appended.
    """
    grouped_col = df.groupby("customer_ID")[col]
    last_diffs = grouped_col.apply(lambda values: compute_last_diff(values.values))
    last_diffs.name = f"{last_diffs.name}_diff"
    return last_diffs

In [3]:
# references: 
# https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
# after pay feats: https://www.kaggle.com/code/jiweiliu/rapids-cudf-feature-engineering-xgb
# other lag features: https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977

def remove_noise(df):
    """Round every float32 column of ``df`` to two decimals.

    The columns are overwritten on ``df`` itself, and that same object is
    returned. The number of affected columns is printed.
    """
    is_float32 = (df.dtypes == "float32").to_numpy()
    noisy_cols = df.columns[is_float32]
    print(f"# of float cols to reduce noise: {len(noisy_cols)}")

    for name in noisy_cols:
        rounded = df[name].round(decimals=2)
        df[name] = rounded

    return df

def build_features(df):
    """Aggregate the rows of ``df`` into one feature row per ``customer_ID``.

    Steps, in order:

    1. 'After pay' differences ``{bcol}-{pcol}`` are added as new columns
       **on the input frame itself** (the caller's ``df`` is mutated) and
       treated as numerical features.
    2. Numerical features are aggregated per customer with
       first/mean/std/min/max/last.
    3. Lag features ``last - first`` and ``last - mean`` are derived; the
       ``*_first`` aggregates are then dropped.
    4. The difference between the last two observations of every numerical
       feature is computed in parallel via ``compute_last_diff_series``.
    5. Categorical features are aggregated with first/last/nunique.
    6. The per-customer count of non-null ``S_2`` values is stored as
       ``S_2_steps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID``, ``S_2`` and the categorical columns
        listed in ``cat_features`` below; every other column is treated as a
        numerical feature.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID``, columns ordered as: numerical aggregates,
        lag features, last-diff features, categorical aggregates, ``S_2_steps``.
    """
    id_cols = ['customer_ID', 'S_2']
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [c for c in df.columns if c not in id_cols and c not in cat_features]

    print("Computing 'after pay' features")
    tic = time.time()
    after_pay_bases = [f'B_{i}' for i in [11,14,17]] + ['D_39','D_131'] + [f'S_{i}' for i in [16,23]]
    for bcol in after_pay_bases:
        for pcol in ['P_2','P_3']:
            if bcol in df.columns and pcol in df.columns:
                name = f'{bcol}-{pcol}'
                df[name] = df[bcol] - df[pcol]
                # If the function is re-run on the same frame the column already
                # exists and is already listed; do not register it twice.
                if name not in num_features:
                    num_features.append(name)
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing numerical aggregations")
    tic = time.time()
    df_num_agg = df.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
    df_num_agg.columns = ['_'.join(x) for x in df_num_agg.columns]
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing lag features")
    # Collect the new columns in a dict and attach them with a single concat:
    # inserting them one by one into an already very wide frame fragments it.
    lag_features = {}
    for col in num_features:
        last = df_num_agg[f"{col}_last"]
        lag_features[f"{col}_diff_wfirst"] = last - df_num_agg[f"{col}_first"]
        lag_features[f"{col}_diff_wmean"] = last - df_num_agg[f"{col}_mean"]

    # `first` was only needed for the lag features; drop exactly the columns
    # generated above rather than pattern-matching on names.
    first_cols = [f"{col}_first" for col in num_features]
    df_num_agg = pd.concat(
        [
            df_num_agg.drop(columns=first_cols),
            pd.DataFrame(lag_features, index=df_num_agg.index),
        ],
        axis=1,
    )
    del lag_features

    print("Computing diff features")
    with Parallel(n_jobs=-1) as parallel:
        results = parallel(
            delayed(compute_last_diff_series)(df, col)
            for col in tqdm(num_features)
        )
    df_diff = pd.concat(results, axis=1)
    del results

    print("Computing categorical aggregations")
    tic = time.time()
    df_cat_agg = df.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
    df_cat_agg.columns = ['_'.join(x) for x in df_cat_agg.columns]
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    # Slope features are currently disabled; kept for reference.
    #print("Computing slope features")
    #tic = time.time()
    #with Parallel(n_jobs=-1) as parallel:
    #        delayed_func = delayed(compute_slope_cols)
    #        results = parallel(
    #            delayed_func(_df, customer_ID, num_features) 
    #            for customer_ID,_df in tqdm(df.groupby("customer_ID"))
    #        )
    #slopes_df = pd.DataFrame(results).fillna(0).set_index("customer_ID")
    #slopes_df.columns = [f"{col}_slope" for col in slopes_df.columns]
    #tac = time.time()
    #print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Building some other features")
    df_count = df.groupby(["customer_ID"])["S_2"].count().rename("S_2_steps").to_frame()

    features = pd.concat([df_num_agg, df_diff, df_cat_agg, df_count], axis=1)
    del df_num_agg, df_diff, df_cat_agg, df_count
    gc.collect()

    print('shape after engineering', features.shape )

    return features

***
## preproc on train

In [4]:
# Load the train split of the integer-dtypes parquet dataset.
train = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet")

In [5]:
%%time
# Round the float32 columns, build the per-customer feature table, and write it to parquet.
train = remove_noise(train)
train_agg = build_features(train)
# reduce_mem_usage is imported from ../utils/memory (not shown here); presumably it downcasts dtypes — confirm.
train_agg = reduce_mem_usage(train_agg, verbose=True)
train_agg.to_parquet("../data/processed/dsv02/train.parquet")

# Drop the references to both large frames and trigger a garbage collection.
del train,train_agg
gc.collect()

# of float cols to reduce noise: 93


Computing 'after pay' features
Elapsed time: 0.002068003018697103 min

Computing numerical aggregations


Elapsed time: 1.1622345209121705 min

Computing lag features




Computing diff features


  0%|                                                                | 0/191 [00:00<?, ?it/s]

  8%|████▌                                                  | 16/191 [00:00<00:05, 30.84it/s]

  8%|████▌                                                  | 16/191 [00:10<00:05, 30.84it/s]

 17%|█████████▏                                             | 32/191 [00:13<01:18,  2.03it/s]

 17%|█████████▌                                             | 33/191 [00:14<01:21,  1.95it/s]

 17%|█████████▌                                             | 33/191 [00:30<01:21,  1.95it/s]

 25%|█████████████▊                                         | 48/191 [00:33<02:09,  1.11it/s]

 26%|██████████████                                         | 49/191 [00:34<02:08,  1.10it/s]

 34%|██████████████████▍                                    | 64/191 [00:54<02:21,  1.12s/it]

 42%|███████████████████████                                | 80/191 [01:13<02:08,  1.15s/it]

 50%|███████████████████████████▋                           | 96/191 [01:32<01:50,  1.17s/it]

 59%|███████████████████████████████▋                      | 112/191 [01:52<01:33,  1.18s/it]

 59%|███████████████████████████████▉                      | 113/191 [01:52<01:31,  1.17s/it]

 67%|████████████████████████████████████▏                 | 128/191 [02:11<01:15,  1.20s/it]

 75%|████████████████████████████████████████▋             | 144/191 [02:30<00:56,  1.20s/it]

 84%|█████████████████████████████████████████████▏        | 160/191 [02:48<00:36,  1.17s/it]

 84%|█████████████████████████████████████████████▌        | 161/191 [02:49<00:35,  1.17s/it]

 92%|█████████████████████████████████████████████████▊    | 176/191 [03:07<00:17,  1.19s/it]

100%|██████████████████████████████████████████████████████| 191/191 [03:07<00:00,  1.02it/s]




Computing categorical aggregations


Elapsed time: 0.10460864702860515 min

Building some other features


shape after engineering (458913, 1562)


Mem. usage decreased to 2266.61 Mb (30.6% reduction)


CPU times: user 4min 49s, sys: 1min 30s, total: 6min 20s
Wall time: 6min 20s


27

***
## preproc on test

In [6]:
# Load the test split of the integer-dtypes parquet dataset.
test = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet")

In [7]:
%%time
# Same pipeline as for train: round the float32 columns, build the per-customer feature table, write it to parquet.
test = remove_noise(test)
test_agg = build_features(test)
# reduce_mem_usage is imported from ../utils/memory (not shown here); presumably it downcasts dtypes — confirm.
test_agg = reduce_mem_usage(test_agg, verbose=True)
test_agg.to_parquet("../data/processed/dsv02/test.parquet")

# Drop the references to both large frames and trigger a garbage collection.
del test,test_agg
gc.collect()

# of float cols to reduce noise: 93


Computing 'after pay' features
Elapsed time: 0.0030966917673746746 min

Computing numerical aggregations


Elapsed time: 2.3928688923517862 min

Computing lag features




Computing diff features


  0%|                                                                | 0/191 [00:00<?, ?it/s]

 17%|█████████▏                                             | 32/191 [00:25<02:09,  1.23it/s]

 25%|█████████████▊                                         | 48/191 [01:14<04:08,  1.73s/it]

 34%|██████████████████▍                                    | 64/191 [01:56<04:21,  2.06s/it]

 34%|██████████████████▋                                    | 65/191 [01:57<04:17,  2.04s/it]

 35%|███████████████████                                    | 66/191 [01:57<04:05,  1.96s/it]

 42%|███████████████████████                                | 80/191 [02:34<04:13,  2.28s/it]

 50%|███████████████████████████▋                           | 96/191 [03:15<03:47,  2.40s/it]

 59%|███████████████████████████████▋                      | 112/191 [03:55<03:12,  2.43s/it]

 67%|████████████████████████████████████▏                 | 128/191 [04:35<02:34,  2.45s/it]

 68%|████████████████████████████████████▍                 | 129/191 [04:35<02:27,  2.38s/it]

 75%|████████████████████████████████████████▋             | 144/191 [05:13<01:55,  2.45s/it]

 76%|████████████████████████████████████████▉             | 145/191 [05:15<01:51,  2.42s/it]

 84%|█████████████████████████████████████████████▏        | 160/191 [05:53<01:16,  2.46s/it]

 84%|█████████████████████████████████████████████▌        | 161/191 [05:54<01:12,  2.43s/it]

 92%|█████████████████████████████████████████████████▊    | 176/191 [06:32<00:36,  2.46s/it]

 93%|██████████████████████████████████████████████████    | 177/191 [06:33<00:34,  2.43s/it]

100%|██████████████████████████████████████████████████████| 191/191 [06:33<00:00,  2.06s/it]




Computing categorical aggregations


Elapsed time: 0.237039848168691 min

Building some other features


shape after engineering (924621, 1562)


Mem. usage decreased to 4571.19 Mb (30.6% reduction)


CPU times: user 9min 53s, sys: 3min 3s, total: 12min 57s
Wall time: 12min 53s


27

***