In [None]:
import gc
import numba
import numpy as np
import pandas as pd
from scipy import stats
from tqdm import tqdm
from joblib import Parallel,delayed
import time
import re

#from pandarallel import pandarallel
#pandarallel.initialize(progress_bar=True, use_memory_fs=True)

import sys
# Extend the import path so the `memory` module imported below can be found
# (presumably it lives in ../utils — verify against the repository layout).
sys.path.append("../utils")
from memory import reduce_mem_usage

# None lifts pandas' display limits: show all columns and untruncated cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

This dataset aggregates the features over the time dimension.

- Base dataset: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
- Feature engineering adapted from: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
- Lag-features idea taken from: https://www.kaggle.com/code/thedevastator/lag-features-are-all-you-need/

In [None]:
@numba.njit()
def compute_slope(x, y):
    """Least-squares slope of ``y`` regressed on ``x``.

    Computed as ``sum(dx * dy) / sum(dx ** 2)`` where ``dx`` and ``dy`` are
    the deviations of ``x`` and ``y`` from their respective means.

    Parameters
    ----------
    x, y : numpy arrays of equal length.

    Returns
    -------
    The slope as a scalar. The denominator is zero when every ``x`` is equal
    (including length-1 input); callers must avoid that case.
    """
    dx = x - x.mean()
    dy = y - y.mean()
    covariance_sum = np.sum(dx * dy)
    variance_sum = np.sum(dx ** 2)
    return covariance_sum / variance_sum

def compute_slope_cols(df, customer_ID, num_features):
    """Compute the per-column trend (slope over row position) for one customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single customer, in the order the slope should be
        measured over (row position 0..n-1 is used as the x axis).
    customer_ID : hashable
        Identifier stored under the ``"customer_ID"`` key of the result.
    num_features : list of str
        Names of the numeric columns to compute a slope for.

    Returns
    -------
    dict
        ``{feature: slope, ..., "customer_ID": customer_ID}``. When ``df`` has
        two rows or fewer, every slope is reported as ``0``.
    """
    n = len(df)
    if n > 2:
        x = np.arange(n)
        # Fill gaps forward, then backward, so missing values do not turn the
        # slope into NaN. `fillna(method=...)` is deprecated in pandas, hence
        # the dedicated ffill()/bfill() methods.
        filled = df[num_features].ffill().bfill()
        slopes = filled.apply(lambda y: compute_slope(x, y.values)).to_dict()
    else:
        # Too few points to fit a trend: report a flat slope for every feature.
        slopes = {col: 0 for col in num_features}
    slopes["customer_ID"] = customer_ID
    return slopes

def mode_1st(x):
    """Return the most frequent value of the Series ``x``.

    ``value_counts`` orders values by descending frequency, so the first index
    label is the mode. Raises ``IndexError`` when ``x`` has no counted values
    (e.g. it is empty).
    """
    frequencies = x.value_counts()
    most_common = frequencies.index[0]
    return most_common

def mode_2nd(x):
    """Return the second most frequent value of the Series ``x``.

    Returns the sentinel ``-1`` when ``x`` has fewer than two distinct counted
    values. Only that "no second value" case maps to ``-1``; any other problem
    (for example ``x`` not being a Series) raises instead of being silently
    swallowed.
    """
    frequencies = x.value_counts()
    if len(frequencies) < 2:
        return -1
    return frequencies.index[1]

def compute_last_diff(array):
    """Return ``array[-1] - array[-2]``, or NaN when there are fewer than two items.

    Runs as plain Python/NumPy. The bare ``numba.njit()`` call that used to
    sit above this function had no ``@`` and therefore never compiled it.

    Parameters
    ----------
    array : sequence or numpy.ndarray
        Values in chronological order.

    Returns
    -------
    scalar
        The difference between the last two values. Integer inputs narrower
        than 64 bits are widened to int64 first, so the subtraction cannot wrap
        around (e.g. int8 ``100 - (-100)``). Other dtypes keep their native
        arithmetic.
    """
    if len(array) <= 1:
        return np.nan
    tail = np.asarray(array[-2:])
    if tail.dtype.kind in "iu" and tail.dtype.itemsize < 8:
        # Fixed-width integers overflow silently; widen before subtracting.
        tail = tail.astype(np.int64)
    return tail[1] - tail[0]
    
def compute_last_diff_series(df, col):
    """Per-customer difference between the last two values of ``col``.

    Groups ``df`` by ``"customer_ID"``, applies ``compute_last_diff`` to each
    group's values for ``col`` and returns the resulting Series, indexed by
    customer and named ``"<col>_diff"``.
    """
    def _last_diff_of_group(group):
        return compute_last_diff(group.values)

    per_customer = df.groupby("customer_ID")[col]
    diffs = per_customer.apply(_last_diff_of_group)
    diffs.name = f"{diffs.name}_diff"
    return diffs

In [None]:
# references: 
# https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
# after pay feats: https://www.kaggle.com/code/jiweiliu/rapids-cudf-feature-engineering-xgb
# other lag features: https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977

def remove_noise(df):
    """Round every float32 column of ``df`` to two decimals, in place.

    Only columns whose dtype is exactly float32 are touched. The number of
    affected columns is printed, and the same (mutated) frame is returned.
    """
    is_float32 = (df.dtypes == "float32").to_numpy()
    noisy_cols = df.columns[is_float32]
    print(f"# of float cols to reduce noise: {len(noisy_cols)}")

    for name in noisy_cols:
        rounded = df[name].round(decimals=2)
        df[name] = rounded

    return df

def build_features(df):
    """Aggregate per-statement rows into one feature row per ``customer_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with a ``customer_ID`` column, an ``S_2`` column, and
        the feature columns. Every column other than ``customer_ID``/``S_2``
        that is not in the hard-coded categorical list is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID`` with, per numeric feature (including the
        "after pay" differences ``<B|D|S col>-<P_2|P_3>``): ``_mean``,
        ``_std``, ``_min``, ``_max``, ``_last``, ``_diff_wfirst``
        (last - first), ``_diff_wmean`` (last - mean) and ``_diff``
        (last - previous); per categorical feature: ``_first``, ``_last``,
        ``_nunique``; plus ``S_2_steps``, the count of non-null ``S_2`` rows.

    Notes
    -----
    The "after pay" columns are written into ``df`` while the function runs
    (to avoid copying a large frame) and removed again before returning, so
    the caller's frame keeps its original columns and the function can be
    called repeatedly on the same frame.
    """
    all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in all_cols if col not in cat_features]

    # Temporary columns this call adds to `df`; dropped in the `finally` below.
    added_cols = []
    try:
        print("Computing 'after pay' features")
        tic = time.time()
        for bcol in [f'B_{i}' for i in [11,14,17]]+['D_39','D_131']+[f'S_{i}' for i in [16,23]]:
            for pcol in ['P_2','P_3']:
                if bcol in df.columns:
                    name = f'{bcol}-{pcol}'
                    is_new = name not in df.columns
                    df[name] = df[bcol] - df[pcol]
                    if is_new:
                        added_cols.append(name)
                    # A pre-existing column is already listed via `all_cols`.
                    if name not in num_features:
                        num_features.append(name)
        tac = time.time()
        print(f"Elapsed time: {(tac-tic)/60} min\n")

        print("Computing numerical aggregations")
        tic = time.time()
        df_num_agg = df.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
        df_num_agg.columns = ['_'.join(x) for x in df_num_agg.columns]
        tac = time.time()
        print(f"Elapsed time: {(tac-tic)/60} min\n")

        print("Computing lag features")
        # Collect the new columns and attach them in one concat; inserting
        # them one at a time fragments the frame.
        lag_features = {}
        for col in num_features:
            last = df_num_agg[f"{col}_last"]
            lag_features[f"{col}_diff_wfirst"] = last - df_num_agg[f"{col}_first"]
            lag_features[f"{col}_diff_wmean"] = last - df_num_agg[f"{col}_mean"]

        # `_first` was only needed to derive `_diff_wfirst`; it is not kept.
        first_cols = [f"{col}_first" for col in num_features]
        df_num_agg = pd.concat(
            [
                df_num_agg.drop(columns=first_cols),
                pd.DataFrame(lag_features, index=df_num_agg.index),
            ],
            axis=1,
        )
        del lag_features

        print("Computing diff features")
        with Parallel(n_jobs=-1) as parallel:
            # Hand each task only the two columns it reads rather than the
            # whole frame, so less data is shipped to the workers.
            results = parallel(
                delayed(compute_last_diff_series)(df[["customer_ID", col]], col)
                for col in tqdm(num_features)
            )
        df_diff = pd.concat(results, axis=1)

        print("Computing categorical aggregations")
        tic = time.time()
        df_cat_agg = df.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
        df_cat_agg.columns = ['_'.join(x) for x in df_cat_agg.columns]
        tac = time.time()
        print(f"Elapsed time: {(tac-tic)/60} min\n")

        # Slope features are currently disabled; kept for reference.
        #print("Computing slope features")
        #tic = time.time()
        #with Parallel(n_jobs=-1) as parallel:
        #        delayed_func = delayed(compute_slope_cols)
        #        results = parallel(
        #            delayed_func(_df, customer_ID, num_features)
        #            for customer_ID,_df in tqdm(df.groupby("customer_ID"))
        #        )
        #slopes_df = pd.DataFrame(results).fillna(0).set_index("customer_ID")
        #slopes_df.columns = [f"{col}_slope" for col in slopes_df.columns]
        #tac = time.time()
        #print(f"Elapsed time: {(tac-tic)/60} min\n")

        print("Building some other features")
        df_count = df.groupby(["customer_ID"])["S_2"].count()
        df_count = pd.DataFrame(df_count).rename({"S_2":"S_2_steps"}, axis=1)

        all_dfs = [df_num_agg, df_diff, df_cat_agg, df_count]
        features = pd.concat(all_dfs, axis=1)
        # The list holds references too, so it must go for the memory to be freed.
        del df_num_agg, df_diff, df_cat_agg, df_count, results, all_dfs
        gc.collect()

        print('shape after engineering', features.shape )
    finally:
        # Restore the caller's frame to the columns it arrived with.
        if added_cols:
            df.drop(columns=added_cols, inplace=True)

    return features

***
## Preprocessing on train

In [None]:
# Load the raw training data from the integer-dtype parquet export.
# NOTE(review): presumably one row per customer statement — confirm against the dataset.
train = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet")

In [None]:
%%time
# remove_noise rounds the float32 columns of `train` in place and returns the same frame.
train = remove_noise(train)
# Collapse the rows of each customer_ID into a single row of aggregated features.
train_agg = build_features(train)
# reduce_mem_usage comes from ../utils/memory; presumably it downcasts dtypes — verify there.
train_agg = reduce_mem_usage(train_agg, verbose=True)
# NOTE(review): the output directory must already exist; nothing here creates it.
train_agg.to_parquet("../data/processed/dsv02/train.parquet")

# Release both frames before the test data is loaded in the next section.
del train,train_agg
gc.collect()

***
## Preprocessing on test

In [None]:
# Load the raw test data from the integer-dtype parquet export.
# NOTE(review): presumably one row per customer statement — confirm against the dataset.
test = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet")

In [None]:
%%time
# Same pipeline as for train: round float32 columns in place, then aggregate per customer_ID.
test = remove_noise(test)
test_agg = build_features(test)
# reduce_mem_usage comes from ../utils/memory; presumably it downcasts dtypes — verify there.
test_agg = reduce_mem_usage(test_agg, verbose=True)
# NOTE(review): the output directory must already exist; nothing here creates it.
test_agg.to_parquet("../data/processed/dsv02/test.parquet")

# Release both frames once the features have been written to disk.
del test,test_agg
gc.collect()

***