In [1]:
import gc
import numba
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import stats
from tqdm import tqdm
from joblib import Parallel,delayed
import time
import re

import category_encoders as ce

#from pandarallel import pandarallel
#pandarallel.initialize(progress_bar=True, use_memory_fs=True)

import sys
sys.path.append("../utils")
from memory import reduce_mem_usage

# Display settings: never elide columns and never truncate cell contents
# when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Directory that receives the aggregated train/test parquet files.
OUTPUT_PATH = Path("../data/processed/dsv04")

# mkdir(exist_ok=True) is already a no-op for an existing directory, so no
# separate exists() check is needed; it also fails loudly if the path exists
# but is not a directory instead of deferring the error to the first write.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

This dataset aggregates the features over the time dimension.

- takes as base this dataset: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
- feat engineering from here: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
- lag features idea from here: https://www.kaggle.com/code/thedevastator/lag-features-are-all-you-need/

In [3]:
@numba.njit()
def compute_slope(x, y):
    """Return the least-squares slope of ``y`` regressed on ``x``.

    Computed as sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2).
    Both arguments are expected to be NumPy arrays of equal length.
    """
    # Centre both arrays once and reuse the centred x in the denominator.
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    covariance = np.sum(x_centered * y_centered)
    x_variance = np.sum(x_centered ** 2)
    return covariance / x_variance

def compute_slope_cols(df, customer_ID, num_features):
    """Compute the per-column time slope for one customer's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single customer, in the order they should be regressed
        against ``0, 1, ..., len(df) - 1``.
    customer_ID : hashable
        Identifier stored under the ``"customer_ID"`` key of the result.
    num_features : list of str
        Columns of ``df`` for which a slope is computed.

    Returns
    -------
    dict
        ``{column: slope, ..., "customer_ID": customer_ID}``. Customers with
        two rows or fewer get a slope of 0 for every column.
    """
    n = len(df)
    if n > 2:
        x = np.arange(n)
        # Fill gaps forward, then backward for leading NaNs.
        # ``fillna(method=...)`` is deprecated in recent pandas releases;
        # ``ffill()`` / ``bfill()`` are the equivalent supported calls.
        filled = df[num_features].ffill().bfill()
        r = filled.apply(lambda y: compute_slope(x, y.values)).to_dict()
    else:
        # Too few observations: report a flat trend for every column.
        r = dict.fromkeys(num_features, 0)
    r["customer_ID"] = customer_ID
    return r

def mode_1st(x):
    """Return the most frequent value of the Series ``x``."""
    # value_counts() sorts by descending frequency, so the first label wins.
    frequencies = x.value_counts()
    return frequencies.index[0]

def mode_2nd(x):
    """Return the second most frequent value of the Series ``x``.

    Returns -1 when ``x`` has fewer than two distinct (non-null) values.
    """
    try:
        return x.value_counts().index[1]
    except IndexError:
        # Only the "no second value" case is mapped to the sentinel; any other
        # error (wrong input type, interrupts, ...) is allowed to propagate.
        return -1

def compute_last_diff(array):
    """Return the difference between the last two elements of ``array``.

    Returns NaN when ``array`` has fewer than two elements.

    Notes
    -----
    This is a plain Python function. A bare ``numba.njit()`` statement
    (without ``@``) in front of the definition has no effect, and jitting a
    two-element subtraction would not pay for the dispatch overhead anyway.
    """
    if len(array) <= 1:
        return np.nan
    last, previous = array[-1], array[-2]
    # NumPy integer scalars keep their narrow dtype (e.g. int8) and wrap
    # around on overflow; convert to Python ints so the difference is exact.
    if isinstance(last, np.integer):
        last, previous = int(last), int(previous)
    return last - previous
    
def compute_last_diff_series(df, col):
    """Return, per ``customer_ID``, the last-minus-previous value of ``col``.

    The result is a Series indexed by ``customer_ID`` and named
    ``"<original name>_diff"``.
    """
    per_customer = df.groupby("customer_ID")[col]
    last_diffs = per_customer.apply(
        lambda group: compute_last_diff(group.values)
    )
    last_diffs.name = f"{last_diffs.name}_diff"
    return last_diffs

In [4]:
# references: 
# https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793
# after pay feats: https://www.kaggle.com/code/jiweiliu/rapids-cudf-feature-engineering-xgb
# other lag features: https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977

def remove_noise(df):
    """Round every float32 column of ``df`` to two decimals, in place.

    The same (mutated) DataFrame is returned for convenience.
    """
    float_cols = [name for name, dtype in df.dtypes.items() if dtype == "float32"]
    print(f"# of float cols to reduce noise: {len(float_cols)}")

    for name in float_cols:
        rounded = df[name].round(decimals=2)
        df[name] = rounded

    return df

def _widen_integers(series):
    """Return ``series`` upcast to int64 when it has a NumPy integer dtype.

    Narrow integer dtypes wrap around silently on subtraction; any other
    dtype is returned unchanged.
    """
    dtype = series.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iu":
        return series.astype(np.int64)
    return series


def build_features(df, ohe_cols):
    """Aggregate statement-level rows into one feature row per ``customer_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a ``customer_ID`` column, an ``S_2`` column and
        the feature columns. NOTE: the 'after pay' difference columns are
        added to this frame in place.
    ohe_cols : list of str
        Columns aggregated as categoricals (group mean and
        ``compute_last_observed``). Every other column except
        ``customer_ID`` / ``S_2`` is treated as numerical.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID``; columns are, in order: numerical
        aggregations (mean/std/min/max/last), lag features
        (``_diff_wfirst`` / ``_diff_wmean``), last-diff features (``_diff``),
        categorical aggregations (``_mean`` / ``_lo``) and ``S_2_steps``.
    """
    ohe_set = set(ohe_cols)
    all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
    num_features = [col for col in all_cols if col not in ohe_set]

    print("Computing 'after pay' features")
    tic = time.time()
    for bcol in [f'B_{i}' for i in [11,14,17]]+['D_39','D_131']+[f'S_{i}' for i in [16,23]]:
        for pcol in ['P_2','P_3']:
            if bcol in df.columns:
                df[f'{bcol}-{pcol}'] = df[bcol] - df[pcol]
                num_features.append(f'{bcol}-{pcol}')
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing numerical aggregations")
    tic = time.time()
    df_num_agg = df.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
    df_num_agg.columns = ['_'.join(x) for x in df_num_agg.columns]
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Computing lag features")
    # Collect the new columns in a dict and build one frame at the end:
    # inserting hundreds of columns one by one fragments the DataFrame.
    lag_features = {}
    for col in num_features:
        # 'first'/'last' may still carry a narrow integer dtype; widen before
        # subtracting so the difference cannot wrap around.
        last = _widen_integers(df_num_agg[f"{col}_last"])
        first = _widen_integers(df_num_agg[f"{col}_first"])
        lag_features[f"{col}_diff_wfirst"] = last - first
        lag_features[f"{col}_diff_wmean"] = last - df_num_agg[f"{col}_mean"]
    df_lag = pd.DataFrame(lag_features, index=df_num_agg.index)
    del lag_features

    # The '_first' aggregates were only needed for the lag features; drop them
    # by explicit name rather than by pattern matching on the column labels.
    first_cols = [f"{col}_first" for col in num_features]
    df_num_agg.drop(columns=first_cols, inplace=True)

    print("Computing diff features")
    with Parallel(n_jobs=-1) as parallel:
        delayed_func = delayed(compute_last_diff_series)
        results = parallel(
            delayed_func(df, col)
            for col in tqdm(num_features)
        )
    df_diff = pd.concat(results, axis=1)

    print("Computing categorical aggregations")
    tic = time.time()
    df_cat_agg1 = (
        df
        .groupby("customer_ID")
        [ohe_cols]
        .mean()
    )
    df_cat_agg1.columns = [col+"_"+"mean" for col in df_cat_agg1.columns]
    # compute_last_observed is defined in a later cell; it is resolved when
    # this function is called, not when it is defined.
    df_cat_agg2 = (
        df
        .groupby("customer_ID")
        [ohe_cols]
        .agg(compute_last_observed)
    )
    df_cat_agg2.columns = [col+"_"+"lo" for col in df_cat_agg2.columns]

    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    # Slope features (compute_slope_cols) are not part of this dataset version.

    print("Building S_2 related features")
    tic = time.time()
    # Number of non-null S_2 entries per customer.
    df_count = df.groupby(["customer_ID"])["S_2"].count()
    df_count = pd.DataFrame(df_count).rename({"S_2":"S_2_steps"}, axis=1)
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    print("Concatenating all the results")
    tic = time.time()
    all_dfs = [df_num_agg, df_lag, df_diff, df_cat_agg1, df_cat_agg2, df_count]
    features = pd.concat(all_dfs, axis=1)
    tac = time.time()
    print(f"Elapsed time: {(tac-tic)/60} min\n")

    # Release the intermediate frames before returning the combined result.
    del all_dfs, df_num_agg, df_lag, df_diff, df_cat_agg1, df_cat_agg2, df_count
    gc.collect()

    print('shape after engineering', features.shape )

    return features

In [5]:
def encode_categoricals(dataframe, encoder=None):
    """One-hot encode the fixed list of categorical columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the categorical columns listed below. NOTE: those
        columns are dropped from the passed frame in place.
    encoder : category_encoders OneHotEncoder, optional
        Already-fitted encoder to reuse. When ``None`` a new encoder is fitted
        on ``dataframe``.

    Returns
    -------
    tuple
        ``(dataframe, encoder, ohe_cols)``: the frame with the categorical
        columns replaced by int8 one-hot columns, the fitted encoder, and the
        list of one-hot column names.
    """
    categoricals = [
        'B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
        'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    ]

    if encoder is None:
        print("fitting the encoder")
        encoder = ce.one_hot.OneHotEncoder(cols=categoricals)
        encoder.fit(dataframe[categoricals])

    out = encoder.transform(dataframe[categoricals]).astype(np.int8)
    # Only the categorical columns are passed to the encoder, so the columns
    # of the transformed frame are exactly the one-hot feature names. Reading
    # them here avoids encoder.get_feature_names(), which newer
    # category_encoders releases deprecate in favour of get_feature_names_out().
    ohe_cols = list(out.columns)

    # Dropped in place to avoid holding a second copy of the large input frame.
    dataframe.drop(categoricals, axis=1, inplace=True)
    dataframe = pd.concat([dataframe, out], axis=1)

    gc.collect()

    return dataframe, encoder, ohe_cols

#@numba.njit()
def compute_last_observed(series):
    """Return how many positions from the end the last non-zero value sits.

    0 means the final element is non-zero. When ``series`` holds no non-zero
    value at all, the sentinel 100 is returned.
    """
    newest_first = series.values[::-1]
    nonzero_positions = np.nonzero(newest_first)[0]
    return nonzero_positions[0] if len(nonzero_positions) else 100

***
## preproc on train

In [6]:
#train["S_2"] = pd.to_datetime(train.S_2)
#train["year"] = train.S_2.dt.year
#train["month"] = train.S_2.dt.month

#train["year_month"] = train.year.astype(str) + "-" + train.month.astype(str)

#cid = train.sample()["customer_ID"].values[0]
#df = train.query("customer_ID == @cid")
#df

In [7]:
#def diff_month(d1, d2):
#    return (d1.year - d2.year) * 12 + d1.month - d2.month

In [8]:
#def compute_antiquity(df):
#    return (df.S_2.dt.year.values[-1] - df.S_2.dt.year.values[0])*12 + (df.S_2.dt.month.values[-1] - df.S_2.dt.month.values[0])

In [9]:
#%%time
#r1 = train.groupby("customer_ID").apply(compute_antiquity)

In [10]:
#%%time
#r2 = train.groupby("customer_ID")["S_2"].apply(lambda x: x.diff().max().days)

In [11]:
#r1.describe()

In [12]:
#r2.describe()

***

In [13]:
# Load the statement-level training data from the integer-dtype parquet export.
train = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet")
# Print row count, column dtypes and memory footprint of the raw frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: float32(93), int16(9), int8(86), object(2)
memory usage: 2.5+ GB


In [14]:
%%time
# Round float32 columns to two decimals (the frame is modified in place).
train = remove_noise(train)
# Fit the one-hot encoder on train; `encoder` and `ohe_cols` are kept for the test set.
train, encoder, ohe_cols = encode_categoricals(train)
# Collapse the per-statement rows into one feature row per customer_ID.
train_agg = build_features(train, ohe_cols)
# Shrink the result with the project's memory helper before writing it out.
train_agg = reduce_mem_usage(train_agg, verbose=True)
train_agg.to_parquet(str(OUTPUT_PATH/"train.parquet"))

# Free the large frames before the test set is loaded.
del train,train_agg
gc.collect()

# of float cols to reduce noise: 93


fitting the encoder


Computing 'after pay' features
Elapsed time: 0.0021482110023498535 min

Computing numerical aggregations


Elapsed time: 1.1426782290140787 min

Computing lag features




Computing diff features


  0%|                                                                | 0/191 [00:00<?, ?it/s]

  8%|████▌                                                  | 16/191 [00:00<00:07, 22.99it/s]

  8%|████▌                                                  | 16/191 [00:10<00:07, 22.99it/s]

 17%|█████████▏                                             | 32/191 [00:13<01:16,  2.07it/s]

 25%|█████████████▊                                         | 48/191 [00:31<01:53,  1.26it/s]

 26%|██████████████                                         | 49/191 [00:32<01:52,  1.26it/s]

 34%|██████████████████▍                                    | 64/191 [00:50<02:04,  1.02it/s]

 34%|██████████████████▋                                    | 65/191 [00:51<02:01,  1.03it/s]

 42%|███████████████████████                                | 80/191 [01:09<01:57,  1.06s/it]

 42%|███████████████████████▎                               | 81/191 [01:09<01:55,  1.05s/it]

 50%|███████████████████████████▋                           | 96/191 [01:27<01:45,  1.11s/it]

 59%|███████████████████████████████▋                      | 112/191 [01:45<01:29,  1.13s/it]

 67%|████████████████████████████████████▏                 | 128/191 [02:04<01:12,  1.15s/it]

 75%|████████████████████████████████████████▋             | 144/191 [02:22<00:53,  1.13s/it]

 84%|█████████████████████████████████████████████▏        | 160/191 [02:40<00:35,  1.14s/it]

 92%|█████████████████████████████████████████████████▊    | 176/191 [02:59<00:17,  1.15s/it]

100%|██████████████████████████████████████████████████████| 191/191 [02:59<00:00,  1.06it/s]




Computing categorical aggregations


Elapsed time: 2.499083971977234 min

Building S_2 related features


Elapsed time: 0.018985867500305176 min

Concatenating all the results


Elapsed time: 0.05188761949539185 min

shape after engineering (458913, 1639)


Mem. usage decreased to 2372.52 Mb (34.2% reduction)


CPU times: user 7min 29s, sys: 1min 35s, total: 9min 5s
Wall time: 9min 5s


27

***
## preproc on test

In [15]:
# Load the statement-level test data from the integer-dtype parquet export.
test = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet")
# Print row count, column dtypes and memory footprint of the raw frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float32(93), int16(10), int8(85), object(2)
memory usage: 5.2+ GB


In [16]:
%%time
# Round float32 columns to two decimals (the frame is modified in place).
test = remove_noise(test)
# Reuse the encoder fitted on train so both sets get the same one-hot columns;
# `ohe_cols` from the train cell is reused below.
test, _, _ = encode_categoricals(test, encoder)
# Collapse the per-statement rows into one feature row per customer_ID.
test_agg = build_features(test, ohe_cols)
# Shrink the result with the project's memory helper before writing it out.
test_agg = reduce_mem_usage(test_agg, verbose=True)
test_agg.to_parquet(str(OUTPUT_PATH/"test.parquet"))

# Free the large frames once the parquet file has been written.
del test,test_agg
gc.collect()

# of float cols to reduce noise: 93


Computing 'after pay' features
Elapsed time: 0.003075726826985677 min

Computing numerical aggregations


Elapsed time: 2.0954121311505634 min

Computing lag features




Computing diff features


  0%|                                                                | 0/191 [00:00<?, ?it/s]

  8%|████▌                                                  | 16/191 [00:01<00:15, 11.10it/s]

  8%|████▌                                                  | 16/191 [00:17<00:15, 11.10it/s]

 17%|█████████▏                                             | 32/191 [00:27<02:36,  1.01it/s]

 17%|█████████▌                                             | 33/191 [00:29<02:42,  1.03s/it]

 25%|█████████████▊                                         | 48/191 [01:13<04:48,  2.02s/it]

 34%|██████████████████▍                                    | 64/191 [01:50<04:32,  2.15s/it]

 42%|███████████████████████                                | 80/191 [02:27<04:07,  2.23s/it]

 42%|███████████████████████▎                               | 81/191 [02:29<04:02,  2.20s/it]

 50%|███████████████████████████▋                           | 96/191 [03:05<03:36,  2.28s/it]

 59%|███████████████████████████████▋                      | 112/191 [03:41<02:59,  2.27s/it]

 59%|███████████████████████████████▉                      | 113/191 [03:42<02:55,  2.25s/it]

 67%|████████████████████████████████████▏                 | 128/191 [04:20<02:27,  2.35s/it]

 75%|████████████████████████████████████████▋             | 144/191 [04:58<01:50,  2.36s/it]

 84%|█████████████████████████████████████████████▏        | 160/191 [05:36<01:13,  2.36s/it]

 84%|█████████████████████████████████████████████▌        | 161/191 [05:36<01:08,  2.29s/it]

 92%|█████████████████████████████████████████████████▊    | 176/191 [06:12<00:35,  2.34s/it]

 93%|██████████████████████████████████████████████████    | 177/191 [06:13<00:32,  2.31s/it]

100%|██████████████████████████████████████████████████████| 191/191 [06:13<00:00,  1.96s/it]




Computing categorical aggregations


Elapsed time: 5.0850771029790245 min

Building S_2 related features


Elapsed time: 0.03945449193318685 min

Concatenating all the results


Elapsed time: 0.10026818116505941 min

shape after engineering (924621, 1639)


Mem. usage decreased to 4784.58 Mb (34.2% reduction)


CPU times: user 14min 33s, sys: 3min 10s, total: 17min 43s
Wall time: 17min 30s


27

***