In [1]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm

# Lift pandas' display truncation limits (None = no limit) for both the
# per-cell text width and the number of columns shown.
for option_name in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(option_name, None)

***

In [2]:
%%time
# Read the train and test tables from the external parquet files.
train_path = "../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet"
test_path = "../data/ext/amex-data-integer-dtypes-parquet-format/test.parquet"

train = pd.read_parquet(train_path)
test = pd.read_parquet(test_path)

CPU times: user 30.3 s, sys: 29.9 s, total: 1min
Wall time: 41.6 s


In [3]:
# Convert the S_2 column to datetimes in both tables (in place).
for frame in (train, test):
    frame["S_2"] = pd.to_datetime(frame["S_2"])

# Do not leave an extra reference to `test` behind in the namespace.
del frame

***
## identify columns that are constant over time within every customer

In [None]:
# Collect columns whose value never changes within any single customer's rows.
# NOTE: nunique() ignores NaN by default, so a column that is only ever
# "one value or missing" for each customer also counts as constant here.
constant_cols = []

# Pick the candidate columns by name instead of by position, so the check does
# not depend on column order or on whether the derived "timestep" column has
# already been added to `train`.
# NOTE(review): assumes "customer_ID" and "S_2" are the two leading columns the
# original positional slice `train.columns[2:]` was skipping — confirm.
non_feature_cols = {"customer_ID", "S_2", "timestep"}
feature_cols = [name for name in train.columns if name not in non_feature_cols]

# Build the grouping once; it is loop-invariant and was previously recreated
# for every column.
by_customer = train.groupby("customer_ID")

for col in tqdm(feature_cols):
    if by_customer[col].nunique().max() == 1:
        print(col)
        constant_cols.append(col)

# Drop the groupby handle so it cannot keep `train` alive after `del train`.
del by_customer

In [None]:
# Display the columns collected by the constant-over-time check.
# Per the author's note, no such columns were found (the list was empty).
constant_cols

***
## convert to wide format (one row per customer, one column per feature and timestep)

In [4]:
# create timestep variable
#
# Within each customer, rows are ranked by S_2 from newest (rank 1) to oldest,
# then the rank is flipped so the newest row always gets LAST_TIMESTEP and
# older rows count down from there.
# NOTE(review): assumes at most LAST_TIMESTEP rows per customer, otherwise
# timesteps reach 0 or go negative — confirm against the data.
#
# method="first" gives rows with the same S_2 distinct integer ranks. The
# default method="average" yields fractional ranks on ties (e.g. 1.5, 1.5),
# which .astype(int) truncates into duplicate timesteps; the later pivot on
# (customer_ID, timestep) then fails on duplicate entries. Results are
# unchanged when there are no ties.
LAST_TIMESTEP = 13

train["timestep"] = (LAST_TIMESTEP + 1) - (
    train.groupby("customer_ID")["S_2"]
    .rank(method="first", ascending=False)
    .astype(int)
)
test["timestep"] = (LAST_TIMESTEP + 1) - (
    test.groupby("customer_ID")["S_2"]
    .rank(method="first", ascending=False)
    .astype(int)
)

***
## transform on train

In [5]:
# Reshape train from long to wide, one source column at a time: every column
# is spread into a block of "<column>-<timestep>" columns indexed by
# customer_ID, and the blocks are collected for a later concat.
all_dfs = []

# Skip the first two columns and the last one (the "timestep" column is
# expected to be last, having been appended after loading).
feature_names = train.columns[2:-1]

for feature in tqdm(feature_names):
    wide = train.pivot(index="customer_ID", columns="timestep", values=feature)
    wide.columns = [f"{feature}-{step}" for step in wide.columns]
    all_dfs.append(wide)

100% 188/188 [06:52<00:00,  2.19s/it]


In [6]:
# Join the per-column blocks side by side and write the wide train table.
train_pivot = pd.concat(all_dfs, axis="columns")
train_pivot.to_parquet("../data/processed/dsv03/train.parquet")

In [7]:
# Drop the train-side objects and run a garbage-collection pass before the
# test table is processed.
del train
del all_dfs
del train_pivot
gc.collect()

23

***
## transform on test

In [8]:
# Reshape test from long to wide, one source column at a time: every column
# is spread into a block of "<column>-<timestep>" columns indexed by
# customer_ID, and the blocks are collected for a later concat.
all_dfs = []

# Skip the first two columns and the last one (the "timestep" column is
# expected to be last, having been appended after loading).
feature_names = test.columns[2:-1]

for feature in tqdm(feature_names):
    wide = test.pivot(index="customer_ID", columns="timestep", values=feature)
    wide.columns = [f"{feature}-{step}" for step in wide.columns]
    all_dfs.append(wide)

100% 188/188 [15:20<00:00,  4.89s/it]


In [9]:
# Join the per-column blocks side by side and write the wide test table.
test_pivot = pd.concat(all_dfs, axis="columns")
test_pivot.to_parquet("../data/processed/dsv03/test.parquet")

In [10]:
# Drop the test-side objects and run a garbage-collection pass.
del test
del all_dfs
del test_pivot
gc.collect()

23

***