In [None]:
# Cell 1 – imports and config
import gc
import pandas as pd
import numpy as np
import os

# --- Target week -------------------------------------------------------------
# Edit the date literal below to pick the week to build candidates for.
# The value is written to the process environment and read straight back.
os.environ['HM_TARGET_WEEK_END'] = '2020-08-19'
printdate = os.environ.get("HM_TARGET_WEEK_END")  # raw YYYY-MM-DD string, reused in the output file name
assert printdate is not None, "Set HM_TARGET_WEEK_END=YYYY-MM-DD"
TARGET_WEEK_END = pd.to_datetime(printdate)       # same date as a pandas Timestamp

# --- Candidate window --------------------------------------------------------
# K_RECENT:     number of most recent purchases kept per user.
# HISTORY_DAYS: look-back length in days before the cut-off; None means all
#               history up to the cut-off is used.
HISTORY_DAYS = None
K_RECENT = 40


In [6]:
# Cell 2 – load transactions
# Read only the three columns this notebook needs; dates and customer ids come
# in as strings and are converted below.
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# Customer key: parse the last 16 hex characters of the id as an integer,
# wrap it in np.uint64 and then convert that to np.int64.
tx['customer_id'] = tx['customer_id'].str[-16:].apply(lambda h: np.int64(np.uint64(int(h, 16))))
tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# The history cut-off sits seven days before the target week end.
last_ts = TARGET_WEEK_END
cut_ts = last_ts - pd.Timedelta(days=7)

# Keep rows dated on or before the cut-off; when HISTORY_DAYS is set, also
# require them to be strictly after the start of the look-back window.
keep_rows = tx['t_dat'] <= cut_ts
if HISTORY_DAYS is not None:
    history_start = cut_ts - pd.Timedelta(days=HISTORY_DAYS)
    keep_rows = (tx['t_dat'] > history_start) & keep_rows
tx = tx[keep_rows]
del keep_rows  # do not leave a full-length mask alive in the kernel


In [7]:
# Cell 3 – last K per user
# Order every customer's transactions newest-first and number them 1, 2, 3, ...
# so that rnk == 1 is the customer's most recent row.
tx = tx.sort_values(['customer_id', 't_dat'], ascending=[True, False])
tx['rnk'] = tx.groupby('customer_id').cumcount() + 1

# Keep the K_RECENT newest rows per customer (rnk is kept to derive the value).
# .loc + .copy() gives `recent` its own data, so the column assignments below
# do not go through a chained-indexing result (which can emit
# SettingWithCopyWarning).
recent = tx.loc[tx['rnk'] <= K_RECENT, ['customer_id', 'article_id', 'rnk']].copy()

# Simple value: more recent => higher value (rnk 1 -> K_RECENT, rnk K_RECENT -> 1).
recent['value'] = (K_RECENT + 1 - recent['rnk']).astype('float32')
recent['window_type'] = 'repurchase'

# One candidate row per (customer, article).
# Deduplicating on all columns would never drop anything here: `value` is
# derived from the per-customer unique rank, so no two rows of a customer are
# fully identical. Deduplicate on the key instead; rows are ordered newest-first
# within each customer, so keep='first' retains the highest value for the pair.
cand = (
    recent[['customer_id', 'article_id', 'value', 'window_type']]
    .drop_duplicates(subset=['customer_id', 'article_id'], keep='first')
)

In [None]:
# Cell 4 – add hex id and save
# Load the customer ids from the sample submission and derive the same integer
# key as used for the transactions (last 16 hex characters -> uint64 -> int64).
sub_customers = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)
sub_customers['customer_id_int'] = sub_customers['customer_id'].str[-16:].apply(lambda h: np.int64(np.uint64(int(h, 16))))

# Rename the original string id up front so the merge has no column-name clash
# (no reliance on pandas' automatic _x/_y suffixes followed by a rename).
sub_customers = sub_customers.rename(columns={'customer_id': 'customer_id_hex'})

# Left join: every candidate row is kept; customer_id_hex is missing for any
# integer key that does not occur in the sample submission.
cand = cand.merge(
    sub_customers[['customer_id_hex', 'customer_id_int']],
    left_on='customer_id',
    right_on='customer_id_int',
    how='left',
)

cand = cand.drop(columns=['customer_id_int'])
cand['article_id'] = cand['article_id'].astype('int32')
cand['value'] = cand['value'].astype('float32')
# Guarantee one row per (customer, article); the first occurrence is kept.
cand = cand.drop_duplicates(['customer_id', 'article_id'])

# Create the output directory if needed so the write cannot fail on a missing
# folder after all the work above.
out_path = f'../data/outputs/candidates/candidates_repurchase_{printdate}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
cand.to_parquet(out_path, index=False)

# Release the large frames; kernel memory otherwise persists across cells.
del tx, recent, cand, sub_customers
gc.collect()

22