In [16]:
# Cell 1 – imports and config
import gc
import numpy as np
import pandas as pd

# Maximum number of candidate articles kept for each customer.
N_PER_USER: int = 300
# Length, in days, of the transaction history window used for counting.
HISTORY_DAYS: int = 270

In [17]:
# Cell 2 – load transactions
# Read only the three columns this pipeline uses; ids and dates stay strings
# until they are converted explicitly below.
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)
tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# The most recent 7 days are left out of the history; the window is the
# HISTORY_DAYS that end at that cutoff: (history_start, cut_ts].
last_ts = tx['t_dat'].max()
cut_ts = last_ts - pd.Timedelta(days=7)
history_start = cut_ts - pd.Timedelta(days=HISTORY_DAYS)

# Filter BEFORE the id conversion: the hex -> int mapping below is a per-row
# Python call, so running it only on rows inside the window avoids converting
# transactions that would be thrown away anyway.
tx = tx[(tx['t_dat'] > history_start) & (tx['t_dat'] <= cut_ts)].copy()

# map customer_id to int like other pipelines: parse the last 16 hex characters
# as an unsigned 64-bit value and reinterpret it as a signed int64.
# The explicit astype keeps the column int64 even if the window is empty.
tx['customer_id'] = (
    tx['customer_id']
    .str[-16:]
    .apply(lambda h: np.int64(np.uint64(int(h, 16))))
    .astype('int64')
)

In [18]:
# Cell 3 – per-user top-N by frequency
# Count purchases per (customer, article) pair, then order each customer's
# articles from most to least purchased.
user_item_counts = (
    tx
    .groupby(['customer_id', 'article_id'])['t_dat']
    .size()
    .reset_index(name='cnt')
    .sort_values(['customer_id', 'cnt'], ascending=[True, False])
)

# method='first' breaks ties by row order, so every customer gets distinct
# ranks 1..k and the cut below keeps at most N_PER_USER rows per customer.
user_item_counts['rank'] = user_item_counts.groupby('customer_id')['cnt'].rank(
    method='first', ascending=False
)

# Explicit .copy(): the boolean-mask selection is a slice of the original frame,
# and adding a column to it directly triggers pandas' SettingWithCopyWarning.
user_item_counts = user_item_counts[user_item_counts['rank'] <= N_PER_USER].copy()

# Define value = count (could be log1p(cnt) etc.)
user_item_counts['value'] = user_item_counts['cnt'].astype('float32')

cand = user_item_counts[['customer_id', 'article_id', 'value']].copy()
cand['window_type'] = 'recent_top'

In [19]:
# Cell 4 – add customer_id_hex like other pipelines
# Hex customer ids are taken from the sample submission file.
sub_customers = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)
# Integer key: last 16 hex characters parsed base-16, reinterpreted as int64.
sub_customers['customer_id_int'] = (
    sub_customers['customer_id']
    .str[-16:]
    .apply(lambda hex_tail: np.int64(np.uint64(int(hex_tail, 16))))
)

# Both frames carry a 'customer_id' column; the suffixes leave the left (integer)
# one untouched and expose the right (hex string) one as 'customer_id_hex'.
cand = (
    cand
    .merge(
        sub_customers[['customer_id', 'customer_id_int']],
        how='left',
        left_on='customer_id',
        right_on='customer_id_int',
        suffixes=('', '_hex'),
    )
    .drop(columns=['customer_id_int'])
    .astype({'article_id': 'int32', 'value': 'float32'})
    .drop_duplicates(['customer_id', 'article_id'])
)

cand.to_parquet('../data/outputs/candidates_recent_top.parquet', index=False)

# Release the large frames; kernel memory otherwise persists across cells.
del tx, user_item_counts, cand, sub_customers
gc.collect()

22