In [None]:
# Cell 1
import cudf
import pandas as pd
import gc

import os
# Target-week configuration.
# The environment variable is (re)assigned on every run of this cell, so the
# literal below is the single place to change the week being processed.
os.environ['HM_TARGET_WEEK_END'] = '2020-08-19' ## SET THIS DATE TO CORRECT WEEK

# Raw YYYY-MM-DD string; kept separately because it is embedded in the output file name.
printdate = os.environ.get("HM_TARGET_WEEK_END")
assert printdate is not None, "Set HM_TARGET_WEEK_END=YYYY-MM-DD"

# Parsed timestamp used for all date arithmetic in the following cells.
TARGET_WEEK_END = pd.to_datetime(printdate)

In [None]:
# Cell 2 - Load transactions
# Read only the three columns this notebook needs; dates come in as strings
# and are converted to datetimes right after loading.
df = cudf.read_csv(
    '../data/input_data/transactions_train.csv',
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
    usecols=['t_dat', 'customer_id', 'article_id'],
)
df['t_dat'] = cudf.to_datetime(df['t_dat'])

# History ends 7 days before TARGET_WEEK_END (cut_ts is included in the history).
last_ts = TARGET_WEEK_END
cut_ts = last_ts - pd.Timedelta(days=7)

# Popularity window: the 12 days ending at cut_ts, i.e. (pop_start, cut_ts].
# NOTE(review): the previous comment said "21 days" while the code uses 12 --
# confirm which window length is intended.
pop_start = cut_ts - pd.Timedelta(days=12)
df = df[(df['t_dat'] <= cut_ts) & (df['t_dat'] > pop_start)]

In [None]:
# Cell 3 - Calculate time-decayed popularity
# Age of each purchase in whole days relative to the cutoff. A purchase with
# diff_days == 0 contributes 1.0, one day older contributes 0.5, and so on.
df['diff_days'] = (cut_ts - df['t_dat']).dt.days
df['pop_score'] = 1.0 / (1 + df['diff_days'])

# Sum the decayed weights per article ...
pop_items = df.groupby('article_id')['pop_score'].sum().reset_index()

# ... and keep the 24 highest-scoring articles.
pop_items = pop_items.sort_values('pop_score', ascending=False).head(24)

# Final layout: article_id, value (float32 score), window_type (constant tag).
pop_items = pop_items.assign(
    value=pop_items['pop_score'].astype('float32'),
    window_type='global_pop',
).drop(columns=['pop_score'])

In [None]:
# Cell 4 - Assign these items to ALL customers
# The customer list is taken from sample_submission.csv.
sub = cudf.read_csv(
    '../data/input_data/sample_submission.csv',
    dtype={'customer_id': 'string'},
    usecols=['customer_id'],
)

# Compact the id: parse its last 16 hex characters as an integer and store as int64.
sub['customer_id'] = sub['customer_id'].str[-16:].str.hex_to_int().astype('int64')

# Cross join through a constant helper column: every customer is paired with
# every popular article, then the helper column is removed from the result.
pop_items['key'] = 1
sub['key'] = 1
candidates = sub.merge(pop_items, on='key')
candidates = candidates.drop(columns=['key'])

In [None]:
# Cell 5 - Save
# One parquet file per target week; the week's date string is part of the file name.
out_path = f'../data/outputs/candidates/candidates_popularity_{printdate}.parquet'

# Create the output folder up front so the write does not fail on a fresh
# checkout after all the upstream work has already been done.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Convert to a pandas DataFrame and write it without the index column.
candidates = candidates.to_pandas()
candidates.to_parquet(out_path, index=False)

# Drop the references to the frames built in this notebook and trigger a collection.
del df, sub, pop_items, candidates
gc.collect()

0