In [7]:
# Cell 1
import cudf
import pandas as pd
import gc

import os
# Target-week configuration.
# `setdefault` keeps 2020-08-19 as the fallback while letting a value already
# exported by the caller (shell, scheduler, parent process) take precedence.
# A plain assignment would silently overwrite it and make the check below
# impossible to trigger.
os.environ.setdefault('HM_TARGET_WEEK_END', '2020-08-19')
TARGET_WEEK_END = os.environ.get("HM_TARGET_WEEK_END")
# Raw string form; reused verbatim in the output file name at the end of the notebook.
printdate = TARGET_WEEK_END
# Truthiness check rejects both a missing and an empty value.
assert TARGET_WEEK_END, "Set HM_TARGET_WEEK_END=YYYY-MM-DD"
# Timestamp form; used for the date arithmetic in later cells.
TARGET_WEEK_END = pd.to_datetime(TARGET_WEEK_END)

In [8]:
# Cell 2
# Transactions: read only the date, customer and article columns.
tx = cudf.read_csv(
    '../data/input_data/transactions_train.csv',
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# Parse the purchase date, then compact the customer id by converting its
# last 16 hex characters to an int64 key.
tx['t_dat'] = cudf.to_datetime(tx['t_dat'])
hex_tail = tx['customer_id'].str[-16:]
tx['customer_id'] = hex_tail.str.hex_to_int().astype('int64')
del hex_tail

# Articles: only the id and its index group
# (index_group_no splits Ladies, Men, Kids, etc.).
art = cudf.read_csv(
    '../data/input_data/articles.csv',
    dtype={'article_id': 'int32', 'index_group_no': 'int8'},
    usecols=['article_id', 'index_group_no'],
)

# Attach the index group to every transaction; the left join keeps
# transactions whose article is absent from the articles table.
tx = tx.merge(art, how='left', on='article_id')

In [9]:
# Cell 3
# 1. Build User Profile: What is their favorite category?
# Only purchases made up to the start of the target week are used. Counting
# later transactions would leak the very purchases the candidates are meant to
# predict. The cut-off is TARGET_WEEK_END minus 7 days, the same upper bound
# the category-popularity step applies to its trend window.
history_end = TARGET_WEEK_END - pd.Timedelta(days=7)

# Count purchases per (customer, index group) over the allowed history.
user_pref = (
    tx[tx['t_dat'] <= history_end]
    .groupby(['customer_id', 'index_group_no'])
    .size()
    .reset_index(name='cnt')
)

# Order each customer's groups by purchase count, highest first.
# index_group_no is the final sort key so that ties on `cnt` resolve the same
# way on every run instead of depending on groupby/sort output order.
user_pref = user_pref.sort_values(
    ['customer_id', 'cnt', 'index_group_no'],
    ascending=[True, False, True],
)

# Keep only the top-ranked group per customer.
user_pref = user_pref.groupby('customer_id').nth(0).reset_index()
user_pref = user_pref[['customer_id', 'index_group_no']]

In [10]:
# Cell 4
# 2. Category Popularity: What is trending in each category right now?
TOP_N_PER_GROUP = 20  # number of best-selling articles kept per index group

last_ts = TARGET_WEEK_END
cut_ts = last_ts - pd.Timedelta(days=7)       # last date allowed into the trend window
trend_start = cut_ts - pd.Timedelta(days=14)  # window spans the 14 days ending at cut_ts

# Half-open window: (trend_start, cut_ts]
recent_tx = tx[(tx['t_dat'] > trend_start) & (tx['t_dat'] <= cut_ts)]

# Count sales per group+article
group_pop = recent_tx.groupby(['index_group_no', 'article_id']).size().reset_index(name='sales')

# Best sellers first within each group. article_id is the final sort key so
# that articles with equal sales are ranked identically on every run, which
# matters for the ones sitting at the TOP_N_PER_GROUP boundary.
group_pop = group_pop.sort_values(
    ['index_group_no', 'sales', 'article_id'],
    ascending=[True, False, True],
)

# Keep the top TOP_N_PER_GROUP items per group
group_pop['rank'] = group_pop.groupby('index_group_no').cumcount() + 1
group_pop = group_pop[group_pop['rank'] <= TOP_N_PER_GROUP]

group_pop = group_pop[['index_group_no', 'article_id']]
group_pop['value'] = 1.0 # Simple score: every kept article gets the same constant

In [11]:
# Cell 5
# 3. Join User Profile -> Category Trends
# If a user likes "Menswear", they get the trending "Menswear" items kept in
# group_pop.
# Inner join: a user whose favorite group has no trending articles produces no
# candidate rows. A left join would instead emit a placeholder row for that
# user with a null article_id and null value.
cand = user_pref.merge(group_pop, on='index_group_no', how='inner')
cand['window_type'] = 'category_affinity'  # tag identifying this candidate source
cand = cand.drop('index_group_no', axis=1)  # join key is not part of the output

In [12]:
# Cell 6
# Save
# Move the candidates to host memory, then release the GPU-side frames.
cand_pd = cand.to_pandas()
del cand, tx, art, user_pref, group_pop, recent_tx
gc.collect()

# Attach hex id for consistency
# The integer key is rebuilt from the last 16 hex characters of the original
# customer_id, the same slice the transactions were keyed on.
# NOTE(review): ids whose 16-hex tail exceeds the int64 maximum rely on the
# final astype('int64') wrapping the same way as the GPU-side conversion — confirm.
sub = pd.read_csv('../data/input_data/sample_submission.csv', usecols=['customer_id'])
sub['customer_id_int'] = sub['customer_id'].str[-16:].apply(lambda h: int(h, 16)).astype('int64')

# Both frames have a `customer_id` column, so the merge suffixes them:
# `_x` is the integer key from the candidates, `_y` the hex string from `sub`.
cand_pd = cand_pd.merge(sub, left_on='customer_id', right_on='customer_id_int', how='left')
cand_pd = cand_pd.rename(columns={'customer_id_x': 'customer_id', 'customer_id_y': 'customer_id_hex'})
cand_pd = cand_pd[['customer_id', 'article_id', 'value', 'window_type', 'customer_id_hex']]

# Deduplicate just in case
cand_pd = cand_pd.drop_duplicates(['customer_id', 'article_id'])

# Create the output directory if needed, so a missing folder does not abort
# the run after all the work above has completed.
out_path = f'../data/outputs/candidates/candidates_category_affinity_{printdate}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
cand_pd.to_parquet(out_path, index=False)