In [1]:
# Cell 1
import cudf
import pandas as pd
import gc



In [2]:
# Cell 2
# Load transactions and derive the two inputs the later cells need:
#   1. item_pop  - per-article purchase counts over a short recent window, used to
#                  tell which "colors" of a product are currently trending.
#   2. user_hist - distinct (customer_id, article_id) pairs over a longer window,
#                  i.e. what each user bought.
#
# All window lengths are named here so the comments and the values cannot drift apart.
# NOTE(review): earlier notes for this cell mentioned 30-day popularity and 60-day
# history windows; the values below are the ones the code actually runs with - confirm.
HOLDOUT_DAYS = 7        # newest days excluded from both windows (presumably a validation week - confirm)
POP_WINDOW_DAYS = 7     # popularity look-back, ending at cut_ts
HIST_WINDOW_DAYS = 180  # user-history look-back, ending at cut_ts

tx = cudf.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)
# Replace the string customer id with an int64 parsed from its last 16 hex characters.
tx['customer_id'] = tx['customer_id'].str[-16:].str.hex_to_int().astype('int64')
tx['t_dat'] = cudf.to_datetime(tx['t_dat'])

# Both windows end at cut_ts, HOLDOUT_DAYS before the newest transaction.
last_ts = tx['t_dat'].max()
cut_ts = last_ts - pd.Timedelta(days=HOLDOUT_DAYS)

# 1. Item popularity over (pop_start, cut_ts].
# We use this to rank the "siblings" (e.g. if a user bought the White shirt,
# we want to recommend the Black one ONLY if the Black one is actually selling right now).
pop_start = cut_ts - pd.Timedelta(days=POP_WINDOW_DAYS)
pop_df = tx[(tx['t_dat'] > pop_start) & (tx['t_dat'] <= cut_ts)]
item_pop = pop_df.groupby('article_id').size().reset_index(name='pop_score')

# 2. User history over (hist_start, cut_ts], one row per (customer, article).
hist_start = cut_ts - pd.Timedelta(days=HIST_WINDOW_DAYS)
user_hist = tx[(tx['t_dat'] > hist_start) & (tx['t_dat'] <= cut_ts)][['customer_id', 'article_id']].drop_duplicates()

# The full transaction table is no longer needed; drop the references and collect.
del tx, pop_df
gc.collect()

0

In [3]:
# Cell 3
# Article metadata. Only the article_id -> product_code mapping is read; it is what
# lets later cells treat articles that share a product_code as siblings.
meta_dtypes = {'article_id': 'int32', 'product_code': 'int32'}
meta = cudf.read_csv(
    '../data/input_data/articles.csv',
    usecols=list(meta_dtypes),
    dtype=meta_dtypes,
)

In [4]:
# Cell 4
# Build the "best siblings" map: for every product_code, the few articles that sold
# the most during the popularity window.
TOP_SIBLINGS_PER_PRODUCT = 3  # if a product has 10 colors, only the 3 most popular are kept

# 1. Attach popularity to all articles; articles without a row in item_pop get 0.
meta_pop = meta.merge(item_pop, on='article_id', how='left')
meta_pop['pop_score'] = meta_pop['pop_score'].fillna(0)

# 2. Filter: we only want to recommend items that have sold at least once recently.
# This removes old/out-of-stock colors.
meta_pop = meta_pop[meta_pop['pop_score'] > 0]

# 3. Rank items within each product_code, most popular first.
# article_id is the final sort key so that articles with equal pop_score are always
# ranked in the same order; without it the survivors of the top-N cut below could
# differ from run to run.
meta_pop = meta_pop.sort_values(
    ['product_code', 'pop_score', 'article_id'],
    ascending=[True, False, True],
)
meta_pop['rank'] = meta_pop.groupby('product_code').cumcount() + 1

# 4. Keep the top items per product_code and rename the article column so it does
# not collide with the "bought" article_id when joined against user history.
best_siblings = meta_pop[meta_pop['rank'] <= TOP_SIBLINGS_PER_PRODUCT][['product_code', 'article_id', 'pop_score']]
best_siblings = best_siblings.rename(columns={'article_id': 'rec_article_id'})

In [5]:
# Cell 5
# Generate Candidates
# Logic: User -> Bought Item A -> Product Code X -> Top Items in X (B, C, A)
MAX_CANDIDATES_PER_USER = 100

# 1. Get product_code for items user bought
user_prods = user_hist.merge(meta, on='article_id', how='inner')
# user_prods columns: [customer_id, article_id (bought), product_code]

# 2. Join with best siblings
cand = user_prods.merge(best_siblings, on='product_code', how='inner')
# cand columns: [customer_id, article_id (bought), product_code, rec_article_id, pop_score]

# 3. Cleanup: keep only the recommended article and its popularity, under the
# generic candidate column names (article_id, value).
cand = cand[['customer_id', 'rec_article_id', 'pop_score']]
cand = cand.rename(columns={'rec_article_id': 'article_id', 'pop_score': 'value'})

# 4. Deduplicate
# If user bought 2 items from same product code, the same recommendation is
# generated twice; collapse to one row per (customer, article).
cand = cand.groupby(['customer_id', 'article_id'])['value'].max().reset_index()

# 5. Cap per user, highest value first.
# article_id is the final sort key so that candidates with equal value are always
# ordered the same way; without it, which tied candidates fall inside the cap could
# differ from run to run.
cand = cand.sort_values(
    ['customer_id', 'value', 'article_id'],
    ascending=[True, False, True],
)
cand['rnk'] = cand.groupby('customer_id').cumcount() + 1
cand = cand[cand['rnk'] <= MAX_CANDIDATES_PER_USER]
cand = cand.drop('rnk', axis=1)

In [None]:
# Cell 6
# Save: tag the candidates with their source, copy them to a pandas frame,
# attach the original hex customer id and write everything to parquet.
cand['window_type'] = 'same_product'
cand_pd = cand.to_pandas()

# Everything built in the earlier cells is no longer needed once the pandas copy exists.
del cand, user_hist, meta, meta_pop, best_siblings, user_prods
gc.collect()

# Lookup table from the int64 customer id back to the hex id found in the
# sample submission.
# NOTE(review): assumes this pandas conversion yields the same int64 values as the
# cudf hex_to_int conversion applied to the transactions - confirm.
sub = pd.read_csv('../data/input_data/sample_submission.csv', usecols=['customer_id'])
sub['customer_id_int'] = sub['customer_id'].str[-16:].apply(lambda hex_tail: int(hex_tail, 16)).astype('int64')

# Give the hex column its final name before merging, so the merge produces no
# suffixed customer_id columns that would need renaming afterwards.
hex_lookup = sub.rename(columns={'customer_id': 'customer_id_hex'})
output_columns = ['customer_id', 'article_id', 'value', 'window_type', 'customer_id_hex']

merged = cand_pd.merge(hex_lookup, left_on='customer_id', right_on='customer_id_int', how='left')
cand_pd = merged[output_columns]

# Keep the first row per (customer, article) in case the lookup matched more than once.
cand_pd = cand_pd.drop_duplicates(['customer_id', 'article_id'])

cand_pd.to_parquet('../data/outputs/candidates_same_product.parquet', index=False)