In [1]:
# Cell 1 – imports

import gc
import numpy as np
import pandas as pd

import os
# Target-week configuration.
# A value already exported in the environment (e.g. by a driver shell script)
# takes precedence; otherwise the default below is written into the
# environment and used.
TARGET_WEEK_END = os.environ.setdefault('HM_TARGET_WEEK_END', '2020-09-22')
assert TARGET_WEEK_END is not None

# `printdate` is the dashed form used in the candidate file names;
# `WEEK_TAG` is the same date with the dashes stripped.
printdate = TARGET_WEEK_END
WEEK_TAG = "".join(TARGET_WEEK_END.split("-"))

In [2]:
# Cell 2 – load all candidate files

candidate_files = [
    f'../data/outputs/candidates/candidates_weekly_trending_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_popularity_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_itemcf_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_recent_top_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_repurchase_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_user_overlap_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_age_bucket_pop_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_category_affinity_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_same_product_{printdate}.parquet',
    f'../data/outputs/candidates/candidates_embedding_{printdate}.parquet',
]


def _read_candidate_file(path):
    """Load one candidate file and normalise its column dtypes.

    Parquet files are read whole, minus the optional `customer_id_hex`
    column (dropped to save memory). Any other path is parsed as CSV,
    restricted to the four columns listed in `usecols`.

    Returns a DataFrame whose `customer_id` is int64, `article_id` int32,
    `value` float32 and, when present, `window_type` categorical.
    """
    if not path.endswith('.parquet'):
        frame = pd.read_csv(
            path,
            # Only load necessary columns
            usecols=['customer_id', 'article_id', 'value', 'window_type'],
            dtype={
                'customer_id': 'int64',
                'article_id': 'int32',
                'value': 'float32',
                'window_type': 'category',  # keeps the repeated labels compact
            },
        )
    else:
        frame = pd.read_parquet(path)
        if 'customer_id_hex' in frame.columns:
            frame = frame.drop(columns=['customer_id_hex'])

    # Enforce the dtypes right away, whatever the file format delivered.
    wanted_dtypes = {
        'article_id': 'int32',
        'customer_id': 'int64',
        'value': 'float32',
    }
    if 'window_type' in frame.columns:
        wanted_dtypes['window_type'] = 'category'
    for column, dtype in wanted_dtypes.items():
        frame[column] = frame[column].astype(dtype)
    return frame


# One frame per source, in the same order as `candidate_files`.
dfs = [_read_candidate_file(path) for path in candidate_files]

# Stack every source into a single candidate pool with a fresh 0..n-1 index.
all_cand = pd.concat(dfs, axis=0, ignore_index=True)


all_cand.head()

# 1 min 56 sec

Unnamed: 0,customer_id,article_id,value,window_type,age_bucket
0,9186111296798639136,869442001,456.266052,older,
1,9186111296798639136,854826001,412.759399,older,
2,9186111296798639136,832298006,401.102814,older,
3,9186111296798639136,851110002,373.518707,older,
4,9186111296798639136,740519002,368.284912,older,


In [3]:
# New Cell – debug per-source coverage
# For every candidate source print its row count, how many distinct customers
# it covers, and the average number of distinct articles per covered customer.
for source_path, source_df in zip(candidate_files, dfs):
    # One groupby pass yields both the customer count and the per-customer mean.
    articles_per_customer = source_df.groupby('customer_id')['article_id'].nunique()
    print(source_path)
    print("  rows:", len(source_df))
    print("  unique customers:", len(articles_per_customer))
    print("  mean cand / customer:", articles_per_customer.mean())

# 1 min 26 sec

../data/outputs/candidates/candidates_weekly_trending_2020-09-22.parquet
  rows: 7851328
  unique customers: 959265


  mean cand / customer: 8.184733102948611
../data/outputs/candidates/candidates_popularity_2020-09-22.parquet
  rows: 32927520


  unique customers: 1371980


  mean cand / customer: 24.0
../data/outputs/candidates/candidates_itemcf_2020-09-22.parquet
  rows: 39487109


  unique customers: 319144


  mean cand / customer: 123.72818852931591
../data/outputs/candidates/candidates_recent_top_2020-09-22.parquet
  rows: 9653601
  unique customers: 867957


  mean cand / customer: 11.122211123362103
../data/outputs/candidates/candidates_repurchase_2020-09-22.parquet
  rows: 18409081
  unique customers: 1356709


  mean cand / customer: 13.568923770683323
../data/outputs/candidates/candidates_user_overlap_2020-09-22.parquet
  rows: 3299120
  unique customers: 96784


  mean cand / customer: 34.08745247148289
../data/outputs/candidates/candidates_age_bucket_pop_2020-09-22.parquet
  rows: 137198000


  unique customers: 1371980


  mean cand / customer: 100.0
../data/outputs/candidates/candidates_category_affinity_2020-09-22.parquet
  rows: 27245620


  unique customers: 1362281


  mean cand / customer: 20.0
../data/outputs/candidates/candidates_same_product_2020-09-22.parquet
  rows: 13259768
  unique customers: 728475


  mean cand / customer: 18.202090668863036
../data/outputs/candidates/candidates_embedding_2020-09-22.parquet
  rows: 53400600


  unique customers: 534006


  mean cand / customer: 100.0


In [3]:
# Cell 3 – deduplicate per (customer_id, article_id), keep max value
# Sorting by `value` descending and keeping the first row of each pair leaves
# the highest-valued row per (customer_id, article_id). Each step rebinds
# `all_cand`, so the previous frame becomes collectable as soon as the next
# one exists.
all_cand = all_cand.sort_values('value', ascending=False)
all_cand = all_cand.drop_duplicates(
    subset=['customer_id', 'article_id'],
    keep='first',
    ignore_index=True,  # renumber rows 0..n-1 without a separate reset_index
)

gc.collect()

# Re-assert the compact dtypes column by column (safety net).
for column, dtype in (
    ('customer_id', 'int64'),
    ('article_id', 'int32'),
    ('value', 'float32'),
):
    all_cand[column] = all_cand[column].astype(dtype)
all_cand.head()

# 23 gb, 4 min 34 sec

Unnamed: 0,customer_id,article_id,value,window_type,age_bucket
0,-2906901524099224476,923758001,5905826.0,biweekly,
1,-8098947296956176845,809238001,3005049.5,monthly,
2,-6746527055964686677,924243002,2678193.25,biweekly,
3,-5514443247773956906,923128001,2412895.5,weekly,
4,6361568379542243357,805947001,2212859.0,weekly,


In [4]:
# OPTIONAL DIAGNOSTIC – ONLY RUN TO CHECK COVERAGE.
# Measures how many (customer, article) purchases in the 7 days ending at
# TARGET_WEEK_END are present in the candidate pool, overall and per source.
# Note: this cell deletes `dfs` at the end to release memory.
# (The previous set-of-tuples implementation of this cell took ~17 minutes.)
import pandas as pd, numpy as np, gc

_PAIR_COLS = ['customer_id', 'article_id']


def _count_covered_pairs(label_pairs, candidates):
    """Return how many distinct rows of `label_pairs` also occur in `candidates`.

    Parameters
    ----------
    label_pairs : DataFrame with `customer_id` and `article_id` columns.
    candidates  : DataFrame with at least those two columns; pairs may repeat.

    Returns
    -------
    int – number of distinct label pairs found among the candidates.
    """
    # Cheap pre-filter: only candidates of customers that have labels can match,
    # which keeps the join small even for very large candidate frames.
    relevant = candidates.loc[
        candidates['customer_id'].isin(label_pairs['customer_id']), _PAIR_COLS
    ]
    hits = label_pairs.merge(relevant, on=_PAIR_COLS, how='inner')
    # Repeated candidate pairs fan out in the join; count each label pair once.
    return len(hits.drop_duplicates())


print("\n[Recall check vs last 7 days labels]")
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat','customer_id','article_id'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32'}
)
tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# Use the specific target week defined in Cell 1, not the end of the file.
last_ts = pd.to_datetime(TARGET_WEEK_END)
cut_ts  = last_ts - pd.Timedelta(days=7)

# Restrict to the label window (cut_ts, last_ts] BEFORE converting ids: the
# hex -> int conversion is a per-row Python call and only label rows need it.
label_tx = tx.loc[(tx['t_dat'] > cut_ts) & (tx['t_dat'] <= last_ts), _PAIR_COLS].copy()
del tx

# Last 16 hex characters parsed as uint64 and reinterpreted as signed int64.
# The explicit astype keeps the column int64 even when the window is empty.
label_tx['customer_id'] = (
    label_tx['customer_id']
    .str[-16:]
    .apply(lambda h: np.int64(np.uint64(int(h, 16))))
    .astype('int64')
)

# One row per purchased (customer, article) pair inside the window.
labels = label_tx.drop_duplicates().reset_index(drop=True)
n_labels = len(labels)

# Overall coverage using the deduped pool
overall = _count_covered_pairs(labels, all_cand)
print(f"Total label pairs: {n_labels}")
print(f"Covered by all candidates: {overall}")
# An empty label window reports nan instead of raising ZeroDivisionError.
recall = overall / n_labels if n_labels else float('nan')
print(f"Recall: {recall:.4f}")

# Per-source contribution (exact count of label pairs present in each source)
for path, df in zip(candidate_files, dfs):
    inter = _count_covered_pairs(labels, df)
    share = inter / n_labels if n_labels else float('nan')
    print(f"{path:55s} covers {inter:7d} ({share:.4f})")

# Release the diagnostic data and the per-source frames.
df = None  # drop the loop's reference to the last source frame
del label_tx, labels, dfs
gc.collect()


[Recall check vs last 7 days labels]
Total label pairs: 213728
Covered by all candidates: 45852
Recall: 0.2145
../data/outputs/candidates/candidates_weekly_trending_2020-09-22.parquet covers    8083 (0.0378)
../data/outputs/candidates/candidates_popularity_2020-09-22.parquet covers    8921 (0.0417)
../data/outputs/candidates/candidates_itemcf_2020-09-22.parquet covers   13278 (0.0621)
../data/outputs/candidates/candidates_recent_top_2020-09-22.parquet covers    7327 (0.0343)
../data/outputs/candidates/candidates_repurchase_2020-09-22.parquet covers    7538 (0.0353)
../data/outputs/candidates/candidates_user_overlap_2020-09-22.parquet covers    2815 (0.0132)
../data/outputs/candidates/candidates_age_bucket_pop_2020-09-22.parquet covers   24715 (0.1156)
../data/outputs/candidates/candidates_category_affinity_2020-09-22.parquet covers    8079 (0.0378)
../data/outputs/candidates/candidates_same_product_2020-09-22.parquet covers   10503 (0.0491)
../data/outputs/candidates/candidates_embedd

0

In [4]:
# Add synthetic candidates for submission customers missing from candidate pool
sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)
# Last 16 hex characters parsed as uint64, then reinterpreted as signed int64.
hex_tail = sub['customer_id'].str[-16:]
sub['customer_id'] = hex_tail.apply(lambda h: np.int64(np.uint64(int(h, 16))))

# Submission customers that have no row at all in the candidate pool.
missing = set(sub['customer_id'].unique()) - set(all_cand['customer_id'].unique())
print("Customers missing candidates:", len(missing))

if missing:
    gp = pd.read_json('../data/outputs/general_pred_str.json', typ='series')
    # small cap for features: keep only the first 20 articles of the general prediction
    top_items = [int(x) for x in gp['general_pred_str'].split()][:20]

    # Cross product customer x article, customer-major: every customer id is
    # repeated once per article while the article list is cycled.
    missing_ids = list(missing)
    synth = pd.DataFrame({
        'customer_id': np.repeat(missing_ids, len(top_items)),
        'article_id': np.tile(top_items, len(missing_ids)),
        'value': 0.001,
        'window_type': 'synthetic',
    })
    synth = synth.astype({
        'customer_id': 'int64',
        'article_id': 'int32',
        'value': 'float32',
    })
    all_cand = pd.concat([all_cand, synth], ignore_index=True)

Customers missing candidates: 0


In [5]:
# Distinct candidate articles per customer, then a few summary statistics.
cand_per_cust = all_cand.groupby('customer_id')['article_id'].nunique()

summary_rows = (
    ("Number of customers:", cand_per_cust.shape[0]),
    ("Mean candidates per customer:", cand_per_cust.mean()),
    ("Median candidates per customer:", cand_per_cust.median()),
    ("Customer with max candidates:", cand_per_cust.max()),
    ("Customer with min candidates:", cand_per_cust.min()),
)
for label, stat in summary_rows:
    print(label, stat)

Number of customers: 1371980
Mean candidates per customer: 188.72318765579672
Median candidates per customer: 131.0
Customer with max candidates: 731
Customer with min candidates: 103


In [5]:
# Cell 4 – persist the merged candidate pool as a parquet file whose name
# carries the target-week tag

out_path = f'../data/outputs/candidates_week={WEEK_TAG}.parquet'
all_cand.to_parquet(path=out_path, index=False)
print("Saved:", out_path)

# The pool is on disk now; drop the in-memory frame and collect garbage.
del all_cand
gc.collect()

Saved: ../data/outputs/candidates_week=20200819.parquet


20