In [1]:
# Cell 1 – imports

import gc
import numpy as np
import pandas as pd

In [2]:
# Cell 2 – read every candidate source and stack them into one frame

candidate_files = [
    '../data/outputs/candidates_weekly_trending.parquet',
    '../data/outputs/candidates_popularity.parquet',
    '../data/outputs/candidates_itemcf.parquet',
    '../data/outputs/candidates_recent_top.parquet',
    '../data/outputs/candidates_repurchase.parquet',
    '../data/outputs/candidates_user_overlap.parquet',
    '../data/outputs/candidates_age_bucket_pop.parquet',
    '../data/outputs/candidates_category_affinity.parquet',
    '../data/outputs/candidates_same_product.parquet',
]

dfs = []
for path in candidate_files:
    if not path.endswith('.parquet'):
        # Anything that is not Parquet is read as CSV with an explicit schema.
        df = pd.read_csv(
            path,
            dtype=dict(
                customer_id='int64',
                article_id='int32',
                value='float32',
                window_type='string',
                customer_id_hex='string',
            ),
        )
    else:
        # Parquet files are read as stored.
        df = pd.read_parquet(path)
    dfs.append(df)

# Stack all sources row-wise under a fresh 0..n-1 index
all_cand = pd.concat(dfs, ignore_index=True)
all_cand.head()

Unnamed: 0,customer_id,article_id,value,window_type,customer_id_hex,age_bucket
0,9162379705966698872,883307004,2436.140625,biweekly,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...,
1,9162379705966698872,748355003,2133.914307,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...,
2,9162379705966698872,853839003,1838.798462,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...,
3,9162379705966698872,854951003,652.056213,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...,
4,9162379705966698872,884081001,468.520721,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...,


In [3]:
# New Cell – per-source sanity check: size, customer coverage, candidates per customer
for path, df in zip(candidate_files, dfs):
    print(path)
    print("  rows:", df.shape[0])
    print("  unique customers:", df['customer_id'].nunique())
    # Distinct articles proposed to each customer by this source, then averaged.
    articles_per_customer = df.groupby('customer_id')['article_id'].nunique()
    print("  mean cand / customer:", articles_per_customer.mean())

../data/outputs/candidates_weekly_trending.parquet
  rows: 7851328
  unique customers: 959265
  mean cand / customer: 8.184733102948611
../data/outputs/candidates_popularity.parquet
  rows: 32927520
  unique customers: 1371980
  mean cand / customer: 24.0
../data/outputs/candidates_itemcf.parquet
  rows: 39487109
  unique customers: 319144
  mean cand / customer: 123.72818852931591
../data/outputs/candidates_recent_top.parquet
  rows: 9653601
  unique customers: 867957
  mean cand / customer: 11.122211123362103
../data/outputs/candidates_repurchase.parquet
  rows: 18409081
  unique customers: 1356709
  mean cand / customer: 13.568923770683323
../data/outputs/candidates_user_overlap.parquet
  rows: 18442629
  unique customers: 279058
  mean cand / customer: 66.08887399752022
../data/outputs/candidates_age_bucket_pop.parquet
  rows: 96038600
  unique customers: 1371980
  mean cand / customer: 70.0
../data/outputs/candidates_category_affinity.parquet
  rows: 27245620
  unique customers: 1

In [4]:
# Cell 3 – one row per (customer_id, article_id): the row with the highest value wins
all_cand = (
    all_cand
    .sort_values(by='value', ascending=False)   # highest value first ...
    .drop_duplicates(subset=['customer_id', 'article_id'], keep='first')  # ... so 'first' keeps it
)

# Re-assert the expected column dtypes (safety)
for column, dtype in (
    ('customer_id', 'int64'),
    ('article_id', 'int32'),
    ('value', 'float32'),
):
    all_cand[column] = all_cand[column].astype(dtype)

all_cand.head()

Unnamed: 0,customer_id,article_id,value,window_type,customer_id_hex,age_bucket
5156209,-2906901524099224476,923758001,5905826.0,biweekly,d264057471d5f09368905d1429907965dcaa72ed15f35f...,
7363042,-8098947296956176845,809238001,3005049.5,monthly,bd27446f7d260a5e16a99d518007e45217fe7e04178312...,
6789750,-6746527055964686677,924243002,2678193.25,biweekly,e272e5b704883e23831edabeeaef557eb1708f843ed82c...,
6279530,-5514443247773956906,923128001,2412895.5,weekly,8cc410db93a1642bb1ce42cea696afc524c2bb58ccb4ba...,
1217855,6361568379542243357,805947001,2212859.0,weekly,4db1f33dca72b9ce9332c4fdc3d014982c7e3265c9e0e9...,


In [None]:
# OPTIONAL CELL – ONLY RUN TO CHECK COVER (recall of the candidate pool vs. the last 7 days).
# It reads the whole transactions file, so it is slow. The earlier "~10 minutes" note
# dates from when every row's customer id was converted; the conversion is now limited
# to the label window – re-time before relying on a number.
import pandas as pd, numpy as np, gc

print("\n[Recall check vs last 7 days labels]")
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat','customer_id','article_id'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32'}
)
tx['t_dat'] = pd.to_datetime(tx['t_dat'])
last_ts = tx['t_dat'].max()
cut_ts  = last_ts - pd.Timedelta(days=7)

# Cut down to the label window BEFORE touching customer ids: the hex -> int64
# conversion below is a Python-level call per row, and only these rows are used.
in_window = (tx['t_dat'] > cut_ts) & (tx['t_dat'] <= last_ts)
label_tx = tx.loc[in_window, ['customer_id', 'article_id']].copy()

# Last 16 hex chars -> uint64 -> reinterpret as int64 (same conversion the
# submission cell applies to sample_submission.csv). The trailing astype pins
# the dtype even when the window is empty.
label_tx['customer_id'] = (
    label_tx['customer_id'].str[-16:]
    .apply(lambda h: np.int64(np.uint64(int(h, 16))))
    .astype('int64')
)

# One label per distinct (customer, article) pair bought in the window
labels = (label_tx
          .drop_duplicates(['customer_id', 'article_id'])
          .reset_index(drop=True))

# Overall coverage using deduped pool
cand_pairs = all_cand[['customer_id','article_id']].drop_duplicates()
covered = labels.merge(cand_pairs, on=['customer_id','article_id'], how='left', indicator=True)
overall = (covered['_merge'] == 'both').sum()
# Guard the ratio so an empty label window reports nan instead of raising.
recall = overall / len(labels) if len(labels) else float('nan')
print(f"Total label pairs: {len(labels)}")
print(f"Covered by all candidates: {overall}")
print(f"Recall: {recall:.4f}")

# Release the large intermediates; kernel memory persists across cells.
del tx, label_tx, labels, cand_pairs, covered
gc.collect()


[Recall check vs last 7 days labels]
Total label pairs: 213728
Covered by all candidates: 40829
Recall: 0.1910


NameError: name 'pairs' is not defined

In [6]:
# Back-fill: every submission customer absent from the pool gets placeholder candidates
sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)
# Last 16 hex chars -> uint64 -> reinterpreted as int64
sub['customer_id'] = sub['customer_id'].str[-16:].apply(lambda h: np.int64(np.uint64(int(h, 16))))

missing = set(sub['customer_id'].unique()) - set(all_cand['customer_id'].unique())
print("Customers missing candidates:", len(missing))

if len(missing) > 0:
    gp = pd.read_json('../data/outputs/general_pred_str.json', typ='series')
    # First 20 article ids from the stored prediction string (small cap for features)
    top_items = [int(x) for x in gp['general_pred_str'].split()][:20]
    # Cross product, customer-major: each customer id repeated once per item,
    # the item list cycled once per customer.
    synth = pd.DataFrame({
        'customer_id': np.repeat(list(missing), len(top_items)),
        'article_id': np.tile(top_items, len(missing)),
        'value': 0.001,
        'window_type': 'synthetic',
    }).astype({
        'customer_id': 'int64',
        'article_id': 'int32',
        'value': 'float32',
    })
    all_cand = pd.concat([all_cand, synth], ignore_index=True)

Customers missing candidates: 0


In [7]:
# Distribution of distinct candidate articles per customer in the merged pool
cand_per_cust = all_cand.groupby('customer_id')['article_id'].nunique()
for label, stat in (
    ("Number of customers:", cand_per_cust.shape[0]),
    ("Mean candidates per customer:", cand_per_cust.mean()),
    ("Median candidates per customer:", cand_per_cust.median()),
    ("Customer with max candidates:", cand_per_cust.max()),
    ("Customer with min candidates:", cand_per_cust.min()),
):
    print(label, stat)

Number of customers: 1371980
Mean candidates per customer: 134.42617239318358
Median candidates per customer: 95.0
Customer with max candidates: 1056
Customer with min candidates: 70


In [8]:
# Cell 4 – write the merged candidate pool to candidates.parquet (index dropped)

all_cand.to_parquet(path='../data/outputs/candidates.parquet', index=False)

# Drop the big frames and reclaim their memory
del dfs, all_cand
gc.collect()

1067