In [None]:
# Cell 1 – imports & params
import pandas as pd, numpy as np, gc
from collections import defaultdict

import os
# Target week: the date is written into the process environment and read
# straight back, so the literal below is always the value that gets used.
os.environ['HM_TARGET_WEEK_END'] = '2020-08-19'  ## SET THIS DATE TO CORRECT WEEK

# `printdate` keeps the raw YYYY-MM-DD string (used later in the output file
# name); `TARGET_WEEK_END` is the same date parsed into a pandas Timestamp.
printdate = os.environ.get("HM_TARGET_WEEK_END")
assert printdate is not None, "Set HM_TARGET_WEEK_END=YYYY-MM-DD"
TARGET_WEEK_END = pd.to_datetime(printdate)

# Tunable parameters for the user-overlap candidate generator.
K_RECENT = 20        # recent items per user
RECENT_DAYS = 56     # lookback window length before cut_ts
EXCLUDE_LAST = 7     # exclude last 7 days
MIN_OVERLAP = 3      # min shared recent items to form edge
MAX_NEIGHBORS = 40   # cap neighbors per user
N_PER_USER = 50      # final candidate cap per user

def hex_last16_to_uint64(s):
    """Map a hexadecimal id string to a 64-bit integer key.

    Only the last 16 characters of ``s`` are used; they are parsed as a
    base-16 number (i.e. at most 64 bits).

    Note: despite the function name, the value returned is a *signed*
    ``np.int64``. The 64-bit pattern is reinterpreted, so inputs whose top
    bit is set come back as negative numbers.

    Raises ``ValueError`` (from ``int``) if the tail is not valid hex.
    """
    low_64_bits = int(s[-16:], 16)
    # Go through uint64 first so the full 64-bit range is accepted, then
    # reinterpret the same bits as a signed integer.
    return np.uint64(low_64_bits).astype(np.int64)

In [2]:
# Cell 2 – load filtered transactions
# Read only the three columns needed; ids stay as strings until after the
# date filter so the slow per-row conversion touches as few rows as possible.
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat','customer_id','article_id'],
    dtype={'t_dat':'string','customer_id':'string','article_id':'int32'}
)
tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# Time window kept: (start_ts, cut_ts]
#   last_ts  = end of the target week
#   cut_ts   = last_ts minus EXCLUDE_LAST days (those final days are excluded)
#   start_ts = cut_ts minus RECENT_DAYS days   (lookback length)
last_ts = TARGET_WEEK_END
cut_ts  = last_ts - pd.Timedelta(days=EXCLUDE_LAST)
start_ts = cut_ts - pd.Timedelta(days=RECENT_DAYS)
tx = tx[(tx['t_dat'] > start_ts) & (tx['t_dat'] <= cut_ts)].copy()

# Convert the hex customer ids to 64-bit integer keys only for the rows that
# survived the filter (the conversion is a Python-level call per row).
# The explicit astype keeps the column int64 even if the window is empty.
tx['customer_id'] = tx['customer_id'].apply(hex_last16_to_uint64).astype('int64')

# 2 min, 15s  (timing recorded when ids were converted before the date filter)

In [3]:
# Cell 3 – recent K items per user
# Order each customer's rows newest-first, number them 1, 2, 3, ... within
# the customer, and keep the first K_RECENT rows. Duplicate
# (customer, article) pairs among those rows are collapsed afterwards, so a
# customer may end up with fewer than K_RECENT distinct articles.
tx = tx.sort_values(by=['customer_id', 't_dat'], ascending=[True, False])
tx['rnk'] = tx.groupby('customer_id').cumcount() + 1

recent = (
    tx.loc[tx['rnk'].le(K_RECENT), ['customer_id', 'article_id']]
    .drop_duplicates()
)


In [None]:
# Cell 4 – inverted index article -> list(users)
# Series indexed by article_id; each value is the Python list of customer ids
# that have that article among their recent items.
art_users = recent['customer_id'].groupby(recent['article_id']).apply(list)

In [5]:
# Cell 5 – build user-user overlaps (skip very popular articles to save memory)
# Articles held by more than this many users are ignored: the number of user
# pairs grows quadratically with the length of the user list.
MAX_ARTICLE_FANOUT = 5000

# overlap[u][v] = number of articles shared by users u and v.
# Stored symmetrically, i.e. both overlap[u][v] and overlap[v][u] are kept.
overlap = defaultdict(lambda: defaultdict(int))
for art, users in art_users.items():
    if len(users) > MAX_ARTICLE_FANOUT:    # skip huge fan-out
        continue
    # Visit every unordered pair of users of this article exactly once.
    for i, u in enumerate(users):
        for v in users[i + 1:]:
            overlap[u][v] += 1
            overlap[v][u] += 1

# Keep only edges with enough shared items; both directions are emitted
# because the counts were stored symmetrically.
pairs = [
    (u, v, ov)
    for u, nbrs in overlap.items()
    for v, ov in nbrs.items()
    if ov >= MIN_OVERLAP
]
# The nested dict is no longer needed once the edges are extracted. Empty it
# now (the name stays defined for the final cleanup cell) so it does not sit
# in memory while the DataFrame below is built and sorted.
overlap.clear()

neighbors = pd.DataFrame(pairs, columns=['user','nbr','ov'])
del pairs  # the tuples now live in `neighbors`; no later cell releases this list

if len(neighbors):
    # Per user, keep the MAX_NEIGHBORS neighbours with the largest overlap.
    neighbors = neighbors.sort_values(['user','ov'], ascending=[True,False])
    neighbors['rnk'] = neighbors.groupby('user').cumcount()+1
    neighbors = neighbors[neighbors['rnk'] <= MAX_NEIGHBORS].drop(columns=['rnk'])

# 6 minutes, 40s

In [6]:
# Cell 6 – neighbor recent items as candidates
# Same table as `recent`, with the id column renamed so it joins on the
# neighbour side of each edge.
nbr_recent = recent.rename(columns={'customer_id': 'nbr'})

# For every (user, neighbour) edge pull in the neighbour's recent articles.
# When an article is reachable through several neighbours only the first row
# is kept; the candidate's score (`value`) is that row's overlap count.
cand = (
    neighbors
    .merge(nbr_recent, how='left', on='nbr')
    .dropna(subset=['article_id'])
    .drop_duplicates(subset=['user', 'article_id'])
    .assign(value=lambda edges: edges['ov'].astype('float32'))
    .rename(columns={'user': 'customer_id'})
    .loc[:, ['customer_id', 'article_id', 'value']]
)

# Highest score first within each customer, then cap at N_PER_USER rows and
# tag the rows with the name of this candidate source.
cand = cand.sort_values(by=['customer_id', 'value'], ascending=[True, False])
cand = (
    cand.loc[cand.groupby('customer_id').cumcount() < N_PER_USER]
    .assign(window_type='user_overlap')
)



In [7]:
# Cell 7 – add hex id & save
# Load the customer id strings from the sample submission and compute the
# same integer key for them that was used for the transactions.
sub = pd.read_csv('../data/input_data/sample_submission.csv',
                  usecols=['customer_id'], dtype={'customer_id':'string'})
sub['customer_id_int'] = sub['customer_id'].apply(hex_last16_to_uint64)

# Join on the integer key. Both frames have a `customer_id` column, so pandas
# suffixes them: `_x` is the integer key from `cand`, `_y` the hex string.
cand = cand.merge(sub[['customer_id','customer_id_int']],
                  left_on='customer_id', right_on='customer_id_int', how='left')

cand = cand.rename(columns={'customer_id_x':'customer_id',
                            'customer_id_y':'customer_id_hex'}).drop(columns=['customer_id_int'])
cand['article_id'] = cand['article_id'].astype('int32')
cand['value']      = cand['value'].astype('float32')
# The join can only add rows if several submission ids share one integer key;
# keep a single row per (customer, article) in that case.
cand = cand.drop_duplicates(['customer_id','article_id'])

# Make sure the output directory exists before writing, so a missing folder
# does not fail the notebook after the expensive cells have already run.
out_path = f'../data/outputs/candidates/candidates_user_overlap_{printdate}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
cand.to_parquet(out_path, index=False)

# Release the large intermediates held by the kernel.
del tx, recent, art_users, overlap, neighbors, nbr_recent, cand, sub
gc.collect()

0