In [None]:
# Cell 1 – imports and config

import gc
import numpy as np
import pandas as pd
import cudf

N_SIM_ITEMS = 1000    # number of similar items per item
N_PER_USER = 170    # max itemCF candidates per user
MIN_COOC = 3     # min co-occurrence count to keep a pair



In [2]:
# Cell 2 – load transactions

tx = cudf.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)

# Same int customer_id mapping as other pipelines
tx['customer_id'] = tx['customer_id'].str[-16:].str.hex_to_int().astype('int64')
tx['t_dat'] = cudf.to_datetime(tx['t_dat'])

last_ts = tx['t_dat'].max()

# NEW: cutoff – only use history up to cut_ts (exclude last 7 days)
cut_ts = last_ts - pd.Timedelta(days=7)

# Optional: focus on a recent history window (e.g. 8 weeks) BEFORE cut_ts
history_start = cut_ts - pd.Timedelta(days=56)

tx = tx[(tx['t_dat'] > history_start) & (tx['t_dat'] <= cut_ts)]

tx.head()

Unnamed: 0,t_dat,customer_id,article_id
29314980,2020-07-22,-5042555964741348414,778064038
29314981,2020-07-22,-5042555964741348414,817166007
29314982,2020-07-22,-5042555964741348414,840360002
29314983,2020-07-22,-5042555964741348414,817166003
29314984,2020-07-22,-3400017821321855889,624486001


In [3]:
# Cell 3 – unique user–item interactions

ui = tx[['customer_id', 'article_id']].drop_duplicates()
ui.head()

Unnamed: 0,customer_id,article_id
29314980,-5042555964741348414,778064038
29314981,-5042555964741348414,817166007
29314982,-5042555964741348414,840360002
29314983,-5042555964741348414,817166003
29314984,-3400017821321855889,624486001


In [4]:
# Cell 4 – item-item co-occurrence counts

# Self-join on customer_id to get (item_i, item_j) pairs for the same user
pairs = ui.merge(
    ui,
    on='customer_id',
    how='inner',
    suffixes=('_i', '_j')
)

# Drop self pairs
pairs = pairs[pairs['article_id_i'] != pairs['article_id_j']]

# Enforce ordering to avoid (i,j) and (j,i) duplicates
mask = pairs['article_id_i'] < pairs['article_id_j']
pairs = pairs[mask]

# Count co-occurrences
cooc = (
    pairs
    .groupby(['article_id_i', 'article_id_j'])
    .size()
    .reset_index(name='coocount')
)

# Filter weak pairs
cooc = cooc[cooc['coocount'] >= MIN_COOC]

cooc.head()

Unnamed: 0,article_id_i,article_id_j,coocount
0,816588001,855893001,3
6,878200010,902419001,3
29,909916001,914441001,4
67,841298001,851317003,3
70,832527002,908491005,3


In [5]:
# Cell 5 – similarity and top-N similar per item

# Item frequencies (number of distinct users)
item_freq = ui.groupby('article_id').size().reset_index(name='freq')
item_freq = item_freq.rename(columns={'article_id': 'article_id_i'})

cooc = cooc.merge(item_freq, on='article_id_i', how='left')
item_freq_j = item_freq.rename(columns={
    'article_id_i': 'article_id_j',
    'freq': 'freq_j'
})
cooc = cooc.merge(item_freq_j, on='article_id_j', how='left')

# Simple cosine-like similarity
cooc['sim'] = cooc['coocount'] / (
    (cooc['freq'] * cooc['freq_j']) ** 0.5
)

# For each article_i, keep top N_SIM_ITEMS similar items
cooc = cooc.sort_values(['article_id_i', 'sim'], ascending=[True, False])
cooc['rank'] = cooc.groupby('article_id_i')['sim'].rank(method='first', ascending=False)
cooc = cooc[cooc['rank'] <= N_SIM_ITEMS]

# (article_i -> similar article_j, with similarity score)
item_sim = cooc[['article_id_i', 'article_id_j', 'sim']].rename(
    columns={
        'article_id_i': 'article_id',
        'article_id_j': 'sim_article_id',
        'sim': 'sim_score',
    }
)

item_sim.head()

Unnamed: 0,article_id,sim_article_id,sim_score
56951,108775044,736870001,0.037508
449140,108775044,841383003,0.017283
209066,108775044,557599022,0.016674
37843,111565001,837014001,0.224677
312763,111565001,148033001,0.093791


In [6]:
# Cell 6 – recent items per user

K_RECENT = 30  # how many recent items per user to base neighbors on

tx_sorted = tx.sort_values(['customer_id', 't_dat'], ascending=[True, False])
tx_sorted['rnk'] = tx_sorted.groupby('customer_id').cumcount() + 1
recent = tx_sorted[tx_sorted['rnk'] <= K_RECENT][['customer_id', 'article_id']].drop_duplicates()

recent.head()

Unnamed: 0,customer_id,article_id
31097478,-9223352921020755230,673396002
31097479,-9223352921020755230,812167004
31497298,-9223343869995384291,910601003
31419900,-9223343869995384291,908292002
31300300,-9223343869995384291,903926002


In [7]:
# Cell 7 – itemCF candidates per user (BATCHED to avoid OOM). Use N_SIM_ITEMS=350 & N_PER_USER=350 above.
BATCH_SIZE = 20000  # lower (e.g. 8000) if still OOM

recent_users = recent['customer_id'].unique()
out_parts = []

for start in range(0, len(recent_users), BATCH_SIZE):
    u_slice = recent_users[start:start + BATCH_SIZE]
    rb = recent.merge(cudf.DataFrame({'customer_id': u_slice}), on='customer_id', how='inner')

    cand_part = rb.merge(item_sim, on='article_id', how='left')
    cand_part = cand_part.dropna(subset=['sim_article_id'])

    cand_part = (
        cand_part
        .groupby(['customer_id', 'sim_article_id'])['sim_score']
        .sum()
        .reset_index()
        .rename(columns={'sim_article_id': 'article_id', 'sim_score': 'value'})
    )

    cand_part = cand_part.sort_values(['customer_id', 'value'], ascending=[True, False])
    cand_part['rank'] = cand_part.groupby('customer_id').cumcount() + 1
    cand_part = cand_part[cand_part['rank'] <= N_PER_USER][['customer_id', 'article_id', 'value']]

    out_parts.append(cand_part)
    del rb, cand_part
    gc.collect()

cand = cudf.concat(out_parts)
cand = cand.drop_duplicates(['customer_id', 'article_id'])
cand['window_type'] = 'itemcf'

cand.head()

Unnamed: 0,customer_id,article_id,value,window_type
27735,-9223352921020755230,673396015,0.051572,itemcf
3148746,-9223352921020755230,759465001,0.042399,itemcf
7486527,-9223352921020755230,749974009,0.0327,itemcf
9181283,-9223352921020755230,904734001,0.029349,itemcf
8513485,-9223352921020755230,749974008,0.028479,itemcf


In [8]:
# Cell 8 – add customer_id_hex and save

# Load sample_submission to map int->hex like in weekly_trending
customers = cudf.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'}
)
customers['customer_id_int'] = customers['customer_id'].str[-16:].str.hex_to_int().astype('int64')

cand = cudf.DataFrame(cand)

cand = cand.merge(
    customers[['customer_id', 'customer_id_int']],
    left_on='customer_id',
    right_on='customer_id_int',
    how='left'
)

cand = cand.rename(columns={'customer_id_x': 'customer_id', 'customer_id_y': 'customer_id_hex'})
cand = cand.drop('customer_id_int', axis=1)

# Types and dedup
cand['article_id'] = cand['article_id'].astype('int32')
cand['value']      = cand['value'].astype('float32')

cand = cand.drop_duplicates(['customer_id', 'article_id'])

# Save
# cand.to_pandas().to_csv('../data/outputs/candidates_itemcf.csv', index=False)
cand.to_pandas().to_parquet('../data/outputs/candidates_itemcf.parquet', index=False)

del tx, ui, pairs, cooc, item_freq, item_sim, recent, cand, customers, tx_sorted
gc.collect()

18