In [1]:
# Cell 1 – imports and config
import gc
import pandas as pd
import numpy as np

# Upper bound on the number of candidate rows kept for each customer_id;
# Cell 6 ranks the candidates per customer and keeps rows with rank <= this.
N_PER_USER = 10  # max same-product candidates per user

In [2]:
# Cell 2 – load transactions (history up to cut_ts)
# Only the three columns used downstream are read; article_id is parsed as
# int32 and the other two as strings so they can be converted explicitly below.
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)

# Compress the hex customer id into a signed 64-bit key: keep the last 16 hex
# digits, parse them as an unsigned 64-bit integer and reinterpret the same
# bits as int64 (identical wrap-around to np.int64(np.uint64(...))).
# Every distinct id is parsed once and mapped back onto the rows, instead of
# running a Python lambda that builds two numpy scalars for every transaction.
hex_tail = tx['customer_id'].str[-16:]
uniq_tail = hex_tail.unique()
uniq_int = np.array([int(h, 16) for h in uniq_tail], dtype=np.uint64).view(np.int64)
tx['customer_id'] = hex_tail.map(dict(zip(uniq_tail, uniq_int))).astype('int64')
del hex_tail, uniq_tail, uniq_int  # free the temporaries

tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# Keep only history up to 7 days before the most recent transaction date.
# NOTE(review): presumably the final week is held out for validation – confirm.
last_ts = tx['t_dat'].max()
cut_ts = last_ts - pd.Timedelta(days=7)

tx = tx[tx['t_dat'] <= cut_ts].copy()
tx.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Cell 3 – load article -> product_code map
# The dtype mapping is the single source of truth: its keys are the columns
# to read and its values are the dtypes they are parsed as.
art_schema = {'article_id': 'int32', 'product_code': 'int32'}
art = pd.read_csv(
    '../data/input_data/articles.csv',
    dtype=art_schema,
    usecols=list(art_schema),
)

art.head()

Unnamed: 0,article_id,product_code
0,108775015,108775
1,108775044,108775
2,108775051,108775
3,110065001,110065
4,110065002,110065


In [None]:
# Cell 4 – attach product_code to user transactions
# Drop any product_code left over from an earlier run of this cell before
# merging; otherwise a re-run yields product_code_x / product_code_y and the
# column selection below fails with a KeyError.
tx = tx.drop(columns=['product_code'], errors='ignore').merge(
    art, on='article_id', how='left'
)

# user-product pairs (which product_codes each user has touched)
user_prod = tx[['customer_id', 'product_code']].drop_duplicates()

# for each product_code, list its distinct articles in the row order of `art`,
# capped at MAX_ARTICLES_PER_PRODUCT entries
MAX_ARTICLES_PER_PRODUCT = 30
prod_articles = (
    art.groupby('product_code')['article_id']
       .apply(lambda ids: ids.drop_duplicates().tolist()[:MAX_ARTICLES_PER_PRODUCT])
       .reset_index()
)

user_prod.head(), prod_articles.head()

(           customer_id  product_code
 0    -6846340800584936        663713
 1    -6846340800584936        541518
 2 -8334631767138808638        505221
 3 -8334631767138808638        685687
 7  3618040643253904000        688873,
    product_code                         article_id
 0        108775  [108775015, 108775044, 108775051]
 1        110065  [110065001, 110065002, 110065011]
 2        111565             [111565001, 111565003]
 3        111586                        [111586001]
 4        111593                        [111593001])

In [None]:
# Cell 5 – build same-product candidates and filter already purchased items
# Inner join: a user-product pair whose product_code has no article list
# (for example a missing product_code left by the left merge in Cell 4) cannot
# produce candidates. With a left join such rows would carry NaN article_ids
# and make the int32 cast below raise.
cand = user_prod.merge(prod_articles, on='product_code', how='inner')

# explode article list: one row per (customer, candidate article)
cand = cand.explode('article_id')
cand['article_id'] = cand['article_id'].astype('int32')

# remove already purchased exact articles (optional but common)
# Anti-join: rows flagged 'left_only' exist in cand but not in purchased.
purchased = tx[['customer_id', 'article_id']].drop_duplicates()
cand = cand.merge(
    purchased,
    on=['customer_id', 'article_id'],
    how='left',
    indicator=True,
)
cand = cand[cand['_merge'] == 'left_only'].drop(columns=['_merge'])

# simple score: every candidate gets the same constant value
cand['value'] = 1.0
cand['window_type'] = 'same_product'

cand = cand[['customer_id', 'article_id', 'value', 'window_type']]
cand.head()

Unnamed: 0,customer_id,article_id,value,window_type
1,-6846340800584936,541518001,1.0,same_product
2,-6846340800584936,541518004,1.0,same_product
3,-6846340800584936,541518011,1.0,same_product
4,-6846340800584936,541518015,1.0,same_product
5,-6846340800584936,541518018,1.0,same_product


In [None]:
# Cell 6 – limit per user and add hex id
# De-duplicate first so a repeated (customer, article) pair can never consume
# one of the N_PER_USER slots.
cand = cand.drop_duplicates(['customer_id', 'article_id'])

# limit to N_PER_USER per user: order by customer, highest value first, then
# keep rows whose 0-based position within the customer is below the cap
cand = cand.sort_values(['customer_id', 'value'], ascending=[True, False])
cand = cand[cand.groupby('customer_id').cumcount() < N_PER_USER]

# map int customer_id -> hex like other pipelines
sub_customers = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)
# Same key derivation as for the transactions: last 16 hex digits parsed as
# unsigned 64-bit, bits reinterpreted as int64 (one array view instead of a
# per-row lambda).
sub_customers['customer_id_int'] = np.array(
    [int(h, 16) for h in sub_customers['customer_id'].str[-16:]],
    dtype=np.uint64,
).view(np.int64)

# One hex id per int key (first occurrence wins) so the merge below cannot
# multiply candidate rows if two hex ids share the same 16-digit tail.
# Renaming the hex column up front avoids the _x/_y suffix clash.
hex_lookup = (
    sub_customers.drop_duplicates('customer_id_int')
                 .rename(columns={'customer_id': 'customer_id_hex'})
)

cand = cand.merge(
    hex_lookup[['customer_id_hex', 'customer_id_int']],
    left_on='customer_id',
    right_on='customer_id_int',
    how='left',  # customers absent from the submission file keep a missing hex id
    validate='many_to_one',
).drop(columns=['customer_id_int'])
del hex_lookup

cand['value'] = cand['value'].astype('float32')

cand.head()

Unnamed: 0,customer_id,article_id,value,window_type,customer_id_hex
0,-9223352921020755230,539723002,1.0,same_product,e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...
1,-9223352921020755230,539723003,1.0,same_product,e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...
2,-9223352921020755230,539723004,1.0,same_product,e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...
3,-9223352921020755230,539723005,1.0,same_product,e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...
4,-9223352921020755230,539723006,1.0,same_product,e7df12e603a24cdaa788472825ee8ed928c1a05ea2c0bd...


In [None]:
# Cell 7 – save and clean up
# Write the candidate table to parquet without the row index.
cand.to_parquet(
    path='../data/outputs/candidates_same_product.parquet',
    index=False,
)

# Drop the references to every intermediate frame, then trigger a collection;
# the cell displays the value returned by gc.collect().
del tx, art
del user_prod, prod_articles
del cand, sub_customers, purchased
gc.collect()

22