In [1]:
# Cell 1 – imports
import os
# MUST be set before importing numpy/scipy/implicit to prevent OpenBLAS crash
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit
import gc
from datetime import timedelta

# Target week configuration.
# The week end date is read from the HM_TARGET_WEEK_END environment variable
# (format YYYY-MM-DD) so the notebook can be driven from a shell or scheduler.
# When the variable is unset or empty, DEFAULT_TARGET_WEEK_END is used instead.
# (`os` is already imported at the top of this cell.)
DEFAULT_TARGET_WEEK_END = '2020-08-19'
printdate = os.environ.get("HM_TARGET_WEEK_END") or DEFAULT_TARGET_WEEK_END
# Keep the variable populated in this process, as the original cell did.
os.environ['HM_TARGET_WEEK_END'] = printdate

# `printdate` is later embedded in the output filename, so reject anything that
# is not a plain YYYY-MM-DD date (pd.to_datetime raises ValueError on mismatch).
TARGET_WEEK_END = pd.to_datetime(printdate, format='%Y-%m-%d')
assert not pd.isna(TARGET_WEEK_END), "Set HM_TARGET_WEEK_END=YYYY-MM-DD"

# Configuration
N_CANDIDATES = 100  # Number of candidates to retrieve per user
HISTORY_DAYS = 90   # Use last 3 months (older data adds noise to embeddings)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell 2 – Load Data
# Only the date, customer and article columns are read; dates come in as text
# and are converted to timestamps right after loading.
df = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)
df['t_dat'] = pd.to_datetime(df['t_dat'])

# Window boundaries:
#   last_ts  – end of the target week
#   cut_ts   – 7 days before last_ts (last day allowed in training)
#   start_ts – HISTORY_DAYS before cut_ts (exclusive lower bound)
last_ts = TARGET_WEEK_END
cut_ts = last_ts - pd.Timedelta(days=7)
start_ts = cut_ts - pd.Timedelta(days=HISTORY_DAYS)

# Training data: rows with start_ts < t_dat <= cut_ts, i.e. the history window
# without the final 7 days before TARGET_WEEK_END.
in_history = df['t_dat'].gt(start_ts) & df['t_dat'].le(cut_ts)
train_df = df.loc[in_history].copy()


In [3]:
# Cell 3 – Create Sparse Matrix
# Encode customers and articles as dense integer codes and assemble the
# user x item interaction matrix used to fit the model.

assert len(train_df) > 0, "No transactions in the training window; check HM_TARGET_WEEK_END / HISTORY_DAYS"

user_cat = train_df['customer_id'].astype('category')
item_cat = train_df['article_id'].astype('category')
train_df['user_code'] = user_cat.cat.codes
train_df['item_code'] = item_cat.cat.codes

# Categorical codes use -1 for missing values; such rows cannot be placed in the matrix.
assert (train_df['user_code'] >= 0).all(), "customer_id contains missing values"
assert (train_df['item_code'] >= 0).all(), "article_id contains missing values"

# code -> original id. Position i in `.categories` is the value encoded as code i,
# so enumerating the categories gives the mapping without iterating every row.
user_map = dict(enumerate(user_cat.cat.categories))
item_map = dict(enumerate(item_cat.cat.categories))

# Every transaction contributes 1; csr_matrix sums duplicate (user, item) pairs,
# so a cell ends up holding the number of purchases in the window.
train_df['confidence'] = 1

sparse_user_item = sparse.csr_matrix(
    (train_df['confidence'], (train_df['user_code'], train_df['item_code'])),
    shape=(len(user_map), len(item_map))
)

print("Matrix shape (users, items):", sparse_user_item.shape)
# nnz / total cells is the share of filled cells, i.e. the density of the matrix.
density_pct = 100 * sparse_user_item.nnz / (sparse_user_item.shape[0] * sparse_user_item.shape[1])
print(f"Density: {density_pct:.5f}%")


Matrix shape (users, items): (546122, 41551)
Sparsity: 0.01761%


In [4]:
# Cell 4 – Train ALS Model
# Author's guidance: 64 or 128 factors is standard, and 15 iterations is
# usually enough.
als_params = dict(
    factors=64,
    regularization=0.05,
    iterations=15,
    random_state=42,   # fixed seed for repeatable factor initialisation
    num_threads=0,     # NOTE(review): presumably "use all cores" per implicit docs — confirm
)
model = implicit.als.AlternatingLeastSquares(**als_params)

# Fit on the user x item matrix built in the previous cell.
model.fit(sparse_user_item)


100%|██████████| 15/15 [12:48<00:00, 51.20s/it]


In [5]:
# Cell 5 – Generate Candidates
# Score every user that appears in the training window, in order of first
# appearance in train_df.
unique_user_codes = train_df['user_code'].unique()

# Rows of the interaction matrix, in the same order as unique_user_codes.
user_rows = sparse_user_item[unique_user_codes]

# Items the user already interacted with are kept as candidates
# (filter_already_liked_items=False).
ids, scores = model.recommend(
    unique_user_codes,
    user_rows,
    N=N_CANDIDATES,
    filter_already_liked_items=False,
)

In [6]:
# Cell 6 – Format Output
# Turn the (n_users, n_recs) id/score arrays from the previous cell into a long
# table with one row per (user, candidate item).

# Use the actual width of `ids` rather than N_CANDIDATES: the arrays are the
# source of truth, and the constant may have been edited (or fewer columns
# returned) since they were computed.
assert ids.shape == scores.shape, "ids and scores must have identical shapes"
assert ids.shape[0] == len(unique_user_codes), "one row of recommendations per user expected"
n_per_user = ids.shape[1]

# Flatten the results (ravel avoids an extra copy where the array is contiguous)
candidates = pd.DataFrame({
    'user_code': np.repeat(unique_user_codes, n_per_user),
    'item_code': ids.ravel(),
    'value': scores.ravel()
})

# Map back to original IDs; codes absent from the maps become NaN
candidates['customer_id'] = candidates['user_code'].map(user_map)
candidates['article_id'] = candidates['item_code'].map(item_map)

# Clean up: drop rows that could not be mapped, then fix dtypes
candidates = candidates.dropna()
candidates['article_id'] = candidates['article_id'].astype('int32')
candidates['value'] = candidates['value'].astype('float32')
candidates['window_type'] = 'als_embedding'

In [7]:
# Cell 7 – add hex id & save (same pattern as c_same_product)

def _hex_tail_to_int64(hex_ids: pd.Series) -> pd.Series:
    """Parse the last 16 hex characters of each id in ``hex_ids`` as an integer and cast to int64."""
    return hex_ids.str[-16:].apply(lambda h: int(h, 16)).astype('int64')


sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'}
)
sub['customer_id_int'] = _hex_tail_to_int64(sub['customer_id'])

# candidates['customer_id'] is a string from transactions; build the int key.
# Each customer appears on many candidate rows, so parse every distinct id once
# and broadcast the result with a lookup instead of re-parsing per row.
unique_hex = pd.Series(candidates['customer_id'].unique())
int_by_hex = pd.Series(_hex_tail_to_int64(unique_hex).to_numpy(), index=unique_hex.to_numpy())

# Copy only the columns that are written out (user_code / item_code are not needed).
cand_pd = candidates[['article_id', 'value', 'window_type']].copy()
cand_pd['customer_id_int'] = candidates['customer_id'].map(int_by_hex).astype('int64')

# Attach the hex id from the submission file via the int key. Left join keeps
# every candidate row; validate= raises if an int key is duplicated in `sub`,
# which would otherwise silently multiply candidate rows.
cand_pd = cand_pd.merge(
    sub.rename(columns={'customer_id': 'customer_id_hex'}),
    on='customer_id_int',
    how='left',
    validate='many_to_one',
).rename(columns={'customer_id_int': 'customer_id'})   # int64 key becomes customer_id

# Final cols and dtypes
cand_pd['article_id'] = cand_pd['article_id'].astype('int32')
cand_pd['value']      = cand_pd['value'].astype('float32')
cand_pd = cand_pd[['customer_id', 'article_id', 'value', 'window_type', 'customer_id_hex']]

# Make sure the target directory exists before writing.
out_path = f'../data/outputs/candidates/candidates_embedding_{printdate}.parquet'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
cand_pd.to_parquet(out_path, index=False)
print('Saved embedding candidates.')
del cand_pd, sub, unique_hex, int_by_hex
gc.collect()

Saved embedding candidates.


0