In [1]:
%load_ext autoreload
%autoreload 2

import IPython
from pathlib import Path
import os
# Locate this notebook on disk and make the directory three `.parent` hops
# above the notebook file the working directory, so that the relative
# `.data/...` paths used in the following cells resolve.
# NOTE(review): `__vsc_ipynb_file__` looks like a variable provided by the
# VS Code Jupyter integration; this lookup presumably raises KeyError in
# other front ends -- confirm before running elsewhere.
# The namespace is bound to its own name so the builtin `locals` is not shadowed.
kernel_namespace = IPython.extract_module_locals()[1]  # type: ignore
notebook_name = kernel_namespace["__vsc_ipynb_file__"]
os.chdir(Path(notebook_name).parent.parent.parent)

In [19]:
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from pycleora import SparseMatrix
import torch

In [3]:
# Input locations, relative to the working directory set in the first cell.
base_path = Path(".data/hm/base")
intermediate_path = Path(".data/hm/intermediate/cov1")

# Raw CSV inputs: the full transaction log and the sample submission.
relations = pd.read_csv(base_path.joinpath("transactions_train.csv"))
sample_submission = pd.read_csv(base_path.joinpath("sample_submission.csv"))

# Pre-computed train/validation interactions and the id lookup tables.
relations_train = pd.read_parquet(intermediate_path.joinpath("relations_train.parquet"))
relations_validation = pd.read_parquet(intermediate_path.joinpath("relations_validation.parquet"))
customer_id_map = pd.read_parquet(intermediate_path.joinpath("customer_id_map.parquet"))
article_id_map = pd.read_parquet(intermediate_path.joinpath("article_id_map.parquet"))

In [4]:
# Distinct customers in the sample submission and distinct articles in the
# full transaction log; the row count of the log is printed alongside them.
n_users = sample_submission["customer_id"].nunique()
n_items = relations["article_id"].nunique()

print(n_users, n_items, len(relations))

1371980 104547 31788324


In [5]:
# Preview the training interactions (the notebook shows a truncated view).
relations_train

Unnamed: 0,t_dat,price,sales_channel_id,session_id,item_id
4212358,2018-12-27,0.044051,1,0,10895
4212359,2018-12-27,0.035576,1,0,12746
4212360,2018-12-27,0.030492,1,0,5938
9663224,2019-05-02,0.010153,2,0,50328
10754876,2019-05-25,0.050831,2,0,865
...,...,...,...,...,...
24375394,2020-04-09,0.043203,2,1371978,84419
24375395,2020-04-09,0.013542,2,1371978,82129
25077914,2020-04-25,0.050831,2,1371978,84419
27806865,2020-06-22,0.016932,1,1371978,93746


In [6]:
# Cast item ids to strings so each session's items can be space-joined into
# the text lines handed to Cleora. The column is cast back to int further
# down, before the ids are used to index the embedding matrix.
relations_train['item_id'] = relations_train['item_id'].astype(str)

In [7]:
# One list of item-id strings per session.
items_per_session = relations_train.groupby('session_id')['item_id'].apply(list)
customer_products = items_per_session.to_numpy()

In [8]:
# Lazily render every session as one space-separated line of item ids.
# A generator expression is used so this cell does not depend on the builtin
# name `map`, which is easy to shadow in a notebook namespace (for example by
# a variable holding a MAP metric); a shadowed `map` would make this cell
# raise TypeError when re-run.
# The generator is single-use: re-run this cell before rebuilding the matrix.
cleora_input = (' '.join(item_ids) for item_ids in customer_products)

In [9]:
# Build the Cleora sparse matrix from the per-session item lines.
# NOTE(review): the `columns` spec string is handed to pycleora unchanged;
# see the pycleora documentation for the meaning of the
# `complex::reflexive::` modifiers.
mat = SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product')

In [10]:
# Initial embedding matrix; 1024 is the embedding width (the second axis of
# `embeddings`, as shown by the array printed later in the notebook).
embeddings = mat.initialize_deterministically(1024)

In [11]:
n_walks = 7

# Each iteration performs one propagation step followed by a row-wise rescale.
for walk_idx in range(n_walks):
    # Left Markov propagation is a great default; a symmetric matrix can be used as well.
    embeddings = mat.left_markov_propagate(embeddings)
    # L2-normalise each row so the embeddings reside on a hypersphere
    # (standardization is a possible alternative). Done in place.
    row_norms = np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
    np.divide(embeddings, row_norms, out=embeddings)


In [12]:
# Re-index the embedding rows so that row `k` holds the vector of item id `k`.
# Later cells index this matrix directly with item ids and treat top-k row
# indices as item ids, so rows must be addressed by id, not by rank.
# Sorting by id (argsort) only achieves that when the ids seen in training are
# exactly 0..N-1; scattering by id is equivalent in that case and stays
# correct when some ids are absent. Rows for absent ids are left as zeros,
# so their dot-product score against any user vector is 0.
# Assumes `mat.entity_ids[i]` labels row `i` of `embeddings` (the same
# assumption the argsort-based version relied on).
entity_item_ids = np.asarray(mat.entity_ids).astype(int)
embeddings_sorted = np.zeros(
    (int(entity_item_ids.max()) + 1, embeddings.shape[1]),
    dtype=embeddings.dtype,
)
embeddings_sorted[entity_item_ids] = embeddings

In [13]:
def build_session_embedding(item_ids):
    """Return the element-wise mean of the embedding rows selected by `item_ids`.

    `item_ids` is used as an integer index into the first axis of the
    module-level `embeddings_sorted` array; the selected rows are averaged
    over that axis.
    """
    session_item_vectors = embeddings_sorted[item_ids]
    return session_item_vectors.mean(axis=0)

# Restore integer item ids (they were stringified for Cleora) so they can be
# used as row indices, then average the item vectors of every session.
relations_train['item_id'] = relations_train['item_id'].astype(int)
session_item_groups = relations_train.groupby('session_id')['item_id']
user_embedding_series = session_item_groups.apply(build_session_embedding)

In [14]:
# Inspect which session ids received an embedding (sessions present in training).
user_embedding_series.index

Index([      0,       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
       1371970, 1371971, 1371972, 1371973, 1371974, 1371975, 1371976, 1371977,
       1371978, 1371979],
      dtype='int64', name='session_id', length=1356709)

In [15]:
# Stack the per-session mean vectors into one 2-D array, one row per session.
per_session_vectors = user_embedding_series.to_numpy()
user_embeddings = np.vstack(per_session_vectors)

In [16]:
# Display the propagated item embeddings (rows are in Cleora's entity order,
# not sorted by item id).
embeddings

array([[-0.03376987,  0.04804428, -0.05466268, ..., -0.06763975,
         0.03211973, -0.03638557],
       [-0.03956544,  0.05481985, -0.08671246, ..., -0.04105688,
         0.03566735, -0.0385193 ],
       [-0.03351728,  0.05266862, -0.0538512 , ..., -0.04580219,
         0.04112133, -0.03350928],
       ...,
       [-0.04707328,  0.04280025, -0.08462011, ..., -0.0466844 ,
         0.04498481, -0.04090003],
       [-0.04860504,  0.04248057, -0.08476947, ..., -0.04783266,
         0.04775231, -0.03875012],
       [-0.0237193 ,  0.0241503 , -0.05308963, ..., -0.02514559,
         0.02945288, -0.03008084]], shape=(103880, 1024), dtype=float32)

In [23]:
# Wrap the NumPy arrays as torch tensors for the batched similarity search.
user_embeddings_tensor = torch.as_tensor(user_embeddings)
embeddings_tensor = torch.as_tensor(embeddings_sorted)

In [24]:
# Sanity check: (number of sessions with an embedding, embedding width).
user_embeddings.shape

(1356709, 1024)

In [25]:
from tqdm import tqdm

N_candid = 50

# Score users against every item row in chunks of 1000 users and keep, per
# user, the row indices of the N_candid highest dot-product scores.
recommendations = []
for batch in tqdm(user_embeddings_tensor.split(1000)):
    similarity = batch @ embeddings_tensor.T
    top_items = torch.topk(similarity, N_candid, dim=1)
    recommendations.append(top_items.indices)

100%|██████████| 1357/1357 [04:38<00:00,  4.86it/s]


In [26]:
# Concatenate the per-batch (batch, N_candid) index tensors along the user axis.
recommendations = torch.cat(recommendations, dim=0)

In [27]:
# Popularity = number of distinct sessions each item occurs in: repeated
# (session, item) pairs are collapsed before counting.
unique_session_items = relations_train[["session_id", 'item_id']].drop_duplicates()
most_popular_items = unique_session_items["item_id"].value_counts()

In [28]:
# One row per session that has an embedding: its id and its candidate list.
candidate_lists = recommendations.numpy().tolist()
recommendations_cleora = pd.DataFrame(
    {
        "session_id": user_embedding_series.index,
        "candidates": candidate_lists,
    }
)

In [29]:
# Sessions in 0..n_users-1 that have no embedding fall back to the
# N_candid most popular items.
all_session_ids = np.arange(n_users)
missing_indices = np.setdiff1d(all_session_ids, user_embedding_series.index)
most_popular_list = most_popular_items.index[:N_candid].tolist()
fallback_candidates = [most_popular_list] * len(missing_indices)
missing_recommendations_df = pd.DataFrame(
    {
        'session_id': missing_indices,
        'candidates': fallback_candidates,
    }
)

In [30]:
# Combine the embedding-based and fallback recommendations, ordered by session id.
recommendation_parts = [recommendations_cleora, missing_recommendations_df]
recommendations_full = pd.concat(recommendation_parts, axis=0).sort_values("session_id")

In [31]:
from retail_recommender_system.evaluation.metrics import map_k, precision_k, recall_k

In [32]:
# Validation interactions as a tensor with two rows: session ids, then item ids.
validation_pairs = relations_validation[["session_id", "item_id"]].values
ground_truth = torch.from_numpy(validation_pairs).T

In [33]:
# Candidate lists as a 2-D integer tensor, one row per session in `recommendations_full`.
candidate_matrix = np.array(recommendations_full['candidates'].tolist())
recommendations_tensor = torch.from_numpy(candidate_matrix)

In [34]:
# Session id of every row of `recommendations_full`, in the same order.
session_ids = recommendations_full['session_id'].values
users_idx = torch.from_numpy(session_ids)

# K=1 (number of Cleora propagation iterations, `n_walks`)

In [30]:
# Ranking metrics at cutoff 12 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 12
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@12: 0.006668 | Precision@12: 0.004087 | Recall@12: 0.017953


In [31]:
# Ranking metrics at cutoff 30 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 30
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@30: 0.004415 | Precision@30: 0.002264 | Recall@30: 0.024979


In [32]:
# Ranking metrics at cutoff 50 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 50
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@50: 0.003394 | Precision@50: 0.001584 | Recall@50: 0.028735


# K=7 (number of Cleora propagation iterations, `n_walks`)

In [35]:
# Ranking metrics at cutoff 12 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 12
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@12: 0.001733 | Precision@12: 0.001024 | Recall@12: 0.004172


In [36]:
# Ranking metrics at cutoff 30 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 30
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@30: 0.001158 | Precision@30: 0.000636 | Recall@30: 0.006459


In [37]:
# Ranking metrics at cutoff 50 for the current recommendations.
# The MAP value is stored in `map_score` rather than `map`, so the builtin
# `map` is not shadowed for the rest of the kernel session.
top_k = 50
eval_kwargs = dict(k=top_k, users_idx=users_idx, n_users=n_users, n_items=n_items)

map_score = map_k(recommendations_tensor, ground_truth, **eval_kwargs)
prec = precision_k(recommendations_tensor, ground_truth, **eval_kwargs)
rec = recall_k(recommendations_tensor, ground_truth, **eval_kwargs)

print(f"MAP@{top_k}: {map_score:.6f} | Precision@{top_k}: {prec:.6f} | Recall@{top_k}: {rec:.6f}")

MAP@50: 0.000914 | Precision@50: 0.000489 | Recall@50: 0.008093


In [None]:
# Translate internal ids back to the external identifiers:
# session_id -> customer_id via `customer_id_map`, item_id -> article_id via
# `article_id_map`, and render each candidate list as one space-separated string.
recommendations_submission = recommendations_full.merge(customer_id_map, on="session_id").drop("session_id", axis=1)
item_to_article_map = dict(zip(article_id_map['item_id'], article_id_map['article_id']))
# H&M article ids are 10-character, zero-padded strings. If the map stores them
# as integers the leading zero is lost by `str(...)`, so pad back to width 10;
# this is a no-op when the ids are already 10-character strings.
recommendations_submission["prediction"] = recommendations_submission['candidates'].apply(
    lambda x: " ".join(str(item_to_article_map[item]).zfill(10) for item in x)
)
recommendations_submission = recommendations_submission.drop("candidates", axis=1)

In [None]:
# Keep only the two output columns, in this order, and write them as CSV
# without the DataFrame index.
submission_columns = ['customer_id', 'prediction']
recommendations_submission = recommendations_submission[submission_columns]
submission_csv_path = base_path / "recommendations_submission.csv"
recommendations_submission.to_csv(submission_csv_path, index=False)

In [None]:
import gzip
import shutil

# Stream the CSV into a gzip-compressed copy next to it.
with open(base_path / "recommendations_submission.csv", 'rb') as f_in, \
        gzip.open(base_path / "recommendations_submission.csv.gz", 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
# List the base data directory to check that the submission files were written.
!ls -lah .data/hm/base

In [None]:
# Upload the gzipped submission through the Kaggle CLI; `{...}` is interpolated
# by IPython from the Python namespace.
# NOTE(review): "Message" looks like a placeholder submission description --
# replace it with something that identifies this run.
!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f {base_path / "recommendations_submission.csv.gz"} -m "Message"