In [1]:
import os, sys
# Remember the directory the kernel was started in. `globals().get` keeps the
# first recorded value when this cell is executed again, so the relative steps
# below are always taken from the same starting point. Without this, every
# re-run would climb two more directories and add another entry to sys.path.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR", os.path.abspath(''))

dir2 = _NOTEBOOK_DIR
dir1 = os.path.dirname(dir2)
# Make the parent of the notebook directory importable (added only once).
if dir1 not in sys.path:
    sys.path.append(dir1)
# Switch the working directory to two levels above the notebook directory;
# the relative paths used later in the notebook are resolved from there.
os.chdir(os.path.join(dir2, '..', '..'))

# Set here, before the heavy numeric imports in the next cell are executed.
os.environ["OMP_NUM_THREADS"] = "4"

In [2]:
from pathlib import Path

from hydra import compose, initialize

import pandas as pd
import torch 
import numpy as np 

from src.models.implicit import ImplicitBench
from src.preprocessing import ClassicDataset
from src.utils.logging import get_logger
from src.utils.processing import save_results, train_test_split, pandas_to_sparse
from src.utils.metrics import run_all_metrics, coverage
from src.utils.processing import pandas_to_aggregate
# Notebook-level logger obtained from the project's logging helper.
logger = get_logger(name=__name__)

# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell execution, so changes under src/ are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

2023-12-03 18:24:02.587798: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-03 18:24:02.587844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-03 18:24:02.589376: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-03 18:24:02.596555: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from implicit.evaluation import ranking_metrics_at_k

In [4]:
# Compose the Hydra configuration named 'config'; `cfg` is read by the cells below.
# NOTE(review): config_path is relative — presumably resolved against this
# notebook's location rather than the working directory set by the first cell;
# confirm it points at the project's config directory.
with initialize(config_path='../../config', version_base=None):
    cfg = compose(config_name='config')

In [27]:
# Ad-hoc inspection: show the row at position 56 of the training frame after
# grouping by userId with an identity apply.
# NOTE(review): `data_train` is only assigned in a cell that appears further
# down in the notebook (this one ran as In [27]); in document order this cell
# cannot run on a fresh kernel — move it below the data-loading cell or drop it.
data_train.groupby('userId').apply(lambda x: x).iloc[56]

userId                    57
itemId                    40
rating                     5
timestamp          977935034
implicit_rating            1
Name: (57, 4983), dtype: int64

In [28]:
def save_to_file(df, filename):
    """Write the positive interactions of ``df`` to a text file, one user per line.

    Only rows whose ``implicit_rating`` equals 1 are considered. For every
    ``userId`` group (in the order produced by ``groupby``) a line of the form
    ``"<userId> <itemId> <itemId> ..."`` is written, with the item ids kept in
    their row order. Users without any positive row produce no line.

    Args:
        df: DataFrame with ``userId``, ``itemId`` and ``implicit_rating`` columns.
        filename: Path of the output file; existing content is overwritten.
    """

    def _format_row(uid, rows):
        # Item ids are stringified and separated by single spaces.
        joined = ' '.join(rows['itemId'].astype(str))
        return f"{uid} {joined}\n"

    with open(filename, 'w') as out:
        positives = df.loc[df['implicit_rating'] == 1]
        out.writelines(
            _format_row(uid, rows) for uid, rows in positives.groupby('userId')
        )

# Saving the data: one text file per split, written relative to the current
# working directory; save_to_file keeps only rows with implicit_rating == 1.
# NOTE(review): `data_train` / `data_test` are assigned in a cell that appears
# further down in the notebook, so this cell cannot run in document order on a
# fresh kernel.
save_to_file(data_train, 'train.txt')
save_to_file(data_test, 'test.txt')

In [6]:
implicit_name: str = cfg['library']['name']
cfg_data = cfg['dataset']
cfg_model = cfg['library']['implicit_model']

# Cut-offs at which all metrics are reported. Defined once so the metric call,
# the coverage loop and the result index cannot drift apart.
TOP_K_VALUES = (5, 10, 20, 100)

# --- Data: load the cached train/test split or create and cache it ----------
dataset_folder = Path('/'.join(("preproc_data", cfg_data["name"])))
train_path = dataset_folder.joinpath("train.parquet")
test_path = dataset_folder.joinpath("test.parquet")

dataset = ClassicDataset()
dataset.prepare(cfg_data)

if train_path.exists() and test_path.exists():
    data_train = pd.read_parquet(train_path)
    data_test = pd.read_parquet(test_path)
else:
    data_train, data_test = train_test_split(
        dataset.prepared_data,
        test_size=cfg_data["splitting"]["test_size"],
        splitting_type=cfg_data["splitting"]["strategy"],
    )
    # The cache folder is not guaranteed to exist on a first run; create it so
    # writing the split does not fail.
    dataset_folder.mkdir(parents=True, exist_ok=True)
    data_train.to_parquet(train_path)
    data_test.to_parquet(test_path)

# Matrix shape derived from the largest user/item id in the TRAIN split.
# NOTE(review): ids that occur only in data_test and exceed these maxima would
# not fit this shape — confirm the splitting strategy rules that out.
shape = (
    data_train["userId"].max() + 1,
    data_train["itemId"].max() + 1,
)

train_interactions_sparse, train_weights_sparse = pandas_to_sparse(
    data_train,
    weighted=True,
    shape=shape
)
test_interactions_sparse, _ = pandas_to_sparse(
    data_test,
    weighted=False,
    shape=shape
)

# --- Model: load a saved model, or build (optionally tune), fit and save ----
model_folder = dataset_folder.joinpath(implicit_name, cfg_model["name"])

# Defined up front so the flag also exists when a saved model is loaded and the
# training branch below is skipped.
was_optimized = False

if cfg_model["saved_model"]:
    implicit = ImplicitBench.initialize_saved_model(
        model_folder.joinpath(cfg_model["saved_model_name"])
    )
else:
    implicit = None

# Train from scratch when no saved model was requested or none was returned.
if implicit is None:
    if cfg_model["enable_optimization"]:
        # Carve a validation split out of the training data for the
        # hyper-parameter search.
        data_train_opt, data_val = train_test_split(
            data_train,
            test_size=cfg_data["splitting"]["val_size"],
            splitting_type=cfg_data["splitting"]["strategy"],
        )
        train_opt_interactions_sparse, train_opt_weights_sparse = pandas_to_sparse(
            data_train_opt,
            weighted=True,
            shape=shape
        )
        # NOTE(review): this matrix is not used anywhere in the visible
        # notebook; the optimizer below receives the `data_val` DataFrame.
        # Confirm which of the two initialize_with_optimization expects.
        val_interactions_sparse, _ = pandas_to_sparse(
            data_val,
            weighted=False,
            shape=shape
        )
        implicit = ImplicitBench.initialize_with_optimization(
            cfg_model["name"],
            cfg_model["optuna_optimizer"],
            train_opt_interactions_sparse,
            train_opt_weights_sparse,
            data_val
        )
        was_optimized = True
    else:
        implicit = ImplicitBench.initialize_with_params(
            cfg_model["name"], cfg_model["model"]
        )
    # The final model is always fitted on the full training split.
    implicit.fit(train_interactions_sparse, train_weights_sparse, **cfg_model['learning'])
    implicit.save_model(model_folder)

# --- Evaluation: top-100 recommendations for every user in the test split ---
test_userids = np.sort(data_test.userId.unique())
top_100_items = implicit.recommend_k(k=100, userids=test_userids)

metrics = run_all_metrics(top_100_items, pandas_to_aggregate(data_test), list(TOP_K_VALUES))
# A plain loop is kept on purpose: it leaves `k` defined at notebook level
# exactly as before.
coverage_metrics = []
for k in TOP_K_VALUES:
    coverage_metrics.append(coverage(top_100_items, train_interactions_sparse.shape[1], k))

metrics_df = pd.DataFrame(metrics, index=list(TOP_K_VALUES), columns=(
    'Precision@k', 'Recall@K', 'MAP@K', 'nDCG@k', 'MRR@k', 'HitRate@k'
))
metrics_df['Coverage@K'] = coverage_metrics

# Timings recorded on the model wrapper, repeated on every K row.
metrics_df['Time_fit'] = implicit.learning_time
metrics_df['Time_predict'] = implicit.predict_time

print(metrics_df.loc[10])

[I 2023-12-03 18:25:44,393] A new study created in memory with name: no-name-585f38a0-a729-4e45-ba2b-069b0914e831
100%|██████████| 100/100 [00:00<00:00, 275.31it/s, train_auc=0.00%, skipped=19.57%]
[I 2023-12-03 18:25:44,860] Trial 0 finished with value: 0.008262504004631148 and parameters: {'factors': 105, 'iterations': 100, 'regularization': 0.4570563099801455, 'learning_rate': 0.06026718993550663}. Best is trial 0 with value: 0.008262504004631148.
100%|██████████| 20/20 [00:00<00:00, 234.43it/s, train_auc=91.30%, skipped=19.47%]
[I 2023-12-03 18:25:45,018] Trial 1 finished with value: 0.05659942036359312 and parameters: {'factors': 70, 'iterations': 20, 'regularization': 0.00019517224641449495, 'learning_rate': 0.08675143843171859}. Best is trial 1 with value: 0.05659942036359312.
100%|██████████| 75/75 [00:00<00:00, 178.90it/s, train_auc=89.32%, skipped=19.55%]
[I 2023-12-03 18:25:45,515] Trial 2 finished with value: 0.04018141566098731 and parameters: {'factors': 140, 'iterations'

Precision@k     0.175290
Recall@K        0.047983
MAP@K           0.106967
nDCG@k          0.180551
MRR@k           0.308453
HitRate@k       0.576756
Coverage@K      0.033600
Time_fit        0.434715
Time_predict    0.013176
Name: 10, dtype: float64


In [6]:
# Cross-check with implicit's own evaluator: underlying model, train and test
# interaction matrices, cut-off K=10.
ranking_metrics_at_k(implicit.model, train_interactions_sparse, test_interactions_sparse, K=10)

100%|██████████| 1944/1944 [00:00<00:00, 540805.66it/s]


{'precision': 0.0003439972480220158,
 'map': 7.001600365797896e-05,
 'ndcg': 0.0001696676511234754,
 'auc': 0.49965949506582036}

In [7]:
# NOTE(review): exact repeat of the preceding cell (same call, same saved
# output) — candidate for removal.
ranking_metrics_at_k(implicit.model, train_interactions_sparse, test_interactions_sparse, K=10)

100%|██████████| 1944/1944 [00:00<00:00, 641217.91it/s]


{'precision': 0.0003439972480220158,
 'map': 7.001600365797896e-05,
 'ndcg': 0.0001696676511234754,
 'auc': 0.49965949506582036}

In [8]:
# Show the K=10 row of the metrics table again.
# NOTE(review): the values saved under this cell differ from those saved under
# the cell that builds metrics_df, so the stored outputs stem from different
# runs — re-execute the notebook top to bottom before relying on either.
print(metrics_df.loc[10])

Precision@k     0.000109
Recall@K        0.000067
MAP@K           0.000070
nDCG@k          0.000170
MRR@k           0.000643
HitRate@k       0.001029
Coverage@K      0.013688
Time_fit        0.318789
Time_predict    0.035809
Name: 10, dtype: float64


In [9]:
# Work on copies: the scratch metric code below slices and sorts
# `predicted_items` in place, which must not alter `top_100_items`.
predicted_items = top_100_items.copy()
interactions = train_interactions_sparse.copy()

In [10]:
# Sanity check: dimensions of the training interaction matrix.
interactions.shape

(15739, 13369)

In [11]:
# Cut-off used by the novelty and diversity computations below.
k = 100

In [12]:
# Sanity check: summing over axis 0 yields one value per column of `interactions`.
np.array(interactions.sum(axis=0)).flatten().shape

(13369,)

In [14]:
# Keep only the first k recommendations per row.
predicted_items = predicted_items[:, :k]
# Per-column interaction total divided by the number of rows of `interactions`.
popularity = np.asarray(interactions.sum(axis=0)).ravel() / interactions.shape[0]
# Floor at 1e-12 so the logarithm below never receives zero.
popularity = np.maximum(popularity, 1e-12)
# Self-information in bits: the lower the popularity, the higher the novelty.
item_novelty = np.negative(np.log2(popularity))
# Look up the novelty of every recommended item (flattened over all rows).
recommended_item_novelties = np.take(item_novelty, predicted_items.flatten())
# Average over all recommended items.
novelty_score = recommended_item_novelties.mean()


In [15]:
from typing import Union
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.special import comb
import scipy.sparse as sps

from tqdm.auto import tqdm

In [16]:
# Intra-list diversity: 1 - mean pairwise cosine similarity between the
# interaction columns of the items recommended together.

# Keep only the first k recommendations per row and use integer ids for indexing.
predicted_items = predicted_items[:, :k].astype(np.int32)
# Sort every row in place so each unordered pair is always produced as (i, j)
# with i <= j and therefore maps to a single cache key.
predicted_items.sort(1)

# Column-oriented float64 copy: extracting one item column is cheap in CSC, and
# float64 keeps the dot products from being computed in an integer dtype.
interactions_csc = sps.csc_matrix(interactions, dtype=np.float64)
# Euclidean norm of every item column, computed once instead of once per pair.
col_norms = np.sqrt(np.asarray(interactions_csc.power(2).sum(axis=0)).ravel())

cosin_sim_history = {}  # (i, j) -> cosine similarity, shared across all rows
total_cos_sim = 0.0
for item_ids in tqdm(predicted_items, desc="Diversity calculation", leave=False):
    for i, j in combinations(item_ids, 2):
        cos_sim = cosin_sim_history.get((i, j))
        if cos_sim is None:
            norm_product = col_norms[i] * col_norms[j]
            if norm_product > 0:
                dot = interactions_csc[:, i].T.dot(interactions_csc[:, j]).sum()
                cos_sim = dot / norm_product
            else:
                # An item without any interaction has a zero vector; treat it as
                # having no similarity instead of producing 0/0 = NaN, which
                # would turn the whole score into NaN.
                cos_sim = 0.0
            cosin_sim_history[(i, j)] = cos_sim
        total_cos_sim += cos_sim

# Normalise by the number of pairs actually summed: rows times pairs per row.
# Using the real list width (not k) keeps the mean correct when fewer than k
# recommendations are available; for full-width lists the value is unchanged.
n_pairs = predicted_items.shape[0] * comb(predicted_items.shape[1], 2)
diversity_score = (1 - total_cos_sim / n_pairs) if n_pairs else float("nan")

                                                                          

In [None]:
# Display the diversity score computed in the previous cell.
diversity_score

0.9929829037930245