In [1]:
%cd /content/drive/MyDrive/DeepLearning/recommenders-main
!pip install retrying
!pip install pandera

/content/drive/MyDrive/DeepLearning/recommenders-main
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying
Successfully installed retrying-1.3.4
Collecting pandera
  Downloading pandera-0.22.1-py3-none-any.whl.metadata (15 kB)
Collecting typing_inspect>=0.6.0 (from pandera)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing_inspect>=0.6.0->pandera)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading pandera-0.22.1-py3-none-any.whl (261 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing_inspect, pandera
Successfully insta

In [2]:
import sys
import logging
import scipy
import numpy as np
import pandas as pd

from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.sar import SAR
from recommenders.utils.notebook_utils import store_metadata

# Record the interpreter and core library versions for reproducibility.
print(
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    f"SciPy version: {scipy.__version__}",
    sep="\n",
)

System version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
Pandas version: 2.2.2
NumPy version: 1.26.4
SciPy version: 1.13.1


In [4]:
# Number of items to recommend per user; also used as k for the ranking metrics.
TOP_K: int = 3


In [5]:
# Configure logging at DEBUG level (not INFO) with timestamped,
# level-tagged messages.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.DEBUG,
)

In [7]:

# Path to the interactions CSV (read relative to the current working directory)
interact_file_path = "./ml-1m/synthetic_dataset.csv"

data = pd.read_csv(interact_file_path)
print(data)

# Store ratings as 32-bit floats to reduce memory consumption
data = data.astype({"rating": np.float32})

data.head()

        userID  itemID   timestamp  rating
0        36975    3794  1672930839       1
1        48204     882  1723374997       1
2        44109    3514  1724953765       1
3        46603    2889  1724804439       1
4        32666     541  1727377434       1
...        ...     ...         ...     ...
999995   10358     252  1710911627       1
999996   15208    1733  1686878135       1
999997    6735    1115  1718954908       1
999998    5109    4322  1692039494       1
999999    1640    3376  1694969435       1

[1000000 rows x 4 columns]


Unnamed: 0,userID,itemID,timestamp,rating
0,36975,3794,1672930839,1.0
1,48204,882,1723374997,1.0
2,44109,3514,1724953765,1.0
3,46603,2889,1724804439,1.0
4,32666,541,1727377434,1.0


In [10]:
# Column-name mapping shared by the splitter and the SAR model below.
header = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    col_prediction="prediction",
)

In [11]:
# Stratified split of the interactions using the user/item columns from
# `header`; ratio=0.75 is presumably the training share (confirm against the
# splitter docs). The seed is fixed so the split is repeatable.
train, test = python_stratified_split(
    data,
    ratio=0.75,
    col_user=header["col_user"],
    col_item=header["col_item"],
    seed=42,
)


In [12]:
# Build the SAR recommender: column names come from `header`, item similarity
# is Jaccard, and the time-decay formula is enabled.
# NOTE(review): time_decay_coefficient=30 is presumably a half-life in days,
# and time_now=None presumably defers to the data's own timestamps - confirm
# against the SAR documentation.
model = SAR(
    **header,
    similarity_type="jaccard",
    timedecay_formula=True,
    time_decay_coefficient=30,
    time_now=None,
)

In [13]:
# Fit the SAR model on the training split only; `test` is held out for evaluation.
model.fit(train)

In [14]:
# Request TOP_K recommendations for the test data.
# NOTE(review): remove_seen=True presumably excludes items the user already
# interacted with during training - confirm against the SAR documentation.
top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

In [16]:
# Inner-join the recommendations against the distinct item IDs found in `data`,
# then order by user and predicted score, both descending.
# NOTE(review): the right-hand frame has only an index and no columns, so
# despite the variable name this join adds no title (or any other) column.
top_k_with_titles = (
    top_k
    .join(
        data[["itemID"]].drop_duplicates().set_index("itemID"),
        on="itemID",
        how="inner",
    )
    .sort_values(["userID", "prediction"], ascending=False)
)

top_k_with_titles.head(10)

Unnamed: 0,userID,itemID,prediction
149997,50000,2283,0.131619
149998,50000,2285,0.124487
149999,50000,120,0.122749
149994,49999,834,0.141017
149995,49999,832,0.133006
149996,49999,826,0.131019
149991,49998,2631,0.13304
149992,49998,2496,0.132697
149993,49998,2629,0.130102
149988,49997,3415,0.217838


In [17]:
# The four ranking metrics take identical arguments, so build them once.
args = [test, top_k]
kwargs = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_prediction": "prediction",
    "relevancy_method": "top_k",
    "k": TOP_K,
}

# Evaluate in the same order as before: MAP, NDCG, precision, recall.
eval_map, eval_ndcg, eval_precision, eval_recall = (
    metric(*args, **kwargs)
    for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
)

In [18]:
# Report the evaluation results, one metric per line.
print(
    "\n".join(
        [
            "Model:",
            f"Top K:\t\t {TOP_K}",
            f"MAP:\t\t {eval_map:f}",
            f"NDCG:\t\t {eval_ndcg:f}",
            f"Precision@K:\t {eval_precision:f}",
            f"Recall@K:\t {eval_recall:f}",
        ]
    )
)

Model:
Top K:		 3
MAP:		 0.307842
NDCG:		 0.347014
Precision@K:	 0.331873
Recall@K:	 0.205626
