In [None]:
%cd /content/drive/MyDrive/DeepLearning/recommenders-main
!pip install retrying
!pip install pandera

/content/drive/MyDrive/DeepLearning/recommenders-main
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying
Successfully installed retrying-1.3.4
Collecting pandera
  Downloading pandera-0.22.1-py3-none-any.whl.metadata (15 kB)
Collecting typing_inspect>=0.6.0 (from pandera)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing_inspect>=0.6.0->pandera)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading pandera-0.22.1-py3-none-any.whl (261 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing_inspect, pandera
Successfully instal

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import tensorflow as tf
# Raise the TensorFlow logger threshold so that only ERROR-level messages are shown.
tf.get_logger().setLevel('ERROR')

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.utils.notebook_utils import store_metadata

# Report the interpreter and key library versions, one per line.
print(
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

System version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
Pandas version: 2.2.2
Tensorflow version: 2.17.1


In [None]:
# --- Run configuration -------------------------------------------------------

# File locations: the LightGCN YAML config, and the CSV paths handed to
# model.infer_embedding() at the end of the notebook.
yaml_file = "./recommenders/models/deeprec/config/lightgcn.yaml"
user_file = "./tests/resources/deeprec/lightgcn/user_embeddings.csv"
item_file = "./tests/resources/deeprec/lightgcn/item_embeddings.csv"

# Number of items to recommend per user (also used as k for the metrics).
TOP_K = 3

# Training schedule.
EPOCHS, BATCH_SIZE = 50, 1024

# Seed passed to the data model and to LightGCN; set None for non-deterministic results.
SEED = DEFAULT_SEED

In [None]:
# Path to the interaction log (a CSV file of user/item interactions).
interact_file_path = "./ml-1m/synthetic_dataset.csv"

# Load the whole file into memory and show pandas' truncated preview of it.
df = pd.read_csv(filepath_or_buffer=interact_file_path)
print(df)


        userID  itemID   timestamp  rating
0        36975    3794  1672930839       1
1        48204     882  1723374997       1
2        44109    3514  1724953765       1
3        46603    2889  1724804439       1
4        32666     541  1727377434       1
...        ...     ...         ...     ...
999995   10358     252  1710911627       1
999996   15208    1733  1686878135       1
999997    6735    1115  1718954908       1
999998    5109    4322  1692039494       1
999999    1640    3376  1694969435       1

[1000000 rows x 4 columns]


In [None]:
# Split the interactions into train / test frames (ratio=0.75).
train, test = python_stratified_split(
    df,
    ratio=0.75,
)

# Wrap both frames in the data object that LightGCN is constructed with.
data = ImplicitCF(
    train=train,
    test=test,
    seed=SEED,
)

In [None]:
# Hyper-parameters passed to prepare_hparams() alongside the YAML config file.
hparam_overrides = dict(
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)

model = LightGCN(hparams, data, seed=SEED)

# Time the complete training run.
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)21.2s: train loss = 0.28552 = (mf)0.28400 + (embed)0.00152
Epoch 2 (train)19.0s: train loss = 0.16238 = (mf)0.15872 + (embed)0.00366
Epoch 3 (train)19.0s: train loss = 0.12480 = (mf)0.11962 + (embed)0.00518
Epoch 4 (train)18.7s: train loss = 0.10563 = (mf)0.09932 + (embed)0.00631
Epoch 5 (train)19.2s + (eval)7.7s: train loss = 0.09317 = (mf)0.08600 + (embed)0.00717, recall = 0.23142, ndcg = 0.39224, precision = 0.37508, map = 0.34984
Epoch 6 (train)19.1s: train loss = 0.08423 = (mf)0.07637 + (embed)0.00786
Epoch 7 (train)18.7s: train loss = 0.07812 = (mf)0.06971 + (embed)0.00842
Epoch 8 (train)18.8s: train loss = 0.07185 = (mf)0.06297 + (embed)0.00889
Epoch 9 (train)18.8s: train loss = 0.06820 = (mf)0.05890 + (embed)0.00929
Epoch 10 (train)18.6s + (eval)7.3s: train loss = 0.06432 = (mf)0.05469 + (embed)0.00963, recall = 0.23474, ndcg = 0.39740, precision = 0.37991, map = 0.3

In [None]:
# Produce the TOP_K highest-scored items for the users in `test`;
# remove_seen=True is passed so previously seen items are not recommended again.
topk_scores = model.recommend_k_items(
    test,
    top_k=TOP_K,
    remove_seen=True,
)

# Preview the first rows of the recommendation table.
topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,1,1128,18.452774
1,1,3724,18.200329
2,1,3793,17.829363
3,2,961,10.809409
4,2,3587,10.337966


In [None]:
# Every ranking metric is computed at the same cut-off.
metric_kwargs = dict(k=TOP_K)

# NOTE: `map` here is the metric imported from recommenders, not the Python builtin.
eval_map = map(test, topk_scores, **metric_kwargs)
eval_ndcg = ndcg_at_k(test, topk_scores, **metric_kwargs)
eval_precision = precision_at_k(test, topk_scores, **metric_kwargs)
eval_recall = recall_at_k(test, topk_scores, **metric_kwargs)

# One "name<TAB>value" line per metric.
report_lines = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*report_lines, sep='\n')

MAP:	0.210285
NDCG:	0.387130
Precision@K:	0.369673
Recall@K:	0.230509


In [None]:
item_local_file_path = "./ml-1m/procedures.csv"
# Column names assigned to the procedures catalogue.
columns = [
    "ID", "PROCEDURE_CODE", "PROCEDURE_NAME", "PUBLISHED_AGENCY",
    "IMPLEMENTATION_AGENCY", "QDCBID", "FIELD_NAME", "AMOUNT", "ROW_STT"
]

# Read the catalogue; header=0 combined with names= replaces the file's own header row
# with `columns`.
item_df = pd.read_csv(item_local_file_path, names=columns, header=0, encoding="utf-8")

# NOTE: topk_scores["itemID"] is deliberately NOT translated through data.item2id here.
# The frame was evaluated directly against `test` in the metrics cell, so its itemID
# values are already in the dataset's own ID space. Passing them through an
# index -> itemID lookup a second time replaces every recommendation with an unrelated
# item (and produces NaN, hence float IDs, wherever no entry exists).

# Lookup from item ID to the procedure's display name.
# NOTE(review): assumes ROW_STT is the catalogue key matching itemID in the
# interaction data — confirm against how the dataset was generated.
itemID_to_procedure_name = dict(zip(item_df["ROW_STT"], item_df["PROCEDURE_NAME"]))

# Attach the name column without mutating in place, so re-running this cell is idempotent.
topk_scores = topk_scores.assign(
    PROCEDURE_NAME=topk_scores["itemID"].map(itemID_to_procedure_name)
)

# Export every recommendation to a CSV file.
output_file = "./ml-1m/results.csv"
topk_scores.to_csv(output_file, index=False)

print(f"Tất cả các gợi ý đã được lưu vào file: {output_file}")

Tất cả các gợi ý đã được lưu vào file: ./ml-1m/results.csv


In [None]:
# Read the exported recommendations back from the CSV file.
results_path = "./ml-1m/results.csv"
recommendations = pd.read_csv(results_path)

In [None]:
# User whose recommendations should be displayed.
user_id = 2

# Keep only the rows that belong to that user.
is_target_user = recommendations["userID"] == user_id
user_recommendations = recommendations[is_target_user]

# Print a header followed by the selected columns, without the index.
shown_columns = ["itemID", "PROCEDURE_NAME", "prediction"]
print(f"Gợi ý cho userID = {user_id}:")
print(user_recommendations[shown_columns].to_string(index=False))


Gợi ý cho userID = 2:
 itemID                                        PROCEDURE_NAME  prediction
   73.0 Cấp giấy xác nhận quá trình thực hành công tác xã hội   10.809409
 3744.0          Cấp giấy xác nhận nội dung quảng cáo mỹ phẩm   10.337966
 4023.0        Cấp Giấy phép, năng định cho người lái tàu bay   10.284188


In [None]:
# Pass the user/item embedding CSV paths from the config cell to the trained model.
# NOTE(review): presumably writes the learned user and item embeddings to these
# paths — confirm against LightGCN.infer_embedding.
model.infer_embedding(user_file, item_file)