In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install Java8 (Spark kh√¥ng t∆∞∆°ng th√≠ch t·ªët v·ªõi c√°c phi√™n b·∫£n Java kh√°c)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# download Spark (v√≠ d·ª• v·ªõi spark-3.5.1)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz

!tar xf spark-3.5.1-bin-hadoop3.tgz

# install findspark
!pip install -q findspark


# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

# Quick Installation Test
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Check the pyspark version
import pyspark
print(pyspark.__version__)


3.5.1


In [3]:
%%bash
mkdir -p /content/scripts
cat <<'EOF' > /content/scripts/getGpusResources.sh
#!/bin/bash
# Script gi√∫p Spark ph√°t hi·ªán GPU ƒëang c√≥
GPUS=$(nvidia-smi --query-gpu=index --format=csv,noheader | wc -l)
JSON="{\"name\": \"gpu\", \"addresses\": ["
for i in $(seq 0 $(($GPUS-1))); do
  JSON="${JSON}\"$i\""
  if [ $i -lt $(($GPUS-1)) ]; then
    JSON="${JSON}, "
  fi
done
JSON="${JSON}]}"
echo $JSON
EOF
chmod +x /content/scripts/getGpusResources.sh


In [4]:
# ==========================================================
# ‚öôÔ∏è 1Ô∏è‚É£ KH·ªûI T·∫†O SPARK
# ==========================================================
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
import matplotlib.pyplot as plt
spark = (
    SparkSession.builder
    .appName("Spark-Torch-NCF")
    .master("local[*]")
    # GPU cho DRIVER
    .config("spark.driver.resource.gpu.discoveryScript", "/content/scripts/getGpusResources.sh")
    .config("spark.driver.resource.gpu.amount", "1")
    # GPU cho EXECUTOR
    .config("spark.executor.resource.gpu.discoveryScript", "/content/scripts/getGpusResources.sh")
    .config("spark.executor.resource.gpu.amount", "1")
    .getOrCreate()
)

sc = spark.sparkContext
print("Spark started successfully with GPU support ‚úÖ")

Spark started successfully with GPU support ‚úÖ


In [5]:
user_index = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/mappings/user_index.parquet")
item_index = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/mappings/item_index.parquet")
item_means = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/preprocessed/item_means.parquet")

train_df_norm = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/preprocessed/train_norm.parquet")
valid_df_norm = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/preprocessed/valid_norm.parquet")
test_df_norm  = spark.read.parquet("/content/drive/MyDrive/XuLyDuLieuLon/preprocessed/test_norm.parquet")

In [6]:
print(user_index)
user_index.show(5)
print(item_index)
item_index.show(5)
print(item_means)
item_means.show(5)
print(train_df_norm)
train_df_norm.show()
print(valid_df_norm)
valid_df_norm.show()
print(test_df_norm)
test_df_norm.show()


DataFrame[user_id: string, userIndex: int]
+--------------------+---------+
|             user_id|userIndex|
+--------------------+---------+
|AE22236AFRRSMQIKG...|        0|
|AE222H3FGXWLHRFUM...|        1|
|AE224QIIILW6WVFAE...|        2|
|AE224XBMLKDOWJRHA...|        3|
|AE2255XXPI47TT6JO...|        4|
+--------------------+---------+
only showing top 5 rows

DataFrame[parent_asin: string, itemIndex: int]
+-----------+---------+
|parent_asin|itemIndex|
+-----------+---------+
| 0307449440|        0|
| 0307965570|        1|
| 0385344945|        2|
| 0394820371|        3|
| 0399211942|        4|
+-----------+---------+
only showing top 5 rows

DataFrame[itemIndex: int, item_mean: double]
+---------+-----------------+
|itemIndex|        item_mean|
+---------+-----------------+
|    68090|4.181818181818182|
|    43852|4.513513513513513|
|    46952|4.266666666666667|
|    69048|4.464285714285714|
|    43935|             4.42|
+---------+-----------------+
only showing top 5 rows

DataFra

In [7]:
from pyspark.sql import functions as F

# L·∫•y danh s√°ch item_id trong t·ª´ng t·∫≠p
train_items = train_df_norm.select("parent_asin").distinct()
valid_items = valid_df_norm.select("parent_asin").distinct()
test_items  = test_df_norm.select("parent_asin").distinct()

# 1Ô∏è‚É£ Item c√≥ trong VALID nh∆∞ng kh√¥ng c√≥ trong TRAIN
valid_not_in_train = valid_items.join(train_items, on="parent_asin", how="left_anti")
print("S·ªë l∆∞·ª£ng item ch·ªâ c√≥ trong valid m√† kh√¥ng c√≥ trong train:", valid_not_in_train.count())
valid_not_in_train.show(10, truncate=False)

# 2Ô∏è‚É£ Item c√≥ trong TEST nh∆∞ng kh√¥ng c√≥ trong TRAIN
test_not_in_train = test_items.join(train_items, on="parent_asin", how="left_anti")
print("S·ªë l∆∞·ª£ng item ch·ªâ c√≥ trong test m√† kh√¥ng c√≥ trong train:", test_not_in_train.count())
test_not_in_train.show(10, truncate=False)


S·ªë l∆∞·ª£ng item ch·ªâ c√≥ trong valid m√† kh√¥ng c√≥ trong train: 5996
+-----------+
|parent_asin|
+-----------+
|B0C6DYSX1P |
|B00N9RE2E6 |
|B09GJYNB6C |
|B093LJLMYQ |
|B09MS41GLQ |
|B09V2KMX56 |
|B0B531PFZD |
|B099JWSP5K |
|B095N68N2J |
|B097H1WVD4 |
+-----------+
only showing top 10 rows

S·ªë l∆∞·ª£ng item ch·ªâ c√≥ trong test m√† kh√¥ng c√≥ trong train: 10551
+-----------+
|parent_asin|
+-----------+
|B0B29LP7VW |
|B0B2MZW3DS |
|B097H1WVD4 |
|B0B8CG17JK |
|B08LDHL36R |
|B09C23Z36H |
|B0BFW28M2Y |
|B093LJLMYQ |
|B0B8J62QJF |
|B0BK7NCW4L |
+-----------+
only showing top 10 rows



In [8]:
# üîç L·ªçc c√°c d√≤ng trong test_df_norm c√≥ item thu·ªôc nh√≥m n√†y
test_missing_items = test_df_norm.join(test_not_in_train, on="parent_asin", how="inner")

# In ra 1 d√≤ng v√≠ d·ª•
test_missing_items.show(1, truncate=False)

+-----------+---------+---------+----------------------------+------+------------------+-----------------+
|parent_asin|itemIndex|userIndex|user_id                     |rating|rating_norm       |item_mean        |
+-----------+---------+---------+----------------------------+------+------------------+-----------------+
|B0B833LBX2 |82529    |9114     |AE7YYSYFEWAYIUCAK4W33CHPP4JQ|5.0   |0.4991011434207744|4.500898856579226|
+-----------+---------+---------+----------------------------+------+------------------+-----------------+
only showing top 1 row



In [9]:
from pyspark.sql import functions as F

max_item_index = valid_df_norm.agg(F.max("itemIndex").alias("max_itemIndex")).collect()[0]["max_itemIndex"]
print("Gi√° tr·ªã l·ªõn nh·∫•t c·ªßa itemIndex:", max_item_index)


Gi√° tr·ªã l·ªõn nh·∫•t c·ªßa itemIndex: 89957


Train

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.torch.distributor import TorchDistributor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

num_users = user_index.selectExpr("max(userIndex)").collect()[0][0] + 1
num_items = item_index.selectExpr("max(itemIndex)").collect()[0][0] + 1

# ===================== 2Ô∏è‚É£ Chuy·ªÉn d·ªØ li·ªáu Spark -> Pandas =====================
train_pd = train_df_norm.select("userIndex", "itemIndex", "rating_norm", "item_mean").toPandas()
valid_pd = valid_df_norm.select("userIndex", "itemIndex", "rating_norm", "item_mean").toPandas()
test_pd = test_df_norm.select("userIndex", "itemIndex", "rating_norm", "rating", "item_mean").toPandas()

In [11]:
# ===================== 3Ô∏è‚É£ T·∫°o Dataset =====================
class NCFDataset(Dataset):
    def __init__(self, df):
        self.user = torch.tensor(df["userIndex"].values, dtype=torch.long)
        self.item = torch.tensor(df["itemIndex"].values, dtype=torch.long)
        self.rating_norm = torch.tensor(df["rating_norm"].values, dtype=torch.float32)
        self.item_mean = torch.tensor(df["item_mean"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.user)

    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.rating_norm[idx], self.item_mean[idx]

train_dataset = NCFDataset(train_pd)
valid_dataset = NCFDataset(valid_pd)
test_dataset = NCFDataset(test_pd)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [12]:
# ===================== 4Ô∏è‚É£ ƒê·ªãnh nghƒ©a m√¥ h√¨nh NCF =====================
class NCFModel(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=64, hidden=[128, 64, 32], dropout=0.3):
        super(NCFModel, self).__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)

        layers = []
        input_size = embed_dim * 2
        for h in hidden:
            layers.append(nn.Linear(input_size, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_size = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(hidden[-1], 1)

    def forward(self, user, item, item_mean):
        u = self.user_embed(user)
        i = self.item_embed(item)
        x = torch.cat([u, i], dim=1)
        x = self.mlp(x)
        rating_norm_pred = self.output(x).squeeze(1)
        return rating_norm_pred + item_mean


In [13]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))

In [16]:
# ===================== 5Ô∏è‚É£ Hu·∫•n luy·ªán + Validation =====================
def train_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = NCFModel(num_users, num_items).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    loss_fn_rmse = RMSELoss()
    loss_fn_mae = torch.nn.L1Loss()  # MAE Loss

    best_val_loss = float('inf')
    best_model_state = None
    history_train_RMSE = []
    history_valid_RMSE = []
    history_train_MAE = []
    history_valid_MAE = []
    patience = 20
    no_improve_count = 0

    for epoch in range(100):
        # ----- TRAIN -----
        model.train()
        total_loss_rmse = 0
        total_loss_mae = 0
        for user, item, rating_norm, item_mean in train_loader:
            user, item, rating_norm, item_mean = user.to(device), item.to(device), rating_norm.to(device), item_mean.to(device)
            optimizer.zero_grad()
            pred = model(user, item, item_mean)
            target = rating_norm + item_mean
            loss_rmse = loss_fn_rmse(pred, target)
            loss_mae = loss_fn_mae(pred, target)
            loss_rmse.backward()
            optimizer.step()
            total_loss_rmse += loss_rmse.item()
            total_loss_mae += loss_mae.item()

        avg_train_rmse = total_loss_rmse / len(train_loader)
        avg_train_mae = total_loss_mae / len(train_loader)
        history_train_RMSE.append(avg_train_rmse)
        history_train_MAE.append(avg_train_mae)

        # ----- VALIDATION -----
        model.eval()
        val_loss_rmse = 0
        val_loss_mae = 0
        with torch.no_grad():
            for user, item, rating_norm, item_mean in valid_loader:
                user, item, rating_norm, item_mean = user.to(device), item.to(device), rating_norm.to(device), item_mean.to(device)
                pred = model(user, item, item_mean)
                target = rating_norm + item_mean
                val_loss_rmse += loss_fn_rmse(pred, target).item()
                val_loss_mae += loss_fn_mae(pred, target).item()

        avg_val_rmse = val_loss_rmse / len(valid_loader)
        avg_val_mae = val_loss_mae / len(valid_loader)
        history_valid_RMSE.append(avg_val_rmse)
        history_valid_MAE.append(avg_val_mae)

        print(f"Epoch {epoch+1} | TrainRMSE={avg_train_rmse:.4f} | ValRMSE={avg_val_rmse:.4f} | TrainMAE={avg_train_mae:.4f} | ValMAE={avg_val_mae:.4f}")

        # ----- L∆∞u model t·ªët nh·∫•t -----
        if avg_val_rmse < best_val_loss:
            best_val_loss = avg_val_rmse
            best_model_state = model.state_dict().copy()
            no_improve_count = 0
        else:
            no_improve_count += 1

        # ----- Early Stopping -----
        if no_improve_count >= patience:
            print(f"‚èπÔ∏è D·ª´ng s·ªõm t·∫°i epoch {epoch+1} (ValRMSE kh√¥ng c·∫£i thi·ªán trong {patience} epoch li√™n ti·∫øp).")
            break

    # ----- L∆∞u model t·ªët nh·∫•t -----
    torch.save(best_model_state, "ncf_best_model.pt")
    print("‚úÖ Training done! Best Val RMSE:", best_val_loss)

    # ===== V·∫Ω bi·ªÉu ƒë·ªì RMSE =====
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(history_train_RMSE) + 1), history_train_RMSE, marker='o', label='Train RMSE')
    plt.plot(range(1, len(history_valid_RMSE) + 1), history_valid_RMSE, marker='s', label='Validation RMSE')
    plt.title('NCF Training: RMSE tr√™n Train & Validation')
    plt.xlabel('Epoch')
    plt.ylabel('RMSE')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("rmse_plot.png", dpi=300)
    print("üìä Bi·ªÉu ƒë·ªì RMSE ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: rmse_plot.png")

    # ===== V·∫Ω bi·ªÉu ƒë·ªì MAE =====
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(history_train_MAE) + 1), history_train_MAE, marker='o', label='Train MAE')
    plt.plot(range(1, len(history_valid_MAE) + 1), history_valid_MAE, marker='s', label='Validation MAE')
    plt.title('NCF Training: MAE tr√™n Train & Validation')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("mae_plot.png", dpi=300)
    print("üìä Bi·ªÉu ƒë·ªì MAE ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: mae_plot.png")

    # ===== V·∫Ω bi·ªÉu ƒë·ªì RMSE + MAE c√πng nhau =====
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(history_train_RMSE) + 1), history_train_RMSE, marker='o', label='Train RMSE')
    plt.plot(range(1, len(history_valid_RMSE) + 1), history_valid_RMSE, marker='s', label='Val RMSE')
    plt.plot(range(1, len(history_train_MAE) + 1), history_train_MAE, marker='^', label='Train MAE')
    plt.plot(range(1, len(history_valid_MAE) + 1), history_valid_MAE, marker='v', label='Val MAE')
    plt.title('NCF Training: RMSE+MAE tr√™n Train & Validation')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("rmse_mae_plot.png", dpi=300)
    print("üìä Bi·ªÉu ƒë·ªì RMSE + MAE ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: rmse_mae_plot.png")


In [17]:

# ===================== 6Ô∏è‚É£ Ph√¢n t√°n hu·∫•n luy·ªán b·∫±ng Spark =====================
TorchDistributor(num_processes=1, local_mode=True).run(train_model)


INFO:TorchDistributor:Started local training with 1 processes
INFO:TorchDistributor:Finished local training with 1 processes


Epoch 1 | TrainRMSE=0.9682 | ValRMSE=1.2343 | TrainMAE=0.6610 | ValMAE=0.8369
Epoch 2 | TrainRMSE=0.9371 | ValRMSE=1.2166 | TrainMAE=0.6280 | ValMAE=0.8177
Epoch 3 | TrainRMSE=0.9112 | ValRMSE=1.2153 | TrainMAE=0.6040 | ValMAE=0.8118
Epoch 4 | TrainRMSE=0.9032 | ValRMSE=1.2147 | TrainMAE=0.5970 | ValMAE=0.8105
Epoch 5 | TrainRMSE=0.9023 | ValRMSE=1.2175 | TrainMAE=0.5964 | ValMAE=0.8096
Epoch 6 | TrainRMSE=0.9020 | ValRMSE=1.2169 | TrainMAE=0.5960 | ValMAE=0.8079
Epoch 7 | TrainRMSE=0.9015 | ValRMSE=1.2145 | TrainMAE=0.5953 | ValMAE=0.8144
Epoch 8 | TrainRMSE=0.9006 | ValRMSE=1.2167 | TrainMAE=0.5947 | ValMAE=0.8102
Epoch 9 | TrainRMSE=0.8989 | ValRMSE=1.2162 | TrainMAE=0.5928 | ValMAE=0.8117
Epoch 10 | TrainRMSE=0.8972 | ValRMSE=1.2160 | TrainMAE=0.5912 | ValMAE=0.8153
Epoch 11 | TrainRMSE=0.8945 | ValRMSE=1.2164 | TrainMAE=0.5886 | ValMAE=0.8095
Epoch 12 | TrainRMSE=0.8924 | ValRMSE=1.2161 | TrainMAE=0.5864 | ValMAE=0.8056
Epoch 13 | TrainRMSE=0.8915 | ValRMSE=1.2164 | TrainMAE=0.585

In [18]:
# ===================== 5Ô∏è‚É£ Load model ƒë√£ train =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCFModel(num_users, num_items).to(device)
model.load_state_dict(torch.load("/content/ncf_best_model.pt", map_location=device))
model.eval()

# ===================== 6Ô∏è‚É£ D·ª± ƒëo√°n =====================
predictions = []
with torch.no_grad():
    for user, item, rating_norm, item_mean in test_loader:
        user, item, item_mean = user.to(device), item.to(device), item_mean.to(device)
        pred = model(user, item, item_mean)
        predictions.extend(pred.cpu().numpy())

# ===================== 7Ô∏è‚É£ G·ªôp k·∫øt qu·∫£ l·∫°i =====================
test_pd["rating_pred"] = predictions

# Hi·ªÉn th·ªã 5 d√≤ng ƒë·∫ßu
print(test_pd[["userIndex", "itemIndex", "rating", "rating_pred"]].head())


   userIndex  itemIndex  rating  rating_pred
0     193523      89574     4.0     4.880085
1      41477      21220     5.0     4.505364
2       9114      82529     5.0     4.478246
3      97066      87656     3.0     4.444818
4     179928      44596     3.0     4.634758


In [19]:
from pyspark.sql import functions as F, Window
from pyspark.sql.types import FloatType
import torch

# ----- 2Ô∏è‚É£ H√†m d·ª± ƒëo√°n ƒë∆°n l·∫ª -----
def predict_single(user_idx, item_idx, item_mean):
    user = torch.tensor([user_idx], dtype=torch.long).to(device)
    item = torch.tensor([item_idx], dtype=torch.long).to(device)
    item_mean_t = torch.tensor([item_mean], dtype=torch.float32).to(device)
    with torch.no_grad():
        pred = model(user, item, item_mean_t)
    return float(pred.item())

# ----- 3Ô∏è‚É£ ƒêƒÉng k√Ω UDF -----
predict_udf = F.udf(lambda u, i, m: predict_single(u, i, m), FloatType())

# ----- 4Ô∏è‚É£ T·∫°o c·ªôt d·ª± ƒëo√°n -----
pred_df = test_df_norm.withColumn("rating_pred", predict_udf("userIndex", "itemIndex", "item_mean"))

# ----- 5Ô∏è‚É£ X·∫øp h·∫°ng -----
w_true = Window.partitionBy("userIndex").orderBy(F.desc("rating"), F.asc("itemIndex"))
w_pred = Window.partitionBy("userIndex").orderBy(F.desc("rating_pred"), F.asc("itemIndex"))

pred_df = (
    pred_df
    .withColumn("true_rank", F.row_number().over(w_true))
    .withColumn("pred_rank", F.row_number().over(w_pred))
)

pred_df.orderBy(F.asc("userIndex")).show(10)


+---------+-----------+---------+--------------------+------+------------------+------------------+-----------+---------+---------+
|itemIndex|parent_asin|userIndex|             user_id|rating|       rating_norm|         item_mean|rating_pred|true_rank|pred_rank|
+---------+-----------+---------+--------------------+------+------------------+------------------+-----------+---------+---------+
|    87909| B0BXP2YLPJ|        2|AE224QIIILW6WVFAE...|   5.0|            0.1875|            4.8125|   4.902306|       10|        1|
|    72547| B0969TZWJL|        2|AE224QIIILW6WVFAE...|   5.0|0.4991011434207744| 4.500898856579226|   4.543292|        2|        2|
|    76796| B09MQFB4X2|        2|AE224QIIILW6WVFAE...|   5.0|0.4991011434207744| 4.500898856579226|   4.543292|        4|        3|
|    76997| B09N799FPW|        2|AE224QIIILW6WVFAE...|   5.0|0.4991011434207744| 4.500898856579226|   4.543292|        5|        4|
|    77614| B09Q236HWT|        2|AE224QIIILW6WVFAE...|   5.0|0.4991011434207

In [20]:
rmse_df = pred_df.withColumn("squared_error", F.pow(F.col("rating_pred") - F.col("rating"), 2))
rmse = rmse_df.agg(F.sqrt(F.mean("squared_error")).alias("RMSE")).collect()[0]["RMSE"]

print(f"RMSE tr√™n t·∫≠p test: {rmse:.4f}")

RMSE tr√™n t·∫≠p test: 1.0878


In [21]:
mae_df = pred_df.withColumn("abs_error", F.abs(F.col("rating_pred") - F.col("rating")))
mae = mae_df.agg(F.mean("abs_error").alias("MAE")).collect()[0]["MAE"]

print(f"MAE tr√™n t·∫≠p test: {mae:.4f}")

MAE tr√™n t·∫≠p test: 0.7382


Precision@K

In [22]:
from pyspark.sql import functions as F, Window

# ‚úÖ 1Ô∏è‚É£ Thi·∫øt l·∫≠p K
K = 10

# ‚úÖ 2Ô∏è‚É£ X·∫øp h·∫°ng theo d·ª± ƒëo√°n gi·∫£m d·∫ßn
w_user_pred = Window.partitionBy("userIndex").orderBy(F.desc("rating_pred"))

# ‚úÖ 3Ô∏è‚É£ L·∫•y top-K item ƒë∆∞·ª£c d·ª± ƒëo√°n cao nh·∫•t cho m·ªói user
topk_pred = (
    pred_df
    .withColumn("rank_order", F.row_number().over(w_user_pred))
    .filter(F.col("rank_order") <= K)
)

# ‚úÖ 4Ô∏è‚É£ X√°c ƒë·ªãnh item n√†o l√† "relevant" (rating th·∫≠t ‚â• 4)
topk_pred = topk_pred.withColumn("is_relevant", (F.col("rating") >= 4).cast("int"))

# ‚úÖ 5Ô∏è‚É£ T√≠nh precision@K cho t·ª´ng user
precision_user = (
    topk_pred.groupBy("userIndex")
    .agg((F.sum("is_relevant") / F.count("*")).alias("precision_at_k"))
)

# ‚úÖ 6Ô∏è‚É£ Trung b√¨nh precision@K to√†n b·ªô user
precision_at_k = precision_user.agg(F.mean("precision_at_k").alias("mean_precision_at_k")).collect()[0]["mean_precision_at_k"]



In [23]:
print(f"Precision@{K}: {precision_at_k:.4f}")
precision_user.show(10)

Precision@10: 0.7952
+---------+------------------+
|userIndex|    precision_at_k|
+---------+------------------+
|        2|               1.0|
|        4|               0.0|
|        5|0.7142857142857143|
|        6|               0.0|
|        7|               1.0|
|        8|               1.0|
|       11|               1.0|
|       17|               1.0|
|       19|0.8571428571428571|
|       24|               1.0|
+---------+------------------+
only showing top 10 rows



NDCG@K

In [24]:
from pyspark.sql import functions as F, Window

# ‚úÖ 1Ô∏è‚É£ Thi·∫øt l·∫≠p gi√° tr·ªã K
K = 10

# ‚úÖ 2Ô∏è‚É£ L·∫•y top-K item d·ª± ƒëo√°n theo pred_rank
w_pred = Window.partitionBy("userIndex").orderBy(F.asc("pred_rank"))
topk_pred = (
    pred_df
    .withColumn("rank_order", F.row_number().over(w_pred))
    .filter(F.col("rank_order") <= K)
)

# ‚úÖ 3Ô∏è‚É£ L·∫•y top-K item th·∫≠t theo true_rank
w_true = Window.partitionBy("userIndex").orderBy(F.asc("true_rank"))
topk_true = (
    pred_df
    .withColumn("rank_order", F.row_number().over(w_true))
    .filter(F.col("rank_order") <= K)
)

# ‚úÖ 4Ô∏è‚É£ T√≠nh DCG (Discounted Cumulative Gain)
dcg_df = (
    topk_pred
    .withColumn("dcg_term", F.col("rating") / F.log2(F.col("pred_rank") + F.lit(1)))
    .groupBy("userIndex")
    .agg(F.sum("dcg_term").alias("dcg"))
)

# ‚úÖ 5Ô∏è‚É£ T√≠nh IDCG (Ideal DCG)
idcg_df = (
    topk_true
    .withColumn("idcg_term", F.col("rating") / F.log2(F.col("true_rank") + F.lit(1)))
    .groupBy("userIndex")
    .agg(F.sum("idcg_term").alias("idcg"))
)

# ‚úÖ 6Ô∏è‚É£ T√≠nh NDCG = DCG / IDCG
ndcg_df = (
    dcg_df.join(idcg_df, "userIndex", "inner")
    .withColumn("ndcg_at_k", F.col("dcg") / F.col("idcg"))
)

# ‚úÖ 7Ô∏è‚É£ Trung b√¨nh NDCG@K tr√™n to√†n b·ªô user
ndcg_at_k = ndcg_df.agg(F.mean("ndcg_at_k").alias("mean_ndcg_at_k")).collect()[0]["mean_ndcg_at_k"]




In [25]:
print(f"NDCG@{K} = {ndcg_at_k:.4f}")

# ‚úÖ 8Ô∏è‚É£ Xem chi ti·∫øt top v√†i user
ndcg_df.orderBy(F.asc("userIndex")).show(10)

NDCG@10 = 0.9820
+---------+------------------+------------------+------------------+
|userIndex|               dcg|              idcg|         ndcg_at_k|
+---------+------------------+------------------+------------------+
|        2| 22.71779669044173| 22.71779669044173|               1.0|
|        4|               3.0|               3.0|               1.0|
|        5| 14.76516944817165| 16.14425048905436|0.9145775741142201|
|        6|               1.0|               1.0|               1.0|
|        7|10.654648767857287|10.654648767857287|               1.0|
|        8| 18.18999819660374| 18.18999819660374|               1.0|
|       11|               5.0|               5.0|               1.0|
|       17|               4.0|               4.0|               1.0|
|       19| 17.52333152993707| 17.52333152993707|               1.0|
|       24| 8.154648767857287| 8.154648767857287|               1.0|
+---------+------------------+------------------+------------------+
only showing top 