In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install jieba



In [99]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import jieba
import numpy as np
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

In [4]:
spark = SparkSession.builder \
    .appName("User-Course") \
    .getOrCreate()

## Read

In [5]:
course_df = spark.read.json("/content/drive/MyDrive/Big Data/Input/course.json")

In [6]:
print("Số lượng course: ", course_df.count())

Số lượng course:  3781


In [7]:
course_df.printSchema()

root
 |-- about: string (nullable = true)
 |-- field: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prerequisites: string (nullable = true)
 |-- resource: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chapter: string (nullable = true)
 |    |    |-- resource_id: string (nullable = true)
 |    |    |-- titles: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [26]:
mapping_course_df = pd.read_csv("/content/drive/MyDrive/Big Data/Output/course_mapping.csv")

In [27]:
print("Số lượng course: ", mapping_course_df.shape[0])

Số lượng course:  3148


## Filter and prepare dâtaata

### Filter

In [41]:
mapping_course = mapping_course_df.set_index("mapped_id")["original_id"].to_dict()

In [28]:
# Lấy danh sách các original_id từ pandas dataframe
original_ids = mapping_course_df['original_id'].tolist()

# Lọc course_df với các id có trong danh sách original_ids
filtered_course_df = course_df.filter(course_df['id'].isin(original_ids))

In [11]:
filtered_course_df.show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+--------+--------------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
null_counts = filtered_course_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in filtered_course_df.columns])
null_counts.show()

+-----+-----+---+----+-------------+--------+
|about|field| id|name|prerequisites|resource|
+-----+-----+---+----+-------------+--------+
|    1|    0|  0|   0|            1|       0|
+-----+-----+---+----+-------------+--------+



### Prepare

In [13]:
result_df = filtered_course_df.select(
    col('id'),
    concat(
        lit('名称: '), col('name'),
        lit(' 领域: '), array_join(col('field'), ', '),  # Nối các phần tử bằng dấu phẩy
        lit(' 关于: '), col('about')
    ).alias('description')
)
result_df.show(5, truncate=False)

+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id      |description                                                                                                                                                                                                                   |
+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|C_584329|名称: 微积分——极限理论与一元函数 领域: 应用经济学, 数学, 物理学, 理论经济学 关于: 本课程是理工科的一门数学基础课，系统、全面地介绍了一元函数微积分学。课程既保持了数学的严谨和抽象的特点，也注意了数学概念的直观和形象的一面。|
|C_584381|名称: 新闻摄影 领域: 艺术学, 新闻传播学 关于: 掌握基本的摄影技能，了解图片新闻的工作方式，训练对生活的观察和热爱，发展对图像的审美和批评能力，以及，培养一个终身的爱好。                                                      |
|C_597208|名称

In [22]:
# Chuyển sang pandas để xử lý
description_pdf = result_df.toPandas()

In [23]:
description_pdf.isnull().sum()

Unnamed: 0,0
id,0
description,1


In [113]:
# Drop rows with null values in the 'description' column
description_pdf_cleaned = description_pdf.fillna("")

In [114]:
# Chinese stopwords
chinese_stopwords = {
    '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
    '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
    '自己', '这', '那', '里', '什么', '时候', '可以', '但是', '如果', '因为'
}

def chinese_tokenizer_with_stopwords(text):
    tokens = jieba.cut(text)
    return [token for token in tokens if token not in chinese_stopwords and len(token.strip()) > 0]

In [115]:
vectorizer = TfidfVectorizer(
    tokenizer=chinese_tokenizer_with_stopwords,
    lowercase=False,
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

tfidf_matrix = vectorizer.fit_transform(description_pdf_cleaned['description'])
description_pdf_cleaned['tfidf_embedding'] = tfidf_matrix.toarray().tolist()

In [116]:
description_pdf_cleaned.head()

Unnamed: 0,id,description,tfidf_embedding
0,C_584329,"名称: 微积分——极限理论与一元函数 领域: 应用经济学, 数学, 物理学, 理论经济学 关...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,C_584381,"名称: 新闻摄影 领域: 艺术学, 新闻传播学 关于: 掌握基本的摄影技能，了解图片新闻的工...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,C_597208,名称: 数据挖掘：理论与算法 领域: 计算机科学与技术 关于: 最有趣的理论+最有用的算法=...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,C_597225,名称: 大学计算机 领域: 关于: 大学计算机课程将以计算思维为导向，以计算机原理、概念为...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,C_597229,"名称: 财务分析与决策 领域: 应用经济学, 管理科学与工程 关于: 这门课程用财务语言解构...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [117]:
courses_embedding = tfidf_matrix.toarray().tolist()

In [93]:
train_df = "/content/drive/MyDrive/Big Data/Output/KGAT-final/train.txt"
eval_df = "/content/drive/MyDrive/Big Data/Output/KGAT-final/val.txt"

with open(train_df, 'r') as file:
    train_lines = file.readlines()
with open(eval_df, 'r') as file:
    eval_lines = file.readlines()

In [118]:
def get_users_embedding(train_lines, description_pdf_cleaned, mapping_course):
  users_embedding = {}
  for line in tqdm(train_lines, desc="Đang đọc file"):
    parts = line.strip().split()
    course_ids = list(parts[1:])
    course_ids = [mapping_course[int(course_id)] for course_id in course_ids]
    course_embeddings = description_pdf_cleaned[description_pdf_cleaned['id'].isin(course_ids)]['tfidf_embedding'].tolist()
    users_embedding[int(parts[0])] = np.mean(course_embeddings,axis=0)
  return users_embedding

In [122]:
users_embedding = get_users_embedding(train_lines, description_pdf_cleaned, mapping_course)

Đang đọc file: 100%|██████████| 39872/39872 [09:51<00:00, 67.44it/s]


In [120]:
len(users_embedding)

39872

In [121]:
len(courses_embedding)

3148

In [83]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 139712 stored elements and shape (3147, 5000)>

In [140]:
def compute_all_similarities_vectorized(user_ids_tensor, users_embedding, courses_embedding):
    """
    Tính toán similarities bằng vector hóa (user_ids là torch.Tensor).
    Trả về kết quả dưới dạng torch.Tensor.
    """
    # Chuyển user_ids từ tensor sang numpy (nếu cần)
    user_ids = user_ids_tensor.cpu().numpy() if isinstance(user_ids_tensor, torch.Tensor) else user_ids_tensor

    # Lấy embedding của các user tương ứng
    users_matrix = np.array([users_embedding[user_id] for user_id in user_ids])

    # Tính cosine similarity giữa user embeddings và course embeddings
    similarity_matrix = cosine_similarity(users_matrix, courses_embedding)  # numpy array

    # Chuyển kết quả sang torch.Tensor và trả về
    return torch.from_numpy(similarity_matrix).float()

In [85]:
user_similarity_scores = compute_all_similarities_vectorized(users_embedding, courses_embedding)

In [87]:
user_similarity_scores[999]

array([0.12156208, 0.08970466, 0.03541417, ..., 0.10571857, 0.01969273,
       0.03473543])

In [124]:
train_user_dict = {}
for line in tqdm(train_lines, desc="Đang đọc file"):
    parts = line.strip().split()
    course_ids = list(parts[1:])
    train_user_dict[int(parts[0])] = [int(course_id) for course_id in course_ids]

Đang đọc file: 100%|██████████| 39872/39872 [00:00<00:00, 79202.91it/s]


In [125]:
eval_user_dict = {}
for line in tqdm(eval_lines, desc="Đang đọc file"):
    parts = line.strip().split()
    course_ids = list(parts[1:])
    eval_user_dict[int(parts[0])] = [int(course_id) for course_id in course_ids]

Đang đọc file: 100%|██████████| 39872/39872 [00:01<00:00, 22648.54it/s]


In [126]:
test_batch_size = 256
user_ids = list(users_course.keys())
user_ids_batches = [
        user_ids[i : i + test_batch_size]
        for i in range(0, len(user_ids), test_batch_size)
]

In [127]:
user_ids_batches = [torch.LongTensor(d) for d in user_ids_batches]

In [138]:
Ks = [20,40,60,80,100]
n_items = 3148
item_ids = torch.arange(n_items, dtype=torch.long)

In [131]:
def calc_recall(rank, ground_truth, k):
    """
    calculate recall of one example
    """
    return len(set(rank[:k]) & set(ground_truth)) / float(len(set(ground_truth)))


def precision_at_k(hit, k):
    """
    calculate Precision@k
    hit: list, element is binary (0 / 1)
    """
    hit = np.asarray(hit)[:k]
    return np.mean(hit)


def precision_at_k_batch(hits, k):
    """
    calculate Precision@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    res = hits[:, :k].mean(axis=1)
    return res


def average_precision(hit, cut):
    """
    calculate average precision (area under PR curve)
    hit: list, element is binary (0 / 1)
    """
    hit = np.asarray(hit)
    precisions = [precision_at_k(hit, k + 1) for k in range(cut) if len(hit) >= k]
    if not precisions:
        return 0.
    return np.sum(precisions) / float(min(cut, np.sum(hit)))


def dcg_at_k(rel, k):
    """
    calculate discounted cumulative gain (dcg)
    rel: list, element is positive real values, can be binary
    """
    rel = np.asfarray(rel)[:k]
    dcg = np.sum((2 ** rel - 1) / np.log2(np.arange(2, rel.size + 2)))
    return dcg


def ndcg_at_k(rel, k):
    """
    calculate normalized discounted cumulative gain (ndcg)
    rel: list, element is positive real values, can be binary
    """
    idcg = dcg_at_k(sorted(rel, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(rel, k) / idcg


def ndcg_at_k_batch(hits, k):
    """
    calculate NDCG@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    hits_k = hits[:, :k]
    dcg = np.sum((2 ** hits_k - 1) / np.log2(np.arange(2, k + 2)), axis=1)

    sorted_hits_k = np.flip(np.sort(hits), axis=1)[:, :k]
    idcg = np.sum((2 ** sorted_hits_k - 1) / np.log2(np.arange(2, k + 2)), axis=1)

    idcg[idcg == 0] = np.inf
    ndcg = (dcg / idcg)
    return ndcg


def recall_at_k(hit, k, all_pos_num):
    """
    calculate Recall@k
    hit: list, element is binary (0 / 1)
    """
    hit = np.asfarray(hit)[:k]
    return np.sum(hit) / all_pos_num


def recall_at_k_batch(hits, k):
    """
    calculate Recall@k
    hits: array, element is binary (0 / 1), 2-dim
    """
    res = (hits[:, :k].sum(axis=1) / hits.sum(axis=1))
    return res


def F1(pre, rec):
    if pre + rec > 0:
        return (2.0 * pre * rec) / (pre + rec)
    else:
        return 0.


def calc_auc(ground_truth, prediction):
    try:
        res = roc_auc_score(y_true=ground_truth, y_score=prediction)
    except Exception:
        res = 0.
    return res


def logloss(ground_truth, prediction):
    logloss = log_loss(np.asarray(ground_truth), np.asarray(prediction))
    return logloss


def calc_metrics_at_k(cf_scores, train_user_dict, test_user_dict, user_ids, item_ids, Ks):
    """
    cf_scores: (n_users, n_items)
    """
    test_pos_item_binary = np.zeros([len(user_ids), len(item_ids)], dtype=np.float32)
    for idx, u in enumerate(user_ids):
        train_pos_item_list = train_user_dict[u]
        test_pos_item_list = test_user_dict[u]
        # print(train_pos_item_list)
        cf_scores[idx][train_pos_item_list] = -np.inf
        test_pos_item_binary[idx][test_pos_item_list] = 1

    try:
        _, rank_indices = torch.sort(cf_scores.cuda(), descending=True)    # try to speed up the sorting process
    except:
        _, rank_indices = torch.sort(cf_scores, descending=True)
    rank_indices = rank_indices.cpu()

    binary_hit = []
    for i in range(len(user_ids)):
        binary_hit.append(test_pos_item_binary[i][rank_indices[i]])
    binary_hit = np.array(binary_hit, dtype=np.float32)

    metrics_dict = {}
    for k in Ks:
        metrics_dict[k] = {}
        metrics_dict[k]['precision'] = precision_at_k_batch(binary_hit, k)
        metrics_dict[k]['recall']    = recall_at_k_batch(binary_hit, k)
        metrics_dict[k]['ndcg']      = ndcg_at_k_batch(binary_hit, k)
    return metrics_dict

In [144]:
cf_scores = []
metric_names = ['precision', 'recall', 'ndcg']
metrics_dict = {k: {m: [] for m in metric_names} for k in Ks}

with tqdm(total=len(user_ids_batches), desc='Filtering Iteration') as pbar:
  for batch_user_ids in user_ids_batches:
    batch_scores = compute_all_similarities_vectorized(batch_user_ids, users_embedding, courses_embedding)
    batch_metrics = calc_metrics_at_k(batch_scores, train_user_dict, eval_user_dict, batch_user_ids.numpy(), item_ids.numpy(), Ks)
    cf_scores.append(batch_scores.numpy())
    for k in Ks:
        for m in metric_names:
            metrics_dict[k][m].append(batch_metrics[k][m])
    pbar.update(1)

cf_scores = np.concatenate(cf_scores, axis=0)
for k in Ks:
  for m in metric_names:
    metrics_dict[k][m] = np.concatenate(metrics_dict[k][m]).mean()

Filtering Iteration: 100%|██████████| 156/156 [04:12<00:00,  1.62s/it]


In [148]:
metrics_dict

{20: {'precision': np.float32(0.00036742576),
  'recall': np.float32(0.007348515),
  'ndcg': np.float64(0.0022789701869892773)},
 40: {'precision': np.float32(0.00039376004),
  'recall': np.float32(0.0157504),
  'ndcg': np.float64(0.003973206255815406)},
 60: {'precision': np.float32(0.00040295615),
  'recall': np.float32(0.024177367),
  'ndcg': np.float64(0.005458376673931588)},
 80: {'precision': np.float32(0.00038905747),
  'recall': np.float32(0.0311246),
  'ndcg': np.float64(0.006587335928518824)},
 100: {'precision': np.float32(0.0003772071),
  'recall': np.float32(0.037720706),
  'ndcg': np.float64(0.007600911975582456)}}