# Xây dựng các đồ thị, mỗi đồ thị lấy 1 thực thể course làm trung tâm

### Đọc file CSV chứa course ID và nhãn của course

In [1]:
import pandas as pd
from utils.encode import *
from utils.edge import *

# Load the labeled-course CSV, keeping only the course ID and its
# course_classification label, then preview the first rows.
courses = pd.read_csv("/home/ptdat/Desktop/graph/course_labeled.csv")[["id", "course_classification"]]
courses.head()

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`


Unnamed: 0,id,course_classification
0,C_655852,normal
1,C_655850,bad
2,C_654554,normal
3,C_654506,bad
4,C_629558,normal


### Kết nối đến MongoDB (config tại file docker-compose.yml)

Để mở MongoDB, chạy lệnh
```sh
sudo docker compose up
```

In [2]:
from pymongo import MongoClient
from pymongo.database import Database

# Connect to the local MongoDB server and select the database named "database".
client = MongoClient(host="mongodb://root:example@localhost:27017/")
db = client.get_database("database")

Hàm truy vấn các thực thể liên quan đến course có ID là *course_id*

![](graph-diagram.png)

In [3]:
from itertools import product
import re

def _linked_ids(collection, course_id: str, suffix_pattern: str) -> list:
    """Return the entity IDs found at the end of one course's link ``_id``s.

    Link documents are looked up by ``_id`` prefix. A bare prefix match also
    returns the links of any course whose ID merely starts with *course_id*
    (e.g. ``C_12`` vs ``C_123``), so a key is rejected when the prefix is
    directly followed by another digit.
    NOTE(review): this assumes course IDs end in digits, as the IDs visible
    in this notebook do -- confirm against the full dataset.

    Args:
        collection: Collection whose ``_id`` values start with a course ID.
        course_id: ID of the course whose links are wanted.
        suffix_pattern: Regex locating the linked entity's ID inside the key.

    Returns:
        The matched ID of every link document belonging to *course_id*.

    Raises:
        ValueError: A key of this course does not contain *suffix_pattern*.
    """
    # re.escape keeps the ID literal should it ever contain regex metacharacters.
    cursor = collection.find(
        {"_id": {"$regex": f"^{re.escape(course_id)}"}},
        projection=["_id"])  # only the key is needed
    suffix = re.compile(suffix_pattern)
    boundary = len(course_id)

    linked = []
    for link in cursor:
        key = link["_id"]
        if key[boundary:boundary + 1].isdigit():
            continue  # link of a longer course ID that shares this prefix
        match = suffix.search(key)
        if match is None:
            raise ValueError(f"Link id {key!r} does not match {suffix_pattern!r}")
        linked.append(match.group(0))
    return linked


def query(course_id: str, db: Database):
    """Fetch one course and the entities related to it from MongoDB.

    Args:
        course_id: ``_id`` of the course document.
        db: Database holding the collections read below.

    Returns:
        A dict with the keys ``course`` (the course document without its
        ``field`` entry), ``fields`` (the value stored under ``field``, or an
        empty list when it is absent), and the lists ``resources``,
        ``schools``, ``teachers``, ``users``, ``comments``, ``replies``,
        ``exercises`` and ``videos``.

    Raises:
        LookupError: No course document has this ``_id``.
        ValueError: A course-teacher / user_course key has an unexpected form.
    """
    # Course
    course = db["course"].find_one(
        {"_id": course_id},
        projection=["about", "prerequisites", "school_id", "field"])
    if course is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise LookupError(f"Course {course_id!r} not found")
    # Fields (a course without a "field" entry simply has no fields)
    fields = course.pop("field", None)
    if fields is None:
        fields = []
    # Resources; "title" is the first element of the stored "titles" array
    resources = list(db["resource"].find(
        {"course_id": course_id},
        projection={"title": {"$arrayElemAt": ["$titles", 0]}, "ccid": 1}))
    # Schools: take every school document, keeping one document per name
    schools = list(db["school"].aggregate([
        {
            "$group": {
                "_id": "$name",
                "school": {"$first": "$$ROOT"}
            }
        },
        {
            "$replaceRoot": {"newRoot": "$school"}
        },
        {
            "$project": {"name": 1, "about": 1, "motto": 1}
        }
    ]))
    # Teachers
    teacher_ids = _linked_ids(db["course-teacher"], course_id, r"T_\d+$")
    teachers = list(db["teacher"].find(
        {"_id": {"$in": teacher_ids}},
        projection=["about", "job_title", "org_name"]))
    # Users; only gender codes 0, 1 and 2 (the majority) are kept
    user_ids = _linked_ids(db["user_course"], course_id, r"U_\d+$")
    users = list(db["user"].find(
        {"_id": {"$in": user_ids}, "gender": {"$in": [0, 1, 2]}},
        projection=["school", "gender"]))
    # Comments
    comments = list(db["comment"].find(
        {"course_id": course_id},
        projection=["text", "user_id"]))
    # Replies to those comments
    comment_ids = [comment["_id"] for comment in comments]
    replies = list(db["reply"].find(
        {"comment_id": {"$in": comment_ids}},
        projection=["text", "user_id", "comment_id"]))
    # Exercises attached to the course's resources
    resource_ids = [resource["_id"] for resource in resources]
    exercises = list(db["problem"].find(
        {"exercise_id": {"$in": resource_ids}},
        projection=["title", "content", "typetext", "exercise_id"]))
    # Videos, looked up through the distinct "ccid" values of the resources
    ccids = list({resource["ccid"] for resource in resources if "ccid" in resource})
    videos = list(db["video"].find(
        {"_id": {"$in": ccids}},
        projection=["name", "text"]))

    return dict(
        course=course,
        fields=fields,
        resources=resources,
        schools=schools,
        teachers=teachers,
        users=users,
        comments=comments,
        replies=replies,
        exercises=exercises,
        videos=videos,
    )

Kiểm tra các thực thể truy vấn được từ course có *course_id* là C_676953

In [4]:
# Take the course ID stored at row label 52, fetch its related entities,
# then show the ID followed by the fetched sample.
c_id = courses["id"].loc[52]
sample = query(c_id, db)
print(c_id)
sample

C_676953


{'course': {'_id': 'C_676953',
  'prerequisites': 'C语言',
  'about': '不仅仅是一门编程语言，汇编语言更是计算机系统软硬件的分界与桥梁，是理解整个计算机系统的有效起点，为学习后续的计算机系统课程打下基础。',
  'school_id': 'S_1'},
 'fields': ['计算机科学与技术'],
 'resources': [{'_id': 'V_71673',
   'ccid': 'BB2012EAC138EA979C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71674',
   'ccid': 'DC4227387F8716419C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71675',
   'ccid': '28B3AA48D6C4DCBA9C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71676',
   'ccid': '55A4D3424F9E38539C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71677',
   'ccid': 'B2DF7CB98A899C0E9C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71678',
   'ccid': 'F1605AD9493E4C669C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71679',
   'ccid': 'F583D1EE366342779C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71680',
   'ccid': '93238A9393F49BCB9C33DC5901307461',
   'title': '一、基础知识'},
  {'_id': 'V_71681',
   'ccid': 'D91EC6FCA01C69749C33DC5901307461',
   't

Thống kê số lượng phần tử của từng loại thực thể dạng danh sách của course C_676953

In [5]:
# Print "<count> <key>" for every entry of the sample whose value is a list.
for name, entities in sample.items():
    if not isinstance(entities, list):
        continue
    print(f"{len(entities)} {name}")

1 fields
140 resources
422 schools
2 teachers
41 users
0 comments
0 replies
0 exercises
40 videos


### Encode thông tin của các thực thể

- Dữ liệu dạng text: Sử dụng mô hình XLM-R, và lấy vector đặc trưng đại diện cho sentence embedding.
- Dữ liệu category: Mã hóa one-hot.

**Ý tưởng:** 1 thực thể bao gồm nhiều property sẽ được biểu diễn bằng cách nối vector embedding của mỗi property với nhau.

**Ví dụ:**

Course = (*about*, *prerequisites*)

$\rightarrow$ Encoding(Course) = [Encoding(*about*), Encoding(*prerequisites*)]

In [6]:
# Show the raw course document, then the result of encoding it.
print("Raw course:", sample["course"], sep="\n")

# The heading is printed before encode_course runs, so anything the encoder
# itself emits appears after it.
print("Encoded course:")
print(encode_course(sample["course"]))

Raw course:
{'_id': 'C_676953', 'prerequisites': 'C语言', 'about': '不仅仅是一门编程语言，汇编语言更是计算机系统软硬件的分界与桥梁，是理解整个计算机系统的有效起点，为学习后续的计算机系统课程打下基础。', 'school_id': 'S_1'}
Encoded course:
tensor([[-0.0867,  0.0782,  0.0976,  ..., -0.2662, -0.0538, -0.0740]])


### Hàm chuyển đổi các thực thể của course sang định dạng phù hợp với API của `torch_geometric`

In [7]:
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

def to_hetero_data(sample: dict):
    """Turn the entities queried for one course into an undirected ``HeteroData``.

    Args:
        sample: Dict providing the keys ``course``, ``fields``, ``resources``,
            ``schools``, ``teachers``, ``users``, ``comments``, ``replies``,
            ``exercises`` and ``videos``.

    Returns:
        The ``HeteroData`` object after ``T.ToUndirected()`` has been applied.
    """
    graph = HeteroData()

    # Build the encodings for the entities: (node type, encoder, sample key).
    node_encoders = (
        ("course", encode_course, "course"),
        ("field", encode_fields, "fields"),
        ("resource", encode_resources, "resources"),
        ("school", encode_schools, "schools"),
        ("teacher", encode_teachers, "teachers"),
        ("user", encode_users, "users"),
        ("comment", encode_comments, "comments"),
        ("reply", encode_replies, "replies"),
        ("exercise", encode_exercises, "exercises"),
        ("video", encode_videos, "videos"),
    )
    for node_type, encoder, key in node_encoders:
        graph[node_type].x = encoder(sample[key])

    # Build the edges from the reference relationships of the MOOCCubeX
    # dataset: (source type, target type, edge builder, sample keys passed).
    edge_builders = (
        ("course", "field", course_field_edges, ("fields",)),
        ("course", "resource", course_resource_edges, ("resources",)),
        ("course", "teacher", course_teacher_edges, ("teachers",)),
        ("course", "school", course_school_edges, ("course", "schools")),
        ("course", "user", course_user_edges, ("users",)),
        ("course", "comment", course_comment_edges, ("comments",)),
        ("comment", "reply", comment_reply_edges, ("comments", "replies")),
        ("user", "comment", user_comment_edges, ("users", "comments")),
        ("user", "reply", user_reply_edges, ("users", "replies")),
        ("school", "user", school_user_edges, ("schools", "users")),
        ("school", "teacher", school_teacher_edges, ("schools", "teachers")),
        ("resource", "exercise", resource_exercise_edges, ("resources", "exercises")),
        ("resource", "video", resource_video_edges, ("resources", "videos")),
    )
    for source, target, build_edges, keys in edge_builders:
        edge = build_edges(*[sample[key] for key in keys])
        # An edge index is stored only when its first dimension is 2.
        # NOTE(review): presumably the builders return a differently shaped
        # tensor when there is nothing to connect -- confirm in utils.edge.
        if edge.size(0) == 2:
            graph[source, "edge", target].edge_index = edge

    return T.ToUndirected()(graph)

In [8]:
# Try the torch_geometric conversion on the sample course and display the result
data = to_hetero_data(sample=sample)
data

  return torch.tensor(edges).T.long()


HeteroData(
  course={ x=[1, 2048] },
  field={ x=[1, 1024] },
  resource={ x=[140, 1024] },
  school={ x=[422, 2048] },
  teacher={ x=[2, 2048] },
  user={ x=[41, 3] },
  comment={ x=[0, 1024] },
  reply={ x=[0, 1024] },
  exercise={ x=[0, 3072] },
  video={ x=[40, 2048] },
  (course, edge, field)={ edge_index=[2, 1] },
  (course, edge, resource)={ edge_index=[2, 140] },
  (course, edge, teacher)={ edge_index=[2, 2] },
  (course, edge, school)={ edge_index=[2, 1] },
  (course, edge, user)={ edge_index=[2, 41] },
  (course, edge, comment)={ edge_index=[2, 0] },
  (school, edge, user)={ edge_index=[2, 22] },
  (school, edge, teacher)={ edge_index=[2, 2] },
  (resource, edge, video)={ edge_index=[2, 40] },
  (field, rev_edge, course)={ edge_index=[2, 1] },
  (resource, rev_edge, course)={ edge_index=[2, 140] },
  (teacher, rev_edge, course)={ edge_index=[2, 2] },
  (school, rev_edge, course)={ edge_index=[2, 1] },
  (user, rev_edge, course)={ edge_index=[2, 41] },
  (comment, rev_edge, c

Lặp qua từng `course_id`, và lưu lại đồ thị của course đó.

In [9]:
from tqdm import tqdm
import joblib
import os

# Build one graph per labeled course and store it under samples/.
# Courses that already have a stored file are skipped, so the loop can be
# re-run after an interruption.
os.makedirs("samples", exist_ok=True)
for row in tqdm(courses.itertuples(), desc="Storing samples to disk", total=courses.shape[0]):
    course_id = row.id
    label = row.course_classification
    target_path = f"samples/{course_id}.pkl"
    if os.path.isfile(target_path):
        continue

    # Dump to a scratch file and rename it into place once complete. Writing
    # straight to the final path would leave a truncated .pkl behind when the
    # run is interrupted, and the isfile() check above would then skip that
    # course on every later run.
    partial_path = f"{target_path}.tmp"
    try:
        sample = query(course_id, db)
        data = to_hetero_data(sample)
        obj = {
            "_id": course_id,
            "data": data,
            "label": label
        }
        joblib.dump(obj, partial_path)
        os.replace(partial_path, target_path)
    except Exception as e:
        # Best effort: report the failing course and carry on with the next one.
        print(f"Error processing course {course_id}: {e}")
    finally:
        # Also runs on KeyboardInterrupt, so no scratch file is left behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)

Storing samples to disk:   0%|          | 1/3781 [01:22<86:09:34, 82.06s/it]


KeyboardInterrupt: 

In [10]:
# Reload one stored sample from disk and display it.
joblib.load(filename="/home/ptdat/Desktop/graph/samples/C_655852.pkl")

{'_id': 'C_655852',
 'data': HeteroData(
   course={ x=[1, 2048] },
   field={ x=[0, 1024] },
   resource={ x=[162, 1024] },
   school={ x=[422, 2048] },
   teacher={ x=[1, 2048] },
   user={ x=[2, 3] },
   comment={ x=[0, 1024] },
   reply={ x=[0, 1024] },
   exercise={ x=[472, 3072] },
   video={ x=[30, 2048] },
   (course, edge, field)={ edge_index=[2, 0] },
   (course, edge, resource)={ edge_index=[2, 162] },
   (course, edge, teacher)={ edge_index=[2, 1] },
   (course, edge, school)={ edge_index=[2, 1] },
   (course, edge, user)={ edge_index=[2, 2] },
   (course, edge, comment)={ edge_index=[2, 0] },
   (school, edge, user)={ edge_index=[2, 1] },
   (school, edge, teacher)={ edge_index=[2, 1] },
   (resource, edge, exercise)={ edge_index=[2, 472] },
   (resource, edge, video)={ edge_index=[2, 30] },
   (field, rev_edge, course)={ edge_index=[2, 0] },
   (resource, rev_edge, course)={ edge_index=[2, 162] },
   (teacher, rev_edge, course)={ edge_index=[2, 1] },
   (school, rev_edge,