In [1]:
# Download the inputs in quiet mode: two relation files (concept-video,
# video_id-ccid), two entity files (course, concept) and the concept_course.py
# helper script from the MOOCCubeX repository.
!wget -q https://lfs.aminer.cn/misc/moocdata/data/mooccube2/relations/concept-video.txt \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/relations/video_id-ccid.txt \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/course.json \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/concept.json \
        https://raw.githubusercontent.com/THU-KEG/MOOCCubeX/refs/heads/main/scripts/concept_course.py

In [2]:
!mkdir relations entities
!mv course.json concept.json entities/
!mv video_id-ccid.txt concept-video.txt relations/

In [3]:
# Run the downloaded helper script. Its progress output mentions vid2ccid,
# vid2course and concept2course; presumably it produces the
# relations/concept-course.txt file that is read further down — TODO confirm.
!python concept_course.py

vid2ccid: 2798892it [00:05, 467821.38it/s]
vid2course: 3781it [00:01, 3651.23it/s]
concept2course: 624683it [00:17, 35807.96it/s]


In [4]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('preprocessing data')
    .getOrCreate()
)

In [16]:
from pyspark.sql.functions import col, trim, concat_ws, lit, split
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [6]:
# Load the course entities with Spark's JSON reader and preview a few rows.
# The CSV-only options `header` / `inferSchema` were dropped: they have no
# effect on the JSON source, which infers the schema whenever none is given.
course = spark.read.json('entities/course.json')
course.show(5)

+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|                                about|                         field|      id|                      name|prerequisites|            resource|
+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|通过老师导读，同学们可深入这一经典...|        [历史学, 中国语言文学]|C_584313|          《资治通鉴》导读|             |[{1.1.1, V_849, [...|
|本课程是理工科的一门数学基础课，系...|[应用经济学, 数学, 物理学, ...|C_584329|微积分——极限理论与一元函数|             |[{1.1.1, V_1350, ...|
|掌握基本的摄影技能，了解图片新闻的...|          [艺术学, 新闻传播学]|C_584381|                  新闻摄影|             |[{1.1.1, V_1800, ...|
|  最有趣的理论+最有用的算法=不得不...|            [计算机科学与技术]|C_597208|      数据挖掘：理论与算法|             |[{1.1.1, V_2961, ...|
|大学计算机课程将以计算思维为导向，...|                            []|C_597225|                大学计算机|             |[{1.1.1, V_4596, ...|
+--------------------

In [7]:
# Load the concept entities with Spark's JSON reader and preview a few rows.
# The CSV-only options `header` / `inferSchema` were dropped: they have no
# effect on the JSON source, which infers the schema whenever none is given.
concept = spark.read.json('entities/concept.json')
concept.show(5)

+--------------------------------+---------------------------------+----------------+
|                         context|                               id|            name|
+--------------------------------+---------------------------------+----------------+
|                              []|          K_神经部_组织学与胚胎学|          神经部|
|[质和髓质两部分组成\n答案：B\...|K_促甲状腺激素细胞_组织学与胚胎学|促甲状腺激素细胞|
|                              []|        K_嗜色细胞_组织学与胚胎学|        嗜色细胞|
| [案：B\n13．腺垂体嗜酸性细胞...|    K_生长激素细胞_组织学与胚胎学|    生长激素细胞|
|      [\n褐铁矿（Limonite）是...|          K_褐铁矿_材料科学与工程|          褐铁矿|
+--------------------------------+---------------------------------+----------------+
only showing top 5 rows



In [8]:
# Load the concept-course relation as raw text lines; the text source always
# yields a single string column named `value`.
# `header` / `inferSchema` are CSV options with no effect on the text reader
# (every line, including the first, is data), so they are omitted.
course_concept = spark.read.text('relations/concept-course.txt')
course_concept.show(5)

+----------------------------------+
|                             value|
+----------------------------------+
|  K_直流电变换_控制科学与工程\t...|
|  K_r结果首通道_控制科学与工程\...|
|     K_plc控制_控制科学与工程\t...|
|   K_响应特性_控制科学与工程\tC...|
|K_用户程序存储器_控制科学与工程...|
+----------------------------------+
only showing top 5 rows



In [9]:
# Each raw line holds a concept id, a tab, then a course id. Split the line
# once and keep both fields as named columns (course first, concept second).
fields = split(course_concept['value'], '\t')
course_concept = course_concept.select(
    fields[1].alias("course_id"),
    fields[0].alias("concept_id"),
)
course_concept.show(5)

+---------+-------------------------------+
|course_id|                     concept_id|
+---------+-------------------------------+
| C_681460|    K_直流电变换_控制科学与工程|
| C_681460|   K_r结果首通道_控制科学与工程|
| C_681460|       K_plc控制_控制科学与工程|
| C_681460|      K_响应特性_控制科学与工程|
| C_681460|K_用户程序存储器_控制科学与工程|
+---------+-------------------------------+
only showing top 5 rows



In [10]:
# Descriptive statistics for both id columns. They are strings, so only the
# count / min / max rows carry information.
summary_stats = course_concept.summary()
summary_stats.show()

+-------+---------+---------------------------+
|summary|course_id|                 concept_id|
+-------+---------+---------------------------+
|  count|   451078|                     451078|
|   mean|     NULL|                       NULL|
| stddev|     NULL|                       NULL|
|    min|C_1169394|      K_$ _计算机科学与技术|
|    25%|     NULL|                       NULL|
|    50%|     NULL|                       NULL|
|    75%|     NULL|                       NULL|
|    max| C_947773|K_ｇ分布的概率分布函数_数学|
+-------+---------+---------------------------+



### 1. Filter

In [11]:
# Keep only the relations whose course_id exists in the course entities.
# A left-semi join is a pure filter: it returns just the left-hand columns
# (course_id, concept_id) and never duplicates a relation when an id occurs
# more than once in `course`, so the row counts compared later stay meaningful.
valid_course_concept = course_concept.join(
    course,
    course_concept.course_id == course.id,
    'left_semi'
)
valid_course_concept.show(5)

+---------+-------------------------------+
|course_id|                     concept_id|
+---------+-------------------------------+
| C_681460|    K_直流电变换_控制科学与工程|
| C_681460|   K_r结果首通道_控制科学与工程|
| C_681460|       K_plc控制_控制科学与工程|
| C_681460|      K_响应特性_控制科学与工程|
| C_681460|K_用户程序存储器_控制科学与工程|
+---------+-------------------------------+
only showing top 5 rows



In [12]:
# Keep only the relations whose concept_id exists in the concept entities.
# A left-semi join is a pure filter: it returns just the left-hand columns
# (course_id, concept_id) and never duplicates a relation when an id occurs
# more than once in `concept`, so the row counts compared later stay meaningful.
fully_valid_course_concept = valid_course_concept.join(
    concept,
    valid_course_concept.concept_id == concept.id,
    'left_semi'
)
fully_valid_course_concept.show(5)

+---------+------------------------+
|course_id|              concept_id|
+---------+------------------------+
|C_2091183|   K_$ _计算机科学与技术|
| C_697797|   K_$ _计算机科学与技术|
| C_655850|   K_$ _计算机科学与技术|
| C_707379|   K_$ _计算机科学与技术|
| C_682381|K_&符号_计算机科学与技术|
+---------+------------------------+
only showing top 5 rows



In [13]:
# Report how many relation rows passed both filters and how many were dropped.
# Each DataFrame is counted exactly once: every count() is a Spark action that
# re-executes the joins above, so the valid count is reused for the difference.
# (The printed labels read "valid rows" / "invalid rows".)
valid_count = fully_valid_course_concept.count()
total_count = course_concept.count()
print("Số lượng dòng dữ liệu hợp lệ:", valid_count)
print("Số lượng dòng dữ liệu không hợp lệ:", total_count - valid_count)

Số lượng dòng dữ liệu hợp lệ: 451078
Số lượng dòng dữ liệu không hợp lệ: 0


### 2. Mapping

### Concept

In [14]:
# Unique concept ids that survived both validity filters.
valid_concepts = fully_valid_course_concept.select("concept_id").dropDuplicates()
valid_concepts.show(20)

+------------------------------+
|                    concept_id|
+------------------------------+
|         K_$ _计算机科学与技术|
|      K_&符号_计算机科学与技术|
|  K_(分子_动力工程及工程热物理|
|             K_(荷载)方向_力学|
|              K_)求解_电气工程|
|        K_)离子_化学工程与技术|
|        K_)输出_控制科学与工程|
|    K_+运算符_计算机科学与技术|
|      K_-二甲苯_化学工程与技术|
|          K_.母液的保存_作物学|
|     K_0 part_计算机科学与技术|
|           K_0v低电平_电气工程|
|        K_0二阶导数_应用经济学|
|K_0亏格的曲面_计算机科学与技术|
|          K_0值电流源_电气工程|
|      K_0值电流源等效_电气工程|
|            K_0值电阻_电气工程|
|              K_0元子集合_数学|
|  K_0元素个数_计算机科学与技术|
|                  K_0元集_数学|
+------------------------------+
only showing top 20 rows



In [17]:
# Give every concept a dense integer id, assigned in concept_id order.
# Ids start at CONCEPT_ID_OFFSET + 1 (i.e. 13101), not at 1 — presumably so
# they do not collide with ids already used for other entity types;
# TODO confirm where the offset comes from.
# NOTE: a window with orderBy but no partitionBy moves all rows into a single
# partition; that is tolerable here because only the distinct concept ids
# (a few hundred thousand rows) are numbered.
CONCEPT_ID_OFFSET = 13100
window_spec = Window.orderBy("concept_id")
df_mapped = valid_concepts.withColumn(
    "mapped_id", row_number().over(window_spec) + CONCEPT_ID_OFFSET
)

In [19]:
# Expose the lookup table as (original_id, mapped_id) and inspect its last rows.
mapping_df = (
    df_mapped
    .withColumnRenamed("concept_id", "original_id")
    .select("original_id", "mapped_id")
)
mapping_df.tail(5)

[Row(original_id='K_，高效液相色谱法_药学', mapped_id=229192),
 Row(original_id='K_，齐次方程_数学', mapped_id=229193),
 Row(original_id='K_；洛仑兹规范_动力工程及工程热物理', mapped_id=229194),
 Row(original_id='K_ｃ语言的表达式_计算机科学与技术', mapped_id=229195),
 Row(original_id='K_ｇ分布的概率分布函数_数学', mapped_id=229196)]

In [20]:
# Write the mapping as comma-separated CSV with a header row.
# coalesce(1) produces a single part file, but Spark still creates a
# directory named output/mapping_txt rather than one plain file.
mapping_writer = mapping_df.coalesce(1).write.mode("overwrite")
mapping_writer.csv("output/mapping_txt", header=True, sep=",")

### Course-Concept

In [21]:
# Mount Google Drive at /content/drive (works only inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Load the course and concept id mappings (CSV with header) from Google Drive.
# NOTE(review): the concept mapping is read from Drive, not from the
# output/mapping_txt folder written by this notebook — presumably that part
# file was copied and renamed to concept.csv by hand. Confirm, because a fresh
# run depends on both files already being present in MAPPING_DIR.
MAPPING_DIR = '/content/drive/MyDrive/Big Data/Output'
course_mapping = spark.read.csv(f'{MAPPING_DIR}/course_mapping.csv', header=True, inferSchema=True)
concept_mapping = spark.read.csv(f'{MAPPING_DIR}/concept.csv', header=True, inferSchema=True)

In [23]:
# Preview the first rows of both mapping tables, each under its own title.
for title, mapping in (
    ("Course Mapping:", course_mapping),
    ("Concept Mapping:", concept_mapping),
):
    print(title)
    mapping.show(10)

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows

Concept Mapping:
+----------------------------+---------+
|                 original_id|mapped_id|
+----------------------------+---------+
|       K_$ _计算机科学与技术|    13101|
|    K_&符号_计算机科学与技术|    13102|
|K_(分子_动力工程及工程热物理|    13103|
|           K_(荷载)方向_力学|    13104|
|              K_)值_冶金工程|    13105|
|    K_)函数_计算机科学与技术|    13106|
|              K_)导数_物理学|    13107|
|            K_)求解_电气工程|    13108|
|      K_)离子_化学工程与技术|    13109|
|      K_)输出_控制科学与工程|    13110|
+----------------------------+---------+
only showing top 10 rows



In [24]:
# Step 1: translate each original concept_id into its mapped integer id.
# The inner join keeps only relations whose concept appears in the mapping.
concept_id_matches = (
    fully_valid_course_concept["concept_id"] == concept_mapping["original_id"]
)
concept_mapped = fully_valid_course_concept.join(
    concept_mapping, on=concept_id_matches, how='inner'
).select(
    concept_mapping["mapped_id"].alias("mapped_concept_id"),
    fully_valid_course_concept["course_id"],
)
concept_mapped.show(5)

+-----------------+---------+
|mapped_concept_id|course_id|
+-----------------+---------+
|            13101|C_2091183|
|            13101| C_697797|
|            13101| C_655850|
|            13101| C_707379|
|            13102| C_682381|
+-----------------+---------+
only showing top 5 rows



In [25]:
# Step 2: translate each original course_id into its mapped integer id.
# The inner join keeps only relations whose course appears in the mapping.
course_id_matches = concept_mapped["course_id"] == course_mapping["original_id"]
course_mapped = concept_mapped.join(
    course_mapping, on=course_id_matches, how="inner"
).select(
    concept_mapped["mapped_concept_id"],
    course_mapping["mapped_id"].alias("mapped_course_id"),
)
course_mapped.show(5)

+-----------------+----------------+
|mapped_concept_id|mapped_course_id|
+-----------------+----------------+
|            13101|            2339|
|            13101|            1171|
|            13101|            2387|
|            13102|            1795|
|            13102|            2343|
+-----------------+----------------+
only showing top 5 rows



In [26]:
# Step 3: build one space-separated text line per pair:
# mapped course id, relation id, mapped concept id.
# NOTE: the relation id emitted is 2 (not 0). Presumably 2 identifies the
# course-concept relation in the consumer's relation vocabulary — TODO confirm.
COURSE_CONCEPT_RELATION_ID = 2
output_df = course_mapped.select(
    concat_ws(
        " ",
        col("mapped_course_id"),
        lit(COURSE_CONCEPT_RELATION_ID),
        col("mapped_concept_id"),
    ).alias("output_line")
)

In [27]:
# Preview the first formatted relation lines.
output_df.show(n=5)

+------------+
| output_line|
+------------+
|2339 2 13101|
|1171 2 13101|
|2387 2 13101|
|1795 2 13102|
|2343 2 13102|
+------------+
only showing top 5 rows



In [28]:
# Write the formatted lines as plain text. coalesce(1) yields a single part
# file inside the course_concept output directory; existing output is replaced.
output_lines = output_df.select("output_line").coalesce(1)
output_lines.write.mode("overwrite").text("course_concept")