In [1]:
!wget -q https://lfs.aminer.cn/misc/moocdata/data/mooccube2/relations/concept-video.txt \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/relations/video_id-ccid.txt \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/course.json \
        https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/concept.json \
        https://raw.githubusercontent.com/THU-KEG/MOOCCubeX/refs/heads/main/scripts/concept_course.py

In [2]:
!mkdir relations entities
!mv course.json concept.json entities/
!mv video_id-ccid.txt concept-video.txt relations/

In [3]:
!python concept_course.py

vid2ccid: 2798892it [00:06, 439052.26it/s]
vid2course: 3781it [00:00, 3795.77it/s]
concept2course: 624683it [00:15, 39600.17it/s]


In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('preprocessing data').getOrCreate()

In [5]:
from pyspark.sql.functions import col, trim, concat_ws, lit, split
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [6]:
course = spark.read.options(header=True, inferSchema=True).json('entities/course.json')
course.show(5)

+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|                                about|                         field|      id|                      name|prerequisites|            resource|
+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|通过老师导读，同学们可深入这一经典...|        [历史学, 中国语言文学]|C_584313|          《资治通鉴》导读|             |[{1.1.1, V_849, [...|
|本课程是理工科的一门数学基础课，系...|[应用经济学, 数学, 物理学, ...|C_584329|微积分——极限理论与一元函数|             |[{1.1.1, V_1350, ...|
|掌握基本的摄影技能，了解图片新闻的...|          [艺术学, 新闻传播学]|C_584381|                  新闻摄影|             |[{1.1.1, V_1800, ...|
|  最有趣的理论+最有用的算法=不得不...|            [计算机科学与技术]|C_597208|      数据挖掘：理论与算法|             |[{1.1.1, V_2961, ...|
|大学计算机课程将以计算思维为导向，...|                            []|C_597225|                大学计算机|             |[{1.1.1, V_4596, ...|
+--------------------

In [7]:
concept = spark.read.options(header=True, inferSchema=True).json('entities/concept.json')
concept.show(5)

+--------------------------------+---------------------------------+----------------+
|                         context|                               id|            name|
+--------------------------------+---------------------------------+----------------+
|                              []|          K_神经部_组织学与胚胎学|          神经部|
|[质和髓质两部分组成\n答案：B\...|K_促甲状腺激素细胞_组织学与胚胎学|促甲状腺激素细胞|
|                              []|        K_嗜色细胞_组织学与胚胎学|        嗜色细胞|
| [案：B\n13．腺垂体嗜酸性细胞...|    K_生长激素细胞_组织学与胚胎学|    生长激素细胞|
|      [\n褐铁矿（Limonite）是...|          K_褐铁矿_材料科学与工程|          褐铁矿|
+--------------------------------+---------------------------------+----------------+
only showing top 5 rows



In [11]:
print(concept.count())

637572


In [8]:
course_concept = spark.read.options(header=True, inferSchema=True).text('relations/concept-course.txt')
course_concept.show(5)

+---------------------------------+
|                            value|
+---------------------------------+
|  K_工件叠加_控制科学与工程\tC...|
|K_直流输入电路_控制科学与工程\...|
|    K_plc系统_控制科学与工程\t...|
|  K_步进电机_控制科学与工程\tC...|
|  K_电源断电_控制科学与工程\tC...|
+---------------------------------+
only showing top 5 rows



In [9]:
# Split course and concept id from course_concept
course_concept = course_concept.withColumn("course_id", split(course_concept['value'], '\t')[1]) \
                              .withColumn("concept_id", split(course_concept['value'], '\t')[0]) \
                              .drop('value')
course_concept.show(5)

+---------+-----------------------------+
|course_id|                   concept_id|
+---------+-----------------------------+
| C_681460|    K_工件叠加_控制科学与工程|
| C_681460|K_直流输入电路_控制科学与工程|
| C_681460|     K_plc系统_控制科学与工程|
| C_681460|    K_步进电机_控制科学与工程|
| C_681460|    K_电源断电_控制科学与工程|
+---------+-----------------------------+
only showing top 5 rows



In [10]:
course_concept.summary().show()

+-------+---------+---------------------------+
|summary|course_id|                 concept_id|
+-------+---------+---------------------------+
|  count|   451078|                     451078|
|   mean|     NULL|                       NULL|
| stddev|     NULL|                       NULL|
|    min|C_1169394|      K_$ _计算机科学与技术|
|    25%|     NULL|                       NULL|
|    50%|     NULL|                       NULL|
|    75%|     NULL|                       NULL|
|    max| C_947773|K_ｇ分布的概率分布函数_数学|
+-------+---------+---------------------------+



### 1.Filter

In [12]:
# Join with course to filter the valid course_id
valid_course_concept = course_concept.join(
    course,
    course_concept.course_id == course.id,
    'inner'
).select(['course_id', 'concept_id'])
valid_course_concept.show(5)

+---------+-----------------------------+
|course_id|                   concept_id|
+---------+-----------------------------+
| C_681460|    K_工件叠加_控制科学与工程|
| C_681460|K_直流输入电路_控制科学与工程|
| C_681460|     K_plc系统_控制科学与工程|
| C_681460|    K_步进电机_控制科学与工程|
| C_681460|    K_电源断电_控制科学与工程|
+---------+-----------------------------+
only showing top 5 rows



In [13]:
# Join valid_course_concept with concept to filter the valid concept_id
fully_valid_course_concept = valid_course_concept.join(
    concept,
    valid_course_concept.concept_id == concept.id,
    'inner'
).select(['course_id', 'concept_id'])
fully_valid_course_concept.show(5)

+---------+------------------------+
|course_id|              concept_id|
+---------+------------------------+
| C_655850|   K_$ _计算机科学与技术|
| C_697797|   K_$ _计算机科学与技术|
|C_2091183|   K_$ _计算机科学与技术|
| C_707379|   K_$ _计算机科学与技术|
| C_682381|K_&符号_计算机科学与技术|
+---------+------------------------+
only showing top 5 rows



In [14]:
print("Số lượng dòng dữ liệu hợp lệ:", fully_valid_course_concept.count())
print("Số lượng dòng dữ liệu không hợp lệ:", course_concept.count() - fully_valid_course_concept.count())

Số lượng dòng dữ liệu hợp lệ: 451078
Số lượng dòng dữ liệu không hợp lệ: 0


In [18]:
concept_at_least_5 = fully_valid_course_concept.groupBy("concept_id").count().filter(col("count") >= 5).select("concept_id")
print("Số lượng concept ban đầu: ", concept.count())
print("Số lượng concept sau khi lọc: ", concept_at_least_5.count())

filtered_course_concept = fully_valid_course_concept.alias("a").join(
    concept_at_least_5.alias("b"),
    col("a.concept_id") == col("b.concept_id"),
    "inner"
).select(
    col("a.course_id"),
    col("a.concept_id")
)
filtered_course_concept.show(5)

Số lượng concept ban đầu:  637572
Số lượng concept sau khi lọc:  14016
+---------+------------------------+
|course_id|              concept_id|
+---------+------------------------+
| C_682381|K_&符号_计算机科学与技术|
|C_2341225|K_&符号_计算机科学与技术|
| C_697828|K_&符号_计算机科学与技术|
|C_2341259|K_&符号_计算机科学与技术|
| C_682189|K_&符号_计算机科学与技术|
+---------+------------------------+
only showing top 5 rows



In [19]:
print("Số lượng dòng dữ liệu ban đầu: ", course_concept.count())
print("Số lượng dòng dữ liệu sau khi lọc: ", filtered_course_concept.count())

Số lượng dòng dữ liệu ban đầu:  451078
Số lượng dòng dữ liệu sau khi lọc:  125584


### 2. Mapping

### Concept

In [20]:
valid_concepts = filtered_course_concept.select("concept_id").distinct()
valid_concepts.show()

+-----------------------------+
|                   concept_id|
+-----------------------------+
|     K_&符号_计算机科学与技术|
|   K_+运算符_计算机科学与技术|
|   K_0号元素_计算机科学与技术|
|               K_0向量，_数学|
|              K_1,4-加成_化学|
|             K_100阶导数_数学|
|    K_10gbps_计算机科学与技术|
|    K_16进制_计算机科学与技术|
|  K_16进制数_计算机科学与技术|
|       K_1gb_计算机科学与技术|
|             K_2次多项式_数学|
|  K_3d space_计算机科学与技术|
|K_3d几何结构_计算机科学与技术|
|K_3d变换滤镜_计算机科学与技术|
|                     K_4_化学|
|    K_64字节_计算机科学与技术|
|     K_80286_计算机科学与技术|
|      K_8bit_计算机科学与技术|
|   K_8维向量_计算机科学与技术|
|       K_ARM_计算机科学与技术|
+-----------------------------+
only showing top 20 rows



In [21]:
# Add a new column with numbers starting from 1
window_spec = Window.orderBy("concept_id")
df_mapped = valid_concepts.withColumn("mapped_id", row_number().over(window_spec) + 1924)

In [22]:
df_mapped.tail(2)

[Row(concept_id='K_，波函数_物理学', mapped_id=15939),
 Row(concept_id='K_，相电流_电气工程', mapped_id=15940)]

In [23]:
# Select only the id and the new number
mapping_df = df_mapped.select(valid_concepts["concept_id"].alias("original_id"), "mapped_id")
mapping_df.tail(5)

[Row(original_id='K_，寄存器_计算机科学与技术', mapped_id=15936),
 Row(original_id='K_，导函数_数学', mapped_id=15937),
 Row(original_id='K_，幂级数展开形式_数学', mapped_id=15938),
 Row(original_id='K_，波函数_物理学', mapped_id=15939),
 Row(original_id='K_，相电流_电气工程', mapped_id=15940)]

In [24]:
# Save as a single .txt file with two columns (CSV format)
mapping_df.coalesce(1).write \
    .option("header", True) \
    .option("delimiter", ",") \
    .mode("overwrite") \
    .csv("output/mapping_txt")

### Course-Concept

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
course_mapping = spark.read.options(header=True, inferSchema=True).csv('/content/drive/MyDrive/Big Data/Output/course_mapping.csv')
concept_mapping = spark.read.options(header=True, inferSchema=True).csv('/content/drive/MyDrive/Big Data/Output/concept.csv')

In [29]:
# Show mapping data
print("Course Mapping:")
course_mapping.show(10)
print("Concept Mapping:")
concept_mapping.show(10)

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows

Concept Mapping:
+----------------------------+---------+
|                 original_id|mapped_id|
+----------------------------+---------+
|    K_&符号_计算机科学与技术|     1925|
|    K_)函数_计算机科学与技术|     1926|
|      K_+ 2_计算机科学与技术|     1927|
|  K_+运算符_计算机科学与技术|     1928|
|     K_.NET_计算机科学与技术|     1929|
|                K_0函数_数学|     1930|
|  K_0号元素_计算机科学与技术|     1931|
|K_0号寄存器_计算机科学与技术|     1932|
|              K_0向量，_数学|     1933|
|            K_0阶导数_物理学|     1934|
+----------------------------+---------+
only showing top 10 rows



In [30]:
# Bước 1: Map concept_id từ original sang mapped
concept_mapped = fully_valid_course_concept.join(
    concept_mapping,
    fully_valid_course_concept.concept_id == concept_mapping.original_id,
    'inner'
).select(
    concept_mapping.mapped_id.alias("mapped_concept_id"),
    fully_valid_course_concept.course_id
)
concept_mapped.show(5)

+-----------------+---------+
|mapped_concept_id|course_id|
+-----------------+---------+
|             1925| C_682381|
|             1925|C_2341225|
|             1925| C_697828|
|             1925|C_2341259|
|             1925| C_682189|
+-----------------+---------+
only showing top 5 rows



In [31]:
# Bước 2: Map course_id từ original sang mapped
course_mapped = concept_mapped.join(
    course_mapping,
    concept_mapped.course_id == course_mapping.original_id,
    "inner"
).select(
    concept_mapped.mapped_concept_id,
    course_mapping.mapped_id.alias("mapped_course_id")
)
course_mapped.show(5)

+-----------------+----------------+
|mapped_concept_id|mapped_course_id|
+-----------------+----------------+
|             1925|            1795|
|             1925|            2343|
|             1925|            1703|
|             1925|            1794|
|             1925|            2711|
+-----------------+----------------+
only showing top 5 rows



In [32]:
# Bước 3: Tạo chuỗi text theo format yêu cầu: course_id 0 concept_id (0 là mối quan hệ teacher)
output_df = course_mapped.select(
    concat_ws(" ",
        col("mapped_course_id"),
              lit(2),
        col("mapped_concept_id")
    ).alias("output_line")
)

In [33]:
output_df.show(5)

+-----------+
|output_line|
+-----------+
|1795 2 1925|
|2343 2 1925|
|1703 2 1925|
|1794 2 1925|
|2711 2 1925|
+-----------+
only showing top 5 rows



In [34]:
output_df.select("output_line").coalesce(1).write.mode("overwrite").text("course_concept")