In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [3]:
spark = SparkSession.builder \
    .appName("Course-Field") \
    .getOrCreate()

## course

In [4]:
course_df = spark.read.json("/content/drive/MyDrive/Big Data/Input/course.json")

In [5]:
course_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+--------+--------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
course_df.describe().show()

+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|summary|                               about|       id|                               name|             prerequisites|
+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|  count|                                3779|     3781|                               3781|                      3779|
|   mean|                1.587301587301592E19|     NULL|                               NULL|                     111.0|
| stddev|                4.199605255658078E19|     NULL|                               NULL|                      NULL|
|    min|                                    |C_1017355|                    Food Chemistry |                          |
|    max|（1）特色：课程资源建设，充分体现...| C_956450|（疾风计划）面向对象程序设计（C++）|高级语言程序设计、数据结构|
+-------+------------------------------------+---------+-----------------------------

In [7]:
course_df.select('id').show(truncate=False)

+--------+
|id      |
+--------+
|C_584313|
|C_584329|
|C_584381|
|C_597208|
|C_597225|
|C_597229|
|C_597291|
|C_597307|
|C_597365|
|C_597367|
|C_645178|
|C_608132|
|C_678165|
|C_629513|
|C_629514|
|C_629515|
|C_629520|
|C_629522|
|C_654397|
|C_654551|
+--------+
only showing top 20 rows



## course-field

In [8]:
course_field_df = spark.read.json("/content/drive/MyDrive/Big Data/Input/course-field.json")

In [9]:
course_field_df.describe().show()

+-------+-----------------+------------------+
|summary|        course_id|       course_name|
+-------+-----------------+------------------+
|  count|              632|               632|
|   mean|691117.9841772151|              NULL|
| stddev| 84019.2774572239|              NULL|
|    min|           584313|      5G与人工智能|
|    max|          1814513|麦肯锡“全球领导力”|
+-------+-----------------+------------------+



In [10]:
course_field_df.show(truncate=False)

+---------+-----------------------------------+----------------------+
|course_id|course_name                        |field                 |
+---------+-----------------------------------+----------------------+
|584313   |《资治通鉴》导读                   |[中国语言文学, 历史学]|
|681932   |“做中学”Java程序设计               |[计算机科学与技术]    |
|674962   |《红楼梦》的空间艺术               |[中国语言文学]        |
|682709   |《纯粹理性批判》导论               |[哲学]                |
|682635   |《统万城》导读                     |[历史学]              |
|629515   |《论语》人生课堂                   |[中国语言文学]        |
|681692   |5G与人工智能                       |[计算机科学与技术]    |
|697791   |C++语言程序设计基础                |[计算机科学与技术]    |
|676937   |C++语言程序设计进阶                |[计算机科学与技术]    |
|682131   |C君带你玩编程                      |[计算机科学与技术]    |
|784172   |C语言程序设计（上）                |[计算机科学与技术]    |
|682672   |C语言程序设计（下）                |[计算机科学与技术]    |
|677038   |e时代的大佬师——慕课教师的修炼心法  |[教育学]              |
|677045   |e时代的教与学——慕课引发的混合式教学|[教育学]              |
|677235

In [11]:
course_field_df.printSchema()

root
 |-- course_id: long (nullable = true)
 |-- course_name: string (nullable = true)
 |-- field: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
print(course_field_df.count())

632


In [13]:
null_counts = course_field_df.select([
    sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in course_field_df.columns
])

null_counts.show()

+---------+-----------+-----+
|course_id|course_name|field|
+---------+-----------+-----+
|        0|          0|    0|
+---------+-----------+-----+



In [14]:
from pyspark.sql.functions import concat, lit, col

course_field_df = course_field_df.withColumn(
    "course_id",
    concat(lit("C_"), col("course_id").cast("string"))
)

# Kiểm tra lại
course_field_df.show(truncate=False)


+---------+-----------------------------------+----------------------+
|course_id|course_name                        |field                 |
+---------+-----------------------------------+----------------------+
|C_584313 |《资治通鉴》导读                   |[中国语言文学, 历史学]|
|C_681932 |“做中学”Java程序设计               |[计算机科学与技术]    |
|C_674962 |《红楼梦》的空间艺术               |[中国语言文学]        |
|C_682709 |《纯粹理性批判》导论               |[哲学]                |
|C_682635 |《统万城》导读                     |[历史学]              |
|C_629515 |《论语》人生课堂                   |[中国语言文学]        |
|C_681692 |5G与人工智能                       |[计算机科学与技术]    |
|C_697791 |C++语言程序设计基础                |[计算机科学与技术]    |
|C_676937 |C++语言程序设计进阶                |[计算机科学与技术]    |
|C_682131 |C君带你玩编程                      |[计算机科学与技术]    |
|C_784172 |C语言程序设计（上）                |[计算机科学与技术]    |
|C_682672 |C语言程序设计（下）                |[计算机科学与技术]    |
|C_677038 |e时代的大佬师——慕课教师的修炼心法  |[教育学]              |
|C_677045 |e时代的教与学——慕课引发的混合式教学|[教育学]              |
|C_6772

## Join

In [15]:
valid_course_field_df = course_field_df.join(
    course_df,
    course_field_df.course_id == course_df.id,
    "inner"
).select(
    course_field_df.course_id,
    course_field_df.course_name,
    course_field_df.field
)

In [16]:
valid_course_field_df.show()

+---------+---------------------------------+--------------------------------+
|course_id|                      course_name|                           field|
+---------+---------------------------------+--------------------------------+
| C_584313|                 《资治通鉴》导读|          [中国语言文学, 历史学]|
| C_584329|       微积分——极限理论与一元函数|[应用经济学, 理论经济学, 物理...|
| C_584381|                         新闻摄影|            [新闻传播学, 艺术学]|
| C_597208|             数据挖掘：理论与算法|              [计算机科学与技术]|
| C_597229|                   财务分析与决策|    [应用经济学, 管理科学与工程]|
| C_597307|                         大唐兴衰|                        [历史学]|
| C_629515|                 《论语》人生课堂|                  [中国语言文学]|
| C_629520|                       病理生理学|                      [临床医学]|
| C_629522|                     医学文献检索|                    [科学技术史]|
| C_629503|信息素养——学术研究的必修课（20...|                [情报与档案管理]|
| C_597314|     大学国文——北宋至现代文学赏析|                  [中国语言文学]|
| C_674903|       不朽的艺术：走进大师与经典|                        [艺术学]|
|

In [17]:
print("Số lượng dòng dữ liệu hợp lệ:", valid_course_field_df.count())
print("Số lượng dòng dữ liệu không hợp lệ:", course_field_df.count() - valid_course_field_df.count())

Số lượng dòng dữ liệu hợp lệ: 547
Số lượng dòng dữ liệu không hợp lệ: 85


In [18]:
# Đọc các file CSV mapping
course_mapping_df = spark.read.option("header", "true").csv("/content/drive/MyDrive/Big Data/Output/course_mapping.csv")

print("Course Mapping:")
course_mapping_df.show(10)

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows



In [19]:
course_field_mapped_df = valid_course_field_df.join(
    course_mapping_df,
    valid_course_field_df.course_id == course_mapping_df.original_id,
    "inner"
).select(
    course_mapping_df.mapped_id.alias("mapped_course_id"),
    valid_course_field_df.course_name,
    valid_course_field_df.field
)

In [20]:
course_field_mapped_df.show(truncate=False)

+----------------+------------------------------------+--------------------------------------------+
|mapped_course_id|course_name                         |field                                       |
+----------------+------------------------------------+--------------------------------------------+
|1158            |微积分——极限理论与一元函数          |[应用经济学, 理论经济学, 物理学, 数学]      |
|1159            |新闻摄影                            |[新闻传播学, 艺术学]                        |
|1160            |数据挖掘：理论与算法                |[计算机科学与技术]                          |
|1162            |财务分析与决策                      |[应用经济学, 管理科学与工程]                |
|1164            |信息素养——学术研究的必修课（2019春）|[情报与档案管理]                            |
|1163            |大学国文——北宋至现代文学赏析        |[中国语言文学]                              |
|1173            |不朽的艺术：走进大师与经典          |[艺术学]                                    |
|1177            |逻辑学概论                          |[哲学]                                      |
|1182            |经济地理与企业兴衰  

## tạo field riêng

In [21]:
field_df = (
    course_field_mapped_df
    .select(explode(col("field")).alias("name"))
    .distinct()
    .withColumn("field_id", monotonically_increasing_id() + 13023)
)

In [22]:
field_df.show(truncate=False)

+--------------------+--------+
|name                |field_id|
+--------------------+--------+
|哲学                |13023   |
|公共管理            |13024   |
|畜牧学              |13025   |
|口腔医学            |13026   |
|兵器科学与技术      |13027   |
|计算机科学与技术    |13028   |
|动力工程及工程热物理|13029   |
|控制科学与工程      |13030   |
|园艺学              |13031   |
|地质学              |13032   |
|临床中医学          |13033   |
|植物保护            |13034   |
|化学工程与技术      |13035   |
|仪器科学与技术      |13036   |
|地球物理学          |13037   |
|生物医学工程        |13038   |
|体育学              |13039   |
|土木工程            |13040   |
|心理学              |13041   |
|材料科学与工程      |13042   |
+--------------------+--------+
only showing top 20 rows



In [23]:
# 3. Tạo bảng Course_Field: explode mảng field rồi join để lấy field_id
course_field_mapped_df = (
    course_field_mapped_df
    .select("mapped_course_id", explode(col("field")).alias("name"))
    .join(field_df, on="name", how="inner")
    .select("mapped_course_id", "field_id")
)


In [24]:
course_field_mapped_df.show(truncate=False)

+----------------+--------+
|mapped_course_id|field_id|
+----------------+--------+
|2491            |13023   |
|1925            |13023   |
|1924            |13023   |
|1768            |13023   |
|1419            |13023   |
|1408            |13023   |
|1287            |13023   |
|1232            |13023   |
|1201            |13023   |
|1177            |13023   |
|1270            |13024   |
|2085            |13024   |
|1434            |13024   |
|1961            |13024   |
|1489            |13024   |
|2018            |13025   |
|1391            |13026   |
|1757            |13026   |
|2052            |13027   |
|2341            |13028   |
+----------------+--------+
only showing top 20 rows



In [25]:
# Bước 3: Tạo chuỗi text theo format yêu cầu: course_id 0 teacher_id (0 là mối quan hệ teacher)
output_df = course_field_mapped_df.select(
    concat_ws(" ",
        col("mapped_course_id"),
              lit(1),
        col("field_id")
    ).alias("output_line")
)

In [26]:
output_df.show(truncate=False)

+------------+
|output_line |
+------------+
|2491 1 13023|
|1925 1 13023|
|1924 1 13023|
|1768 1 13023|
|1419 1 13023|
|1408 1 13023|
|1287 1 13023|
|1232 1 13023|
|1201 1 13023|
|1177 1 13023|
|1270 1 13024|
|2085 1 13024|
|1434 1 13024|
|1961 1 13024|
|1489 1 13024|
|2018 1 13025|
|1391 1 13026|
|1757 1 13026|
|2052 1 13027|
|2341 1 13028|
+------------+
only showing top 20 rows



In [27]:
print(course_field_mapped_df.count())

576


In [28]:
# Save as a single .txt file
output_df.coalesce(1).write \
    .mode("overwrite") \
    .text("output/course_field_mapping.txt")

In [29]:
# prompt: save lại field_df theo định dạng json
field_df.coalesce(1).write.mode("overwrite").json("output/field_df_json")