In [1]:
# Download the MOOCCube2 inputs used below (school entities, course entities,
# and the course-school relation file) into the working directory; -q silences wget.
!wget -q https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/school.json \
         https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/course.json \
         https://lfs.aminer.cn/misc/moocdata/data/mooccube2/relations/course-school.txt

In [2]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName('preprocessing data').getOrCreate()

In [3]:
from pyspark.sql.functions import col, trim, concat_ws, lit, split
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [4]:
# Load the course entities from JSON.
# `header` and `inferSchema` are CSV reader options; the JSON source does not
# use them, so they are omitted here to avoid suggesting they have any effect.
course = spark.read.json('course.json')
course.show(5)

+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|                                about|                         field|      id|                      name|prerequisites|            resource|
+-------------------------------------+------------------------------+--------+--------------------------+-------------+--------------------+
|通过老师导读，同学们可深入这一经典...|        [历史学, 中国语言文学]|C_584313|          《资治通鉴》导读|             |[{1.1.1, V_849, [...|
|本课程是理工科的一门数学基础课，系...|[应用经济学, 数学, 物理学, ...|C_584329|微积分——极限理论与一元函数|             |[{1.1.1, V_1350, ...|
|掌握基本的摄影技能，了解图片新闻的...|          [艺术学, 新闻传播学]|C_584381|                  新闻摄影|             |[{1.1.1, V_1800, ...|
|  最有趣的理论+最有用的算法=不得不...|            [计算机科学与技术]|C_597208|      数据挖掘：理论与算法|             |[{1.1.1, V_2961, ...|
|大学计算机课程将以计算思维为导向，...|                            []|C_597225|                大学计算机|             |[{1.1.1, V_4596, ...|
+--------------------

In [5]:
# Load the school entities from JSON.
# `header` and `inferSchema` are CSV reader options; the JSON source does not
# use them, so they are omitted here to avoid suggesting they have any effect.
school = spark.read.json('school.json')
school.show(5)

+-----------------------------------+---+----------------------+--------+-------------------+----+
|                              about| id|                 motto|    name|            name_en|sign|
+-----------------------------------+---+----------------------+--------+-------------------+----+
|简称“清华”，由中华人民共和国教育...|S_1|     自强不息,厚德载物|清华大学|Tsinghua University| thu|
|          北京大学（Peking Unive...|S_2|博学、审问、慎思、明辨|北京大学|  Peking University| PKU|
|          武汉大学（Wuhan Univer...|S_3|   自强 弘毅 求是 拓新|武汉大学|   Wuhan University| whu|
|          苏州大学（Soochow Univ...|S_4|养天地正气，法古今完人|苏州大学| Soochow University|suda|
|          四川大学（Sichuan Univ...|S_5|                      |四川大学| Sichuan University| scu|
+-----------------------------------+---+----------------------+--------+-------------------+----+
only showing top 5 rows



In [6]:
# Load the raw course-school relation file: each input line becomes one row in
# a single string column named `value` (tab-separated "course_id<TAB>school_id").
# The CSV-only `header`/`inferSchema` options are not used by the text source,
# so they are dropped.
course_school = spark.read.text('course-school.txt')
course_school.show(5)

+-------------+
|        value|
+-------------+
|C_375629\tS_1|
|C_375775\tS_1|
|C_375778\tS_1|
|C_584313\tS_1|
|C_584329\tS_1|
+-------------+
only showing top 5 rows



In [7]:
# Split each raw "course_id<TAB>school_id" line into two named columns.
# The cell rebinds `course_school`, so the guard keeps it safe to re-run: once
# the raw `value` column has been replaced there is nothing left to split.
if 'value' in course_school.columns:
    id_pair = split(col('value'), '\t')  # build the split expression once
    course_school = course_school.select(
        id_pair[0].alias("course_id"),
        id_pair[1].alias("school_id"),
    )
course_school.show(5)

+---------+---------+
|course_id|school_id|
+---------+---------+
| C_375629|      S_1|
| C_375775|      S_1|
| C_375778|      S_1|
| C_584313|      S_1|
| C_584329|      S_1|
+---------+---------+
only showing top 5 rows



In [8]:
# Summary statistics for the relation table. Both columns are strings, so only
# count/min/max are populated; min/max compare the ids as text.
course_school.summary().show()

+-------+---------+---------+
|summary|course_id|school_id|
+-------+---------+---------+
|  count|     3983|     3983|
|   mean|     NULL|     NULL|
| stddev|     NULL|     NULL|
|    min|C_1017355|      S_1|
|    25%|     NULL|     NULL|
|    50%|     NULL|     NULL|
|    75%|     NULL|     NULL|
|    max| C_956450|     S_99|
+-------+---------+---------+



### 1. Filter

In [9]:
# Keep only relation rows whose course_id matches an id in the course table,
# then project back down to the two relation columns.
valid_course_school = (
    course_school
    .join(course, on=course_school.course_id == course.id, how='inner')
    .select('course_id', 'school_id')
)
valid_course_school.show(5)

+---------+---------+
|course_id|school_id|
+---------+---------+
| C_584313|      S_1|
| C_584329|      S_1|
| C_584381|      S_1|
| C_597208|      S_1|
| C_597225|      S_6|
+---------+---------+
only showing top 5 rows



In [10]:
# Second validity pass: keep only rows whose school_id matches an id in the
# school table, again retaining just the two relation columns.
fully_valid_course_school = (
    valid_course_school
    .join(school, on=valid_course_school.school_id == school.id, how='inner')
    .select('course_id', 'school_id')
)
fully_valid_course_school.show(5)

+---------+---------+
|course_id|school_id|
+---------+---------+
| C_584313|      S_1|
| C_584329|      S_1|
| C_584381|      S_1|
| C_597208|      S_1|
| C_597225|      S_6|
+---------+---------+
only showing top 5 rows



In [11]:
# Report how many relation rows survived both validity joins and how many were
# dropped. Each count is computed once and reused: every .count() is a Spark
# action, and the original evaluated the validated frame twice.
n_total_rows = course_school.count()
n_valid_rows = fully_valid_course_school.count()
print("Số lượng dòng dữ liệu hợp lệ:", n_valid_rows)
print("Số lượng dòng dữ liệu không hợp lệ:", n_total_rows - n_valid_rows)

Số lượng dòng dữ liệu hợp lệ: 3605
Số lượng dòng dữ liệu không hợp lệ: 378


In [12]:
# Per-school course counts, restricted to schools with at least 5 valid courses.
school_at_least_5 = (
    fully_valid_course_school
    .groupBy("school_id")
    .count()
    .filter(col("count") >= 5)
)
print("Số lượng school ban đầu: ", school.count())
print("Số lượng school có ít nhất 5 khóa học: ", school_at_least_5.count())

# Keep only relation rows that belong to those schools. Both sides carry a
# school_id column, so aliases are used to reference each side unambiguously.
filtered_course_school = (
    fully_valid_course_school.alias("a")
    .join(
        school_at_least_5.alias("b"),
        on=col("a.school_id") == col("b.school_id"),
        how="inner",
    )
    .select("a.course_id", "a.school_id")
)
filtered_course_school.show(5)

Số lượng school ban đầu:  429
Số lượng school có ít nhất 5 khóa học:  178
+---------+---------+
|course_id|school_id|
+---------+---------+
| C_696563|     S_21|
| C_682550|     S_21|
| C_681255|     S_21|
| C_680870|     S_21|
| C_597367|     S_21|
+---------+---------+
only showing top 5 rows



In [13]:
# Compare the number of relation rows before any filtering with the number
# remaining after the validity joins and the >= 5 courses-per-school filter.
print("Số lượng dòng dữ liệu ban đầu: ", course_school.count())
print("Số lượng dòng dữ liệu sau khi lọc: ", filtered_course_school.count())

Số lượng dòng dữ liệu ban đầu:  3983
Số lượng dòng dữ liệu sau khi lọc:  3117


### 2. Mapping

### School

In [14]:
# Unique school ids that remain after filtering; these are the ids to be mapped.
valid_schools = (
    filtered_course_school
    .select(col("school_id"))
    .distinct()
)
valid_schools.show()

+---------+
|school_id|
+---------+
|     S_21|
|    S_138|
|    S_319|
|    S_228|
|    S_486|
|    S_379|
|     S_14|
|     S_64|
|     S_83|
|    S_220|
|    S_131|
|    S_130|
|    S_218|
|    S_415|
|     S_92|
|    S_159|
|    S_494|
|    S_260|
|     S_84|
|     S_73|
+---------+
only showing top 20 rows



In [15]:
# Assign each school a sequential integer id, numbered in school_id order
# (string ordering, since the ids are text). row_number() starts at 1 and the
# 15940 offset is added on top, so the first mapped_id is 15941.
# NOTE(review): the offset presumably keeps school ids clear of another id
# range — confirm where 15940 comes from.
window_spec = Window.orderBy(col("school_id"))
df_mapped = valid_schools.withColumn(
    "mapped_id",
    row_number().over(window_spec) + lit(15940),
)

In [16]:
# Two-column lookup table: the original school id (renamed to original_id)
# alongside its newly assigned mapped_id. The last five rows are displayed.
mapping_df = df_mapped.select(
    col("school_id").alias("original_id"),
    col("mapped_id"),
)
mapping_df.tail(5)

[Row(original_id='S_92', mapped_id=16114),
 Row(original_id='S_964', mapped_id=16115),
 Row(original_id='S_97', mapped_id=16116),
 Row(original_id='S_98', mapped_id=16117),
 Row(original_id='S_99', mapped_id=16118)]

In [17]:
# Persist the school mapping as comma-separated CSV with a header row.
# coalesce(1) collapses the data to one partition so Spark emits a single part
# file; note the target "output/mapping_txt" is an output directory, and any
# previous contents are overwritten.
mapping_df.coalesce(1).write.csv(
    "output/mapping_txt",
    mode="overwrite",
    sep=",",
    header=True,
)

### Course-School

In [18]:
# Colab-only step: mount Google Drive at /content/drive so the mapping CSVs
# read in the next cell are reachable from the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Load the course and school id mappings (original_id -> mapped_id) from Drive.
# NOTE(review): the school mapping is read from Drive rather than from the
# "output/mapping_txt" directory written above — presumably it was copied there
# by hand; confirm both hold the same mapping.
course_mapping = spark.read.csv(
    '/content/drive/MyDrive/Big Data/Output/course_mapping.csv',
    header=True,
    inferSchema=True,
)
school_mapping = spark.read.csv(
    '/content/drive/MyDrive/Big Data/Output/school.csv',
    header=True,
    inferSchema=True,
)

In [20]:
# Preview the first 10 rows of each mapping table as a sanity check on the
# original_id / mapped_id columns before joining with them.
print("Course Mapping:")
course_mapping.show(10)
print("School Mapping:")
school_mapping.show(10)

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows

School Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|        S_1|    15941|
|       S_10|    15942|
|      S_100|    15943|
|      S_103|    15944|
|     S_1031|    15945|
|      S_104|    15946|
|      S_107|    15947|
|     S_1075|    15948|
|     S_1086|    15949|
|      S_109|    15950|
+-----------+---------+
only showing top 10 rows



In [21]:
# Step 1: translate each relation row's original school id into its mapped id.
# Start from filtered_course_school (schools with at least 5 valid courses) so
# the filter from section 1 is applied explicitly. The original joined
# fully_valid_course_school and relied on the inner join against whatever
# schools happen to be in the mapping file to drop the others.
school_mapped = filtered_course_school.join(
    school_mapping,
    col("school_id") == col("original_id"),
    'inner'
).select(
    col("mapped_id").alias("mapped_school_id"),
    col("course_id")
)
school_mapped.show(5)

+----------------+---------+
|mapped_school_id|course_id|
+----------------+---------+
|           15941| C_584313|
|           15941| C_584329|
|           15941| C_584381|
|           15941| C_597208|
|           16079| C_597225|
+----------------+---------+
only showing top 5 rows



In [22]:
# Step 2: translate each original course id into its mapped id, keeping the
# already-mapped school id next to it. Rows whose course id is absent from the
# course mapping are dropped by the inner join.
course_mapped = (
    school_mapped
    .join(course_mapping, on=col("course_id") == col("original_id"), how="inner")
    .select(
        col("mapped_school_id"),
        col("mapped_id").alias("mapped_course_id"),
    )
)
course_mapped.show(5)

+----------------+----------------+
|mapped_school_id|mapped_course_id|
+----------------+----------------+
|           15941|            1158|
|           15941|            1159|
|           15941|            1160|
|           16079|            1161|
|           15941|            1162|
+----------------+----------------+
only showing top 5 rows



In [23]:
# Step 3: build one space-separated text line per relation in the form
# "<mapped_course_id> 3 <mapped_school_id>".
# NOTE(review): the previous comment described the middle field as 0 (the
# teacher relation), which contradicts the literal 3 emitted here — confirm
# which relation id the course-school edge is supposed to use.
output_df = course_mapped.select(
    concat_ws(" ", "mapped_course_id", lit(3), "mapped_school_id").alias("output_line")
)

In [24]:
# Preview the first 5 formatted lines before writing them out.
output_df.show(5)

+------------+
| output_line|
+------------+
|1158 3 15941|
|1159 3 15941|
|1160 3 15941|
|1161 3 16079|
|1162 3 15941|
+------------+
only showing top 5 rows



In [25]:
# Write the formatted lines as plain text into the "course_school" output
# directory, replacing any previous run. coalesce(1) reduces the data to a
# single partition so only one part file is produced.
(
    output_df
    .select("output_line")
    .coalesce(1)
    .write
    .mode("overwrite")
    .text("course_school")
)