In [1]:
# Mount Google Drive at /content/drive so the files under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [3]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Course-Teacher")
    .getOrCreate()
)

## Read

In [4]:
# Load the course records from the JSON file on Drive; no schema is supplied, so Spark infers it.
course_df = spark.read.format("json").load("/content/drive/MyDrive/Big Data/Input/course.json")

In [5]:
# Print summary statistics (count, mean, stddev, min, max) for the course columns.
course_df.describe().show()

+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|summary|                               about|       id|                               name|             prerequisites|
+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|  count|                                3779|     3781|                               3781|                      3779|
|   mean|                1.587301587301592E19|     NULL|                               NULL|                     111.0|
| stddev|                4.199605255658078E19|     NULL|                               NULL|                      NULL|
|    min|                                    |C_1017355|                    Food Chemistry |                          |
|    max|（1）特色：课程资源建设，充分体现...| C_956450|（疾风计划）面向对象程序设计（C++）|高级语言程序设计、数据结构|
+-------+------------------------------------+---------+-----------------------------

In [6]:
# Print the column tree of the course data, including the nested array/struct fields.
course_df.printSchema()

root
 |-- about: string (nullable = true)
 |-- field: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prerequisites: string (nullable = true)
 |-- resource: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chapter: string (nullable = true)
 |    |    |-- resource_id: string (nullable = true)
 |    |    |-- titles: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [7]:
# Load the teacher records from the JSON file on Drive; no schema is supplied, so Spark infers it.
teacher_df = spark.read.format("json").load("/content/drive/MyDrive/Big Data/Input/teacher.json")

In [8]:
# Print summary statistics (count, mean, stddev, min, max) for the teacher columns.
teacher_df.describe().show()

+-------+------------------------------------+------+----------------------------------------+------------------+------------------+------------+
|summary|                               about|    id|                               job_title|              name|           name_en|    org_name|
+-------+------------------------------------+------+----------------------------------------+------------------+------------------+------------+
|  count|                               17018| 17018|                                   17018|             17018|              9525|       17018|
|   mean|                1.587312700277857...|  NULL|                                  1251.4|3337802.6666666665|             139.0|        NULL|
| stddev|                4.199600355396168...|  NULL|                      2416.7408425398035| 9999916.611229304|139.34130758680286|        NULL|
|    min|                                    |   T_1|                                        |     伊萨克·布赫曼|                 

In [9]:
# Print the column names and types of the teacher data.
teacher_df.printSchema()

root
 |-- about: string (nullable = true)
 |-- id: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- name: string (nullable = true)
 |-- name_en: string (nullable = true)
 |-- org_name: string (nullable = true)



In [10]:
# Read the course-teacher file as plain text: one row per line, in a single "value" column.
course_teacher_df = spark.read.format("text").load("/content/drive/MyDrive/Big Data/Input/course-teacher.txt")

In [11]:
# Preview the first 10 raw lines.
course_teacher_df.show(n=10)

+----------------+
|           value|
+----------------+
|C_323899\tT_5092|
|C_323899\tT_5092|
|C_324384\tT_6164|
|C_324384\tT_6472|
|C_324384\tT_6471|
|C_324384\tT_6473|
|C_324384\tT_6164|
|C_324384\tT_6472|
|C_324384\tT_6471|
|C_324384\tT_6473|
+----------------+
only showing top 10 rows



In [12]:
# Split every raw line on the tab character into a course id and a teacher id,
# then discard the raw "value" column.
# NOTE: this rebinds course_teacher_df, so the cell cannot be re-run on its own;
# the cell that reads the text file has to be run again first.
id_parts = split(col("value"), "\t")
course_teacher_df = (
    course_teacher_df
    .withColumn("course_id", id_parts.getItem(0))
    .withColumn("teacher_id", id_parts.getItem(1))
    .drop("value")
)
course_teacher_df.show(n=10)

+---------+----------+
|course_id|teacher_id|
+---------+----------+
| C_323899|    T_5092|
| C_323899|    T_5092|
| C_324384|    T_6164|
| C_324384|    T_6472|
| C_324384|    T_6471|
| C_324384|    T_6473|
| C_324384|    T_6164|
| C_324384|    T_6472|
| C_324384|    T_6471|
| C_324384|    T_6473|
+---------+----------+
only showing top 10 rows



## Filter

In [13]:
# Keep only the course-teacher rows whose course_id exists in course_df.
# A left-semi join is used as the filter: it returns just the left-hand columns
# (course_id, teacher_id) and emits each input row at most once, so a repeated
# id in course_df cannot multiply rows the way an inner join would.
valid_course_teacher_df = course_teacher_df.join(
    course_df,
    course_teacher_df.course_id == course_df.id,
    "left_semi"
)

In [14]:
# Next, keep only the rows whose teacher_id exists in teacher_df.
# A left-semi join is used as the filter: it returns just the left-hand columns
# (course_id, teacher_id) and emits each input row at most once, so a repeated
# id in teacher_df cannot multiply rows the way an inner join would.
fully_valid_df = valid_course_teacher_df.join(
    teacher_df,
    valid_course_teacher_df.teacher_id == teacher_df.id,
    "left_semi"
)

In [15]:
# Report how many rows passed both filters and how many were rejected.
# Each count() is a Spark action that re-runs the joins behind fully_valid_df,
# so every DataFrame is counted exactly once and the numbers are reused.
total_row_count = course_teacher_df.count()
valid_row_count = fully_valid_df.count()
print("Số lượng dòng dữ liệu hợp lệ:", valid_row_count)
print("Số lượng dòng dữ liệu không hợp lệ:", total_row_count - valid_row_count)

Số lượng dòng dữ liệu hợp lệ: 35593
Số lượng dòng dữ liệu không hợp lệ: 61599


## Mapping

### Teacher

In [16]:
# Collect the unique teacher ids that remain after both validity filters.
teacher_id_column = fully_valid_df.select("teacher_id")
valid_teachers = teacher_id_column.distinct()
valid_teachers.show()

+----------+
|teacher_id|
+----------+
|     T_163|
|     T_207|
|     T_506|
|    T_1261|
|    T_1432|
|    T_1685|
|    T_1829|
|    T_2032|
|    T_2318|
|    T_2423|
|    T_2417|
|    T_2477|
|    T_2834|
|    T_3020|
|    T_3416|
|    T_1503|
|    T_3736|
|    T_1754|
|     T_914|
|    T_6423|
+----------+
only showing top 20 rows



In [26]:
# Assign every valid teacher a sequential integer id.
# row_number() starts at 1, so with the offset below the first teacher gets 3148.
# NOTE(review): the offset presumably keeps teacher ids clear of the ids already
# used for other entities; confirm where 3147 comes from.
# teacher_id is a string, so the ordering (and therefore the numbering) is
# lexicographic: "T_10" sorts before "T_2".
# The window has no partitionBy, so Spark ranks all rows in a single partition.
TEACHER_ID_OFFSET = 3147
window_spec = Window.orderBy("teacher_id")
df_mapped = valid_teachers.withColumn("mapped_id", row_number().over(window_spec) + TEACHER_ID_OFFSET)

In [30]:
# Keep just the two mapping columns, exposing teacher_id under the name original_id.
mapping_df = df_mapped.select(
    col("teacher_id").alias("original_id"),
    col("mapped_id"),
)

In [32]:
# Write the mapping as comma-separated text with a header row.
# coalesce(1) collapses the data to one partition, so Spark emits a single part
# file inside the output/mapping_txt directory; existing output there is overwritten.
(
    mapping_df.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("delimiter", ",")
    .csv("output/mapping_txt")
)

### Course-Teacher

In [33]:
# Read the teacher and course id mappings (original_id -> mapped_id) from CSV files on Drive.
# The first line of each file is used as the header; no schema is inferred,
# so every column is read as a string.
# NOTE(review): no cell visible here writes these two files (the teacher mapping
# cell saves to output/mapping_txt); confirm how they are produced.
teacher_mapping_df = spark.read.csv(
    "/content/drive/MyDrive/Big Data/Output/teacher.csv",
    header=True,
)
course_mapping_df = spark.read.csv(
    "/content/drive/MyDrive/Big Data/Output/course_mapping.csv",
    header=True,
)

# Show the first rows of each mapping as a sanity check.
for mapping_label, mapping_preview_df in (
    ("Teacher Mapping:", teacher_mapping_df),
    ("Course Mapping:", course_mapping_df),
):
    print(mapping_label)
    mapping_preview_df.show(n=10)

Teacher Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|        T_1|     3148|
|       T_10|     3149|
|      T_100|     3150|
|     T_1000|     3151|
|    T_10000|     3152|
|    T_10001|     3153|
|    T_10002|     3154|
|    T_10003|     3155|
|    T_10004|     3156|
|    T_10005|     3157|
+-----------+---------+
only showing top 10 rows

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows



In [34]:
# Step 1: replace each original teacher id with its mapped id.
# The inner join drops rows whose teacher_id has no entry in the mapping.
teacher_id_matches = fully_valid_df.teacher_id == teacher_mapping_df.original_id
teacher_mapped_df = (
    fully_valid_df
    .join(teacher_mapping_df, on=teacher_id_matches, how="inner")
    .select(
        teacher_mapping_df.mapped_id.alias("mapped_teacher_id"),
        fully_valid_df.course_id,
    )
)

In [35]:
# Step 2: replace each original course id with its mapped id.
# The inner join drops rows whose course_id has no entry in the mapping.
course_id_matches = teacher_mapped_df.course_id == course_mapping_df.original_id
course_mapped_df = (
    teacher_mapped_df
    .join(course_mapping_df, on=course_id_matches, how="inner")
    .select(
        teacher_mapped_df.mapped_teacher_id,
        course_mapping_df.mapped_id.alias("mapped_course_id"),
    )
)

In [36]:
# Number of course-teacher pairs left after both ids have been mapped.
course_mapped_df.count()

33249

In [37]:
# Preview the first 10 mapped pairs.
course_mapped_df.show(n=10)

+-----------------+----------------+
|mapped_teacher_id|mapped_course_id|
+-----------------+----------------+
|             7266|            1158|
|             7157|            1158|
|             7375|            1159|
|             7906|            1160|
|             9365|            1161|
|             9259|            1161|
|             9468|            1162|
|             7047|            1165|
|            12547|            1172|
|             6816|            1171|
+-----------------+----------------+
only showing top 10 rows



In [38]:
# Step 3: build one text line per pair in the form "<course_id> 0 <teacher_id>",
# where the literal 0 in the middle is the code for the teacher relation.
relation_line = concat_ws(
    " ",
    col("mapped_course_id"),
    lit(0),
    col("mapped_teacher_id"),
)
output_df = course_mapped_df.select(relation_line.alias("output_line"))

In [39]:
# Preview the first 10 formatted lines.
output_df.show(n=10)

+------------+
| output_line|
+------------+
| 1158 0 7266|
| 1158 0 7157|
| 1159 0 7375|
| 1160 0 7906|
| 1161 0 9365|
| 1161 0 9259|
| 1162 0 9468|
| 1165 0 7047|
|1172 0 12547|
| 1171 0 6816|
+------------+
only showing top 10 rows



In [40]:
# Write the relation lines as plain text into the course_teacher directory.
# coalesce(1) collapses the data to one partition so a single part file is produced;
# existing output in that directory is overwritten.
(
    output_df
    .select("output_line")
    .coalesce(1)
    .write
    .mode("overwrite")
    .text("course_teacher")
)