In [None]:
# Mount Google Drive at /content/drive so the input files under
# /content/drive/MyDrive/... are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [None]:
# Create (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("Course-Teacher")
    .getOrCreate()
)

## Read

In [None]:
# Load the course records from the JSON input on Drive.
course_df = spark.read.format("json").load("/content/drive/MyDrive/Big Data/Input/course.json")

In [None]:
# Print summary statistics (count / mean / stddev / min / max) for course_df.
# The array-typed columns (field, resource) do not appear in this summary.
course_df.describe().show()

+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|summary|                               about|       id|                               name|             prerequisites|
+-------+------------------------------------+---------+-----------------------------------+--------------------------+
|  count|                                3779|     3781|                               3781|                      3779|
|   mean|                1.587301587301592E19|     NULL|                               NULL|                     111.0|
| stddev|                4.199605255658078E19|     NULL|                               NULL|                      NULL|
|    min|                                    |C_1017355|                    Food Chemistry |                          |
|    max|（1）特色：课程资源建设，充分体现...| C_956450|（疾风计划）面向对象程序设计（C++）|高级语言程序设计、数据结构|
+-------+------------------------------------+---------+-----------------------------

In [None]:
# Print the column names and types of course_df.
course_df.printSchema()

root
 |-- about: string (nullable = true)
 |-- field: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- prerequisites: string (nullable = true)
 |-- resource: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chapter: string (nullable = true)
 |    |    |-- resource_id: string (nullable = true)
 |    |    |-- titles: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [None]:
# Load the teacher records from the JSON input on Drive.
teacher_df = spark.read.format("json").load("/content/drive/MyDrive/Big Data/Input/teacher.json")

In [None]:
# Print summary statistics for teacher_df.
teacher_df.describe().show()

+-------+------------------------------------+------+----------------------------------------+------------------+------------------+------------+
|summary|                               about|    id|                               job_title|              name|           name_en|    org_name|
+-------+------------------------------------+------+----------------------------------------+------------------+------------------+------------+
|  count|                               17018| 17018|                                   17018|             17018|              9525|       17018|
|   mean|                1.587312700277857...|  NULL|                                  1251.4|3337802.6666666665|             139.0|        NULL|
| stddev|                4.199600355396168...|  NULL|                      2416.7408425398035| 9999916.611229304|139.34130758680286|        NULL|
|    min|                                    |   T_1|                                        |     伊萨克·布赫曼|                 

In [None]:
# Print the column names and types of teacher_df.
teacher_df.printSchema()

root
 |-- about: string (nullable = true)
 |-- id: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- name: string (nullable = true)
 |-- name_en: string (nullable = true)
 |-- org_name: string (nullable = true)



In [None]:
# Number of rows in teacher_df.
teacher_df.count()

17018

In [None]:
# Load the course-teacher relation file as raw text: one row per line,
# held in a single string column named "value".
course_teacher_df = spark.read.format("text").load("/content/drive/MyDrive/Big Data/Input/course-teacher.txt")

In [None]:
# Preview the first 10 raw lines (tab-separated course id and teacher id).
course_teacher_df.show(10)

+----------------+
|           value|
+----------------+
|C_323899\tT_5092|
|C_323899\tT_5092|
|C_324384\tT_6164|
|C_324384\tT_6472|
|C_324384\tT_6471|
|C_324384\tT_6473|
|C_324384\tT_6164|
|C_324384\tT_6472|
|C_324384\tT_6471|
|C_324384\tT_6473|
+----------------+
only showing top 10 rows



In [None]:
# Split each raw line on the tab character: the first field becomes
# course_id, the second teacher_id; the raw "value" column is not kept.
# NOTE(review): the preview shows repeated (course_id, teacher_id) pairs and
# they are kept as-is here - confirm whether de-duplication is wanted before
# the per-teacher row count further down.
course_teacher_df = course_teacher_df.select(
    split(col("value"), "\t").getItem(0).alias("course_id"),
    split(col("value"), "\t").getItem(1).alias("teacher_id"),
)
course_teacher_df.show(10)

+---------+----------+
|course_id|teacher_id|
+---------+----------+
| C_323899|    T_5092|
| C_323899|    T_5092|
| C_324384|    T_6164|
| C_324384|    T_6472|
| C_324384|    T_6471|
| C_324384|    T_6473|
| C_324384|    T_6164|
| C_324384|    T_6472|
| C_324384|    T_6471|
| C_324384|    T_6473|
+---------+----------+
only showing top 10 rows



## Filter

In [None]:
# Keep only the (course_id, teacher_id) rows whose course_id exists in course_df.
# A left-semi join is a pure existence filter: it returns only the left-hand
# columns and never multiplies rows, even if an id appears more than once in
# course_df (an inner join would emit one output row per matching course row).
valid_course_teacher_df = course_teacher_df.join(
    course_df,
    course_teacher_df.course_id == course_df.id,
    "left_semi"
)

In [None]:
# Further keep only the rows whose teacher_id exists in teacher_df.
# A left-semi join is a pure existence filter: it returns only the left-hand
# columns (course_id, teacher_id) and never multiplies rows, even if an id
# appears more than once in teacher_df.
fully_valid_df = valid_course_teacher_df.join(
    teacher_df,
    valid_course_teacher_df.teacher_id == teacher_df.id,
    "left_semi"
)

In [None]:
# Report how many relation rows survived both id checks and how many did not.
# Each .count() triggers a Spark job that re-evaluates the joins, so count
# every DataFrame exactly once and reuse the numbers.
valid_row_count = fully_valid_df.count()
total_row_count = course_teacher_df.count()
print("Số lượng dòng dữ liệu hợp lệ:", valid_row_count)  # number of valid rows
print("Số lượng dòng dữ liệu không hợp lệ:", total_row_count - valid_row_count)  # number of invalid rows

Số lượng dòng dữ liệu hợp lệ: 35593
Số lượng dòng dữ liệu không hợp lệ: 61599


In [None]:
# Count relation rows per teacher and keep only teachers with at least 5 rows.
teacher_counts = fully_valid_df.groupBy("teacher_id").count()
teachers_at_least_5 = (
    teacher_counts
    .where(col("count") >= 5)
    .select("teacher_id")
)

# Restrict the relation rows to those teachers. teachers_at_least_5 holds one
# row per teacher_id (it comes from a groupBy), so this inner join only filters.
filtered_fully_valid_df = (
    fully_valid_df
    .join(teachers_at_least_5, on="teacher_id", how="inner")
    .select("course_id", "teacher_id")
)

In [None]:
# Row count after keeping only teachers that have at least 5 relation rows
# (the printed label reads "number of valid rows").
print("Số lượng dòng dữ liệu hợp lệ:", filtered_fully_valid_df.count())

Số lượng dòng dữ liệu hợp lệ: 12476


## Mapping

### Teacher

In [None]:
# Distinct teacher ids that remain after filtering; the cell displays how many.
valid_teachers = (
    filtered_fully_valid_df
    .select(col("teacher_id"))
    .distinct()
)
valid_teachers.count()

1883

In [None]:
# Give every teacher a dense integer id starting at 0, assigned in ascending
# teacher_id order. row_number() itself starts at 1, hence the "- 1".
window_spec = Window.orderBy(col("teacher_id"))
df_mapped = valid_teachers.withColumn(
    "mapped_id",
    row_number().over(window_spec) - 1,
)

In [None]:
# Show the last two rows of df_mapped to check the highest mapped_id.
df_mapped.tail(2)

[Row(teacher_id='T_9962', mapped_id=1881),
 Row(teacher_id='T_9969', mapped_id=1882)]

In [None]:
# Keep the two mapping columns, exposing teacher_id under the name original_id.
mapping_df = df_mapped.select(
    col("teacher_id").alias("original_id"),
    col("mapped_id"),
)

In [None]:
# Write the mapping as comma-separated CSV with a header row. coalesce(1)
# makes Spark emit a single part file; note that "output/mapping_txt" is the
# output *directory* that holds that part file, replaced on every run.
mapping_df.coalesce(1).write.csv(
    "output/mapping_txt",
    mode="overwrite",
    header=True,
    sep=",",
)

### Course-Teacher

In [None]:
# Read the teacher and course mapping CSV files (first line is the header).
teacher_mapping_df = spark.read.csv(
    "/content/drive/MyDrive/Big Data/Output/teacher.csv",
    header=True,
)
course_mapping_df = spark.read.csv(
    "/content/drive/MyDrive/Big Data/Output/course_mapping.csv",
    header=True,
)

# Display the first rows of each mapping as a sanity check.
print("Teacher Mapping:")
teacher_mapping_df.show(10)
print("Course Mapping:")
course_mapping_df.show(10)

Teacher Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|    T_10008|        0|
|    T_10087|        1|
|     T_1009|        2|
|     T_1010|        3|
|    T_10106|        4|
|     T_1011|        5|
|    T_10111|        6|
|     T_1012|        7|
|     T_1013|        8|
|     T_1018|        9|
+-----------+---------+
only showing top 10 rows

Course Mapping:
+-----------+---------+
|original_id|mapped_id|
+-----------+---------+
|  C_1017355|        0|
|  C_1017419|        1|
|  C_1025064|        2|
|  C_1025076|        3|
|  C_1025079|        4|
|  C_1073350|        5|
|  C_1123814|        6|
|  C_1123848|        7|
|  C_1123944|        8|
|  C_1123979|        9|
+-----------+---------+
only showing top 10 rows



In [None]:
# Step 1: translate teacher_id from the original id to its mapped id.
# The inner join also drops every row whose teacher has no entry in
# teacher_mapping_df.
teacher_mapped_df = (
    fully_valid_df
    .join(teacher_mapping_df, col("teacher_id") == col("original_id"), "inner")
    .select(
        col("mapped_id").alias("mapped_teacher_id"),
        col("course_id"),
    )
)

In [None]:
# Step 2: translate course_id from the original id to its mapped id.
# The inner join also drops every row whose course has no entry in
# course_mapping_df.
course_mapped_df = (
    teacher_mapped_df
    .join(course_mapping_df, col("course_id") == col("original_id"), "inner")
    .select(
        col("mapped_teacher_id"),
        col("mapped_id").alias("mapped_course_id"),
    )
)

In [None]:
# Number of (teacher, course) rows left after both id mappings.
course_mapped_df.count()

11082

In [None]:
# Preview the first 10 mapped (teacher, course) rows.
course_mapped_df.show(10)

+-----------------+----------------+
|mapped_teacher_id|mapped_course_id|
+-----------------+----------------+
|              525|            1158|
|              493|            1158|
|              542|            1159|
|              728|            1160|
|             1266|            1161|
|             1250|            1161|
|             1287|            1162|
|              472|            1165|
|             1830|            1172|
|              364|            1171|
+-----------------+----------------+
only showing top 10 rows



In [None]:
# Step 3: build one text line per row in the required format
# "<course_id> 0 <teacher_id>", where 0 is the id of the teacher relation.
output_df = course_mapped_df.select(
    concat_ws(" ", "mapped_course_id", lit(0), "mapped_teacher_id").alias("output_line")
)

In [None]:
# Preview the first 10 formatted output lines.
output_df.show(10)

+-----------+
|output_line|
+-----------+
| 1158 0 525|
| 1158 0 493|
| 1159 0 542|
| 1160 0 728|
|1161 0 1266|
|1161 0 1250|
|1162 0 1287|
| 1165 0 472|
|1172 0 1830|
| 1171 0 364|
+-----------+
only showing top 10 rows



In [None]:
# Write the formatted lines as plain text into the "course_teacher" directory,
# replacing any previous run. output_df has only the output_line column, and
# coalesce(1) makes Spark emit a single part file.
(
    output_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .text("course_teacher")
)