In [1]:
!wget -q https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/school.json

In [2]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('preprocessing data')
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import col, row_number, size, split, trim
from pyspark.sql.window import Window

In [4]:
# Load the school entities from the downloaded JSON file.
# No header/inferSchema options are passed: those are CSV-reader settings with no effect
# on the JSON source, which derives the schema from the records themselves.
school = spark.read.json('school.json')
school.show(5)

+-----------------------------------+---+----------------------+--------+-------------------+----+
|                              about| id|                 motto|    name|            name_en|sign|
+-----------------------------------+---+----------------------+--------+-------------------+----+
|简称“清华”，由中华人民共和国教育...|S_1|     自强不息,厚德载物|清华大学|Tsinghua University| thu|
|          北京大学（Peking Unive...|S_2|博学、审问、慎思、明辨|北京大学|  Peking University| PKU|
|          武汉大学（Wuhan Univer...|S_3|   自强 弘毅 求是 拓新|武汉大学|   Wuhan University| whu|
|          苏州大学（Soochow Univ...|S_4|养天地正气，法古今完人|苏州大学| Soochow University|suda|
|          四川大学（Sichuan Univ...|S_5|                      |四川大学| Sichuan University| scu|
+-----------------------------------+---+----------------------+--------+-------------------+----+
only showing top 5 rows



In [5]:
# Print the column names, types and nullability of the loaded school DataFrame.
school.printSchema()

root
 |-- about: string (nullable = true)
 |-- id: string (nullable = true)
 |-- motto: string (nullable = true)
 |-- name: string (nullable = true)
 |-- name_en: string (nullable = true)
 |-- sign: string (nullable = true)



In [6]:
# Per-column summary statistics. Every column is a string, so the numeric rows
# (mean, stddev, percentiles) come back as NULL; count and min are the informative ones.
school.summary().show()

+-------+-----------------------------------+----+------------------+------------+--------------------+----+
|summary|                              about|  id|             motto|        name|             name_en|sign|
+-------+-----------------------------------+----+------------------+------------+--------------------+----+
|  count|                                429| 429|               429|         429|                 429| 429|
|   mean|                               NULL|NULL|              NULL|        NULL|                NULL|NULL|
| stddev|                               NULL|NULL|              NULL|        NULL|                NULL|NULL|
|    min|        2018 年11 月1 日，中央编...| S_1|                  |   Microsoft|Air Force Enginee...|AFEU|
|    25%|                               NULL|NULL|              NULL|        NULL|                NULL|NULL|
|    50%|                               NULL|NULL|              NULL|        NULL|                NULL|NULL|
|    75%|                 

In [7]:
# Check null value
def check_missing_value(df, col_name):
    """Count the rows of `df` whose `col_name` value is missing.

    What counts as "missing" depends on the column's Spark type:

    * string columns: NULL, or empty once surrounding spaces are trimmed
    * array columns:  NULL, or an array with zero elements
    * anything else:  NULL only

    Args:
        df: Spark DataFrame to inspect.
        col_name: Name of the column to check.

    Returns:
        int: Number of rows with a missing value in `col_name`.

    Raises:
        KeyError: If `col_name` is not a column of `df`.
    """
    # df.dtypes is a list of (name, type-string) pairs; look up this column's type.
    field_type = dict(df.dtypes)[col_name]
    column = col(col_name)

    # NULL is missing for every type, so non-string/non-array columns get a real count
    # here instead of falling through and returning None.
    is_missing = column.isNull()

    if field_type == "string":
        # trim("") is still "", so this single test covers both empty and blank strings.
        is_missing = is_missing | (trim(column) == "")
    elif field_type.startswith("array"):
        is_missing = is_missing | (size(column) == 0)

    return df.filter(is_missing).count()

In [8]:
# Missing-value tally for every column of `school`, keyed by column name.
null_count = dict(
    (field, check_missing_value(school, field))
    for field in school.columns
)
null_count

{'about': 0, 'id': 0, 'motto': 282, 'name': 0, 'name_en': 0, 'sign': 0}

In [9]:
# Mount the user's Google Drive at /content/drive. This only works inside Google Colab
# and prompts for authorization on first run.
# NOTE(review): none of the cells visible here read from or write to the mount —
# confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Give every school a sequential integer ID that continues after MAPPED_ID_OFFSET:
# the first school gets MAPPED_ID_OFFSET + 1 (657817), not 1.
# NOTE(review): the offset presumably equals the number of IDs already assigned to
# other entities — confirm where 657816 comes from.
MAPPED_ID_OFFSET = 657816

# "id" is a string column, so the ordering is lexicographic ("S_10" sorts before "S_2").
# A window without partitionBy moves all rows into one partition; acceptable for this
# 429-row table.
window_spec = Window.orderBy("id")
df_mapped = school.withColumn(
    "mapped_id",
    row_number().over(window_spec) + MAPPED_ID_OFFSET,
)

In [12]:
# Keep just the two ID columns: the source school ID (renamed to original_id)
# and the integer ID assigned to it.
mapping_df = df_mapped.select(
    col("id").alias("original_id"),
    col("mapped_id"),
)

In [13]:
# Write the mapping as comma-separated text with a header row.
# Spark creates a *directory* named output/mapping_txt; coalesce(1) makes it hold a
# single part-*.csv data file. Any existing output at that path is replaced.
(
    mapping_df
    .coalesce(1)
    .write
    .option("header", True)
    .option("delimiter", ",")
    .mode("overwrite")
    .csv("output/mapping_txt")
)