In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=a9b03f657b2ff8e5119295c64563f78c384966d0784f50d48fdfce37ab66817b
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import os
import sys
import re
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [53]:
spark = SparkSession.builder \
    .appName("VRA_Table") \
    .config("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY") \
    .getOrCreate()

In [54]:
json_folder_path = "/content/AIR_CIA"
df_vra = spark.read.json(json_folder_path)

In [55]:
df_vra.show()

+-------------------+-------------------+-----------------+-------------------+---------------+--------------------+-------------------+----------------+---------+-------------------+-------------------+-----------+
|    ChegadaPrevista|        ChegadaReal|CódigoAutorização|CódigoJustificativa|CódigoTipoLinha|ICAOAeródromoDestino|ICAOAeródromoOrigem|ICAOEmpresaAérea|NúmeroVoo|    PartidaPrevista|        PartidaReal|SituaçãoVoo|
+-------------------+-------------------+-----------------+-------------------+---------------+--------------------+-------------------+----------------+---------+-------------------+-------------------+-----------+
|2021-11-12 08:30:00|2021-11-12 08:24:00|                0|                N/A|              X|                KORD|               SBGR|             UAL|     0844|2021-11-11 22:00:00|2021-11-11 22:14:00|  REALIZADO|
|2021-11-15 08:30:00|2021-11-15 08:05:00|                0|                N/A|              X|                KORD|               SBGR|

In [56]:
#replace special characters in the columns names
for col_name in df_vra.columns:
    new_col_name = col_name \
        .replace("ã", "a") \
        .replace("á", "a") \
        .replace("à", "a") \
        .replace("é", "e") \
        .replace("ê", "e") \
        .replace("í", "i") \
        .replace("ó", "o") \
        .replace("ô", "o") \
        .replace("õ", "o") \
        .replace("ú", "u") \
        .replace("ç", "c") \
        .replace("-", "_")

    df_vra  = df_vra.withColumnRenamed(col_name, new_col_name)

In [57]:
df_vra.show()

+-------------------+-------------------+-----------------+-------------------+---------------+--------------------+-------------------+----------------+---------+-------------------+-------------------+-----------+
|    ChegadaPrevista|        ChegadaReal|CodigoAutorizacao|CodigoJustificativa|CodigoTipoLinha|ICAOAerodromoDestino|ICAOAerodromoOrigem|ICAOEmpresaAerea|NumeroVoo|    PartidaPrevista|        PartidaReal|SituacaoVoo|
+-------------------+-------------------+-----------------+-------------------+---------------+--------------------+-------------------+----------------+---------+-------------------+-------------------+-----------+
|2021-11-12 08:30:00|2021-11-12 08:24:00|                0|                N/A|              X|                KORD|               SBGR|             UAL|     0844|2021-11-11 22:00:00|2021-11-11 22:14:00|  REALIZADO|
|2021-11-15 08:30:00|2021-11-15 08:05:00|                0|                N/A|              X|                KORD|               SBGR|

In [58]:
# Define a function to convert kebab case to snake case
def kebab_to_snake(column_name):
    return re.sub(r'(?<=[a-z])(?=[A-Z0-9])|(?<=[0-9])(?=[A-Z])', '_', column_name).lower()

# Get the current column names
columns = df_vra.columns

# Rename all columns using kebab_to_snake function
for column in columns:
    new_column_name = kebab_to_snake(column)
    df_vra = df_vra.withColumnRenamed(column, new_column_name)

# Show the DataFrame to verify the updated column names
df_vra.show()

+-------------------+-------------------+------------------+--------------------+-----------------+---------------------+--------------------+-----------------+----------+-------------------+-------------------+------------+
|   chegada_prevista|       chegada_real|codigo_autorizacao|codigo_justificativa|codigo_tipo_linha|icaoaerodromo_destino|icaoaerodromo_origem|icaoempresa_aerea|numero_voo|   partida_prevista|       partida_real|situacao_voo|
+-------------------+-------------------+------------------+--------------------+-----------------+---------------------+--------------------+-----------------+----------+-------------------+-------------------+------------+
|2021-11-12 08:30:00|2021-11-12 08:24:00|                 0|                 N/A|                X|                 KORD|                SBGR|              UAL|      0844|2021-11-11 22:00:00|2021-11-11 22:14:00|   REALIZADO|
|2021-11-15 08:30:00|2021-11-15 08:05:00|                 0|                 N/A|                X| 

In [59]:
#particular cases that needs to be forced:

df_vra = df_vra.withColumnRenamed("icaoaerodromo_destino", "icao_aerodromo_destino")
df_vra = df_vra.withColumnRenamed("icaoaerodromo_orige", "icao_aerodromo_orige")
df_vra = df_vra.withColumnRenamed("icaoempresa_aerea", "icao_empresa_aerea")

df_vra.show()

+-------------------+-------------------+------------------+--------------------+-----------------+----------------------+--------------------+------------------+----------+-------------------+-------------------+------------+
|   chegada_prevista|       chegada_real|codigo_autorizacao|codigo_justificativa|codigo_tipo_linha|icao_aerodromo_destino|icaoaerodromo_origem|icao_empresa_aerea|numero_voo|   partida_prevista|       partida_real|situacao_voo|
+-------------------+-------------------+------------------+--------------------+-----------------+----------------------+--------------------+------------------+----------+-------------------+-------------------+------------+
|2021-11-12 08:30:00|2021-11-12 08:24:00|                 0|                 N/A|                X|                  KORD|                SBGR|               UAL|      0844|2021-11-11 22:00:00|2021-11-11 22:14:00|   REALIZADO|
|2021-11-15 08:30:00|2021-11-15 08:05:00|                 0|                 N/A|           

In [67]:
df_vra.printSchema()
df_vra.show(truncate=False)

root
 |-- chegada_prevista: timestamp (nullable = true)
 |-- chegada_real: timestamp (nullable = true)
 |-- codigo_autorizacao: string (nullable = true)
 |-- codigo_justificativa: string (nullable = true)
 |-- codigo_tipo_linha: string (nullable = true)
 |-- icao_aerodromo_destino: string (nullable = true)
 |-- icaoaerodromo_origem: string (nullable = true)
 |-- icao_empresa_aerea: string (nullable = true)
 |-- numero_voo: integer (nullable = true)
 |-- partida_prevista: timestamp (nullable = true)
 |-- partida_real: timestamp (nullable = true)
 |-- situacao_voo: string (nullable = true)

+-------------------+-------------------+------------------+--------------------+-----------------+----------------------+--------------------+------------------+----------+-------------------+-------------------+------------+
|chegada_prevista   |chegada_real       |codigo_autorizacao|codigo_justificativa|codigo_tipo_linha|icao_aerodromo_destino|icaoaerodromo_origem|icao_empresa_aerea|numero_voo|part

In [61]:
# Convert 'chegada_prevista', 'partida_prevista', 'partida_real' to TimestampType
df_vra = df_vra.withColumn("chegada_prevista", col("chegada_prevista").cast(TimestampType()))
df_vra = df_vra.withColumn("chegada_real", col("chegada_prevista").cast(TimestampType()))
df_vra = df_vra.withColumn("partida_prevista", col("partida_prevista").cast(TimestampType()))
df_vra = df_vra.withColumn("partida_real", col("partida_real").cast(TimestampType()))

# Convert 'numero_voo' to IntegerType
df_vra = df_vra.withColumn("numero_voo", col("numero_voo").cast(IntegerType()))

In [62]:
df_vra.printSchema()

root
 |-- chegada_prevista: timestamp (nullable = true)
 |-- chegada_real: timestamp (nullable = true)
 |-- codigo_autorizacao: string (nullable = true)
 |-- codigo_justificativa: string (nullable = true)
 |-- codigo_tipo_linha: string (nullable = true)
 |-- icao_aerodromo_destino: string (nullable = true)
 |-- icaoaerodromo_origem: string (nullable = true)
 |-- icao_empresa_aerea: string (nullable = true)
 |-- numero_voo: integer (nullable = true)
 |-- partida_prevista: timestamp (nullable = true)
 |-- partida_real: timestamp (nullable = true)
 |-- situacao_voo: string (nullable = true)



In [63]:
#saving the files

# Coalesce the DataFrame to a single partition for efficient writing
df_vra = df_vra.coalesce(1)

In [65]:
# Save as Parquet
df_vra.write.option("encoding", "UTF-8").parquet("/content/parquet/enconded", mode="overwrite")

In [66]:
# Save as JSON
df_vra.write.json("/content/json", mode="overwrite",encoding="UTF-8")