In [0]:
# General-purpose imports.
import datetime
import requests
import json  # Needed to read the response returned by the weather API
# Spark column functions.
# NOTE: the `sum` imported below replaces Python's built-in sum() in this
# notebook's namespace; `sum(...)` here refers to the Spark aggregate function.
from pyspark.sql.functions import (
    col,            
    lit,            
    when,           
    to_timestamp,   
    date_format,    
    hour,           
    dayofweek,      
    count,          
    sum             
)
# Spark data types for declaring DataFrame schemas.
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
    TimestampType
)

# Confirmation message; only reached if every import above succeeded.
print("Toolbox opened successfully 🧙‍♂️")

In [0]:
# Location of the raw VRA CSV file to ingest.
file_loc = "/Volumes/personal_projects/gru_airport/raw_data_vra/VRA_2025_01.csv"

# (source column name, Spark type) pairs, listed in the order the schema
# declares them. Every column is nullable.
vra_fields = [
    ("Sigla ICAO Empresa Aérea", StringType()),
    ("Empresa Aérea", StringType()),
    ("Número Voo", IntegerType()),
    ("Código DI", StringType()),
    ("Código Tipo Linha", StringType()),
    ("Modelo Equipamento", StringType()),
    ("Número de Assentos", IntegerType()),  # numeric value, usable in calculations
    ("Sigla ICAO Aeroporto Origem", StringType()),
    ("Descrição Aeroporto Origem", StringType()),
    # Date/time columns are kept as strings here (Brazilian format) and are
    # converted to a standard format in a later step.
    ("Partida Prevista", StringType()),
    ("Partida Real", StringType()),
    ("Sigla ICAO Aeroporto Destino", StringType()),
    ("Descrição Aeroporto Destino", StringType()),
    ("Chegada Prevista", StringType()),
    ("Chegada Real", StringType()),
    ("Situação Voo", StringType()),
    ("Justificativa", StringType()),
    ("Referência", StringType()),
    ("Situação Partida", StringType()),
    ("Situação Chegada", StringType()),
]

# Explicit schema for the VRA file, built from the pairs above.
schema_vra = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in vra_fields]
)

# Read the semicolon-delimited CSV (first line is a header) with the explicit schema.
csv_reader = spark.read.format("csv").options(header="true", delimiter=";")
df_vra = csv_reader.schema(schema_vra).load(file_loc)

display(df_vra)

In [0]:
# During development, df_vra.write failed because the column names contained
# characters such as " ,;{}". The columns are therefore renamed to clean
# snake_case identifiers before the table is saved.
# The names below are positional: the i-th name replaces the i-th column of df_vra.
new_columns = """
    sigla_icao_empresa_aerea
    empresa_aerea
    numero_voo
    codigo_di
    codigo_tipo_linha
    modelo_equipamento
    numero_assentos
    sigla_icao_origem
    descricao_origem
    partida_prevista
    partida_real
    sigla_icao_destino
    descricao_destino
    chegada_prevista
    chegada_real
    situacao_voo
    justificativa
    referencia
    situacao_partida
    situacao_chegada
""".split()

df_vra_renamed = df_vra.toDF(*new_columns)

# Save the renamed DataFrame as a Delta table in the bronze layer,
# replacing any existing contents.
# Table path format: catalog.schema.table_name
bronze_table = "personal_projects.gru_airport.bronze_vra"
bronze_writer = df_vra_renamed.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable(bronze_table)

print("Bronze Table saved successfully 🥉")