Notebook responsável por criar a tabela com os dados de táxi na camada gold.

In [0]:
import pyspark.sql.types as T
import pyspark.sql.functions as F
import yaml

In [0]:
def create_update_column_metadata(table_name, file_path):
    """Apply column comments read from a YAML file to a table.

    The YAML file must have a top-level ``columns`` mapping of
    ``column name -> comment``. One
    ``ALTER TABLE ... ALTER COLUMN ... COMMENT`` statement is issued per entry
    through the notebook's ``spark`` session.

    This is best-effort: any failure is reported with ``print`` and is not
    re-raised, and processing stops at the first error.

    Args:
        table_name: Name of the table whose column comments are updated.
        file_path: Path to the YAML file holding the ``columns`` mapping.
    """
    try:
        # Explicit encoding: comments may contain non-ASCII text, and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(file_path, "r", encoding="utf-8") as file:
            column_comments = yaml.safe_load(file)["columns"]

        for column, comment in column_comments.items():
            # Escape backslashes and single quotes (Spark SQL's default
            # string-literal rules) so a comment containing an apostrophe
            # cannot terminate the SQL literal early.
            escaped_comment = str(comment).replace("\\", "\\\\").replace("'", "\\'")
            spark.sql(
                f"ALTER TABLE {table_name} ALTER COLUMN {column} COMMENT '{escaped_comment}'"
            )
    except Exception as e:
        print(f"Error during update column metadata: {e}")

In [0]:
# Output table in the gold layer.
gold_table_name = "ifood_case.gold.tb_taxi_trip_data"

# Input tables in the silver layer, one per taxi type.
source_table_name_green, source_table_name_yellow = (
    "ifood_case.silver.tb_green",
    "ifood_case.silver.tb_yellow",
)

In [0]:
# Load both silver tables as DataFrames.
df_green = spark.table(source_table_name_green)
df_yellow = spark.table(source_table_name_yellow)

In [0]:
# Silver columns carried into the gold table. Both taxi types share this single
# list so their projections cannot drift apart in content or order.
_GOLD_COLUMNS = (
    "id_vendor",
    "ts_pickup",
    "ts_dropoff",
    "nb_passenger_count",
    "vl_total_amount",
)


def _to_gold(df, taxi_type):
    """Select the gold columns from ``df`` and append ``cd_taxi_type``.

    Args:
        df: Silver DataFrame containing every column in ``_GOLD_COLUMNS``.
        taxi_type: Literal value stored in the ``cd_taxi_type`` column.

    Returns:
        A DataFrame with the gold columns followed by ``cd_taxi_type``.
    """
    return df.select(*_GOLD_COLUMNS, F.lit(taxi_type).alias("cd_taxi_type"))


df_green_gold = _to_gold(df_green, "green")
df_yellow_gold = _to_gold(df_yellow, "yellow")

In [0]:
# Stack green and yellow trips into one DataFrame. Columns are matched by name
# rather than by position, so reordering the select on either side cannot
# silently place values under the wrong column.
df_gold = df_green_gold.unionByName(df_yellow_gold)

In [0]:
# Persist the combined data as a Delta table, overwriting any existing contents.
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_table_name)
)

In [0]:
# Attach the column comments defined in the YAML file to the gold table.
create_update_column_metadata(
    table_name=gold_table_name,
    file_path="./metadata/tb_taxi_trip_data.yaml",
)