In [0]:
import pyspark.sql.types as T
import pyspark.sql.functions as F
import yaml

In [0]:
def create_update_column_metadata(table_name, file_path):
    """Apply column comments from a YAML file to a table.

    The YAML document must contain a top-level ``columns`` mapping of
    column name -> comment text. One
    ``ALTER TABLE ... ALTER COLUMN ... COMMENT '...'`` statement is issued per
    entry through the notebook-global ``spark`` session.

    Args:
        table_name: Name of the table to alter; interpolated into the SQL as-is.
        file_path: Path of the YAML file holding the ``columns`` mapping.

    Returns:
        None. The update is best-effort: any exception (unreadable file,
        missing ``columns`` key, failing statement) is printed instead of
        raised, and the columns not yet processed are skipped.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            column_comments = yaml.safe_load(file)["columns"]

        # The file is only needed for parsing, so it is closed before any
        # statement is sent to Spark.
        for column, comment in column_comments.items():
            # The comment is embedded in a single-quoted SQL string literal.
            # Escape backslashes first and quotes second, so that a comment
            # such as "driver's fee" cannot terminate the literal early.
            # NOTE(review): assumes the default backslash-escape handling of
            # string literals (spark.sql.parser.escapedStringLiterals=false).
            escaped_comment = str(comment).replace("\\", "\\\\").replace("'", "\\'")
            spark.sql(
                f"ALTER TABLE {table_name} ALTER COLUMN {column} COMMENT '{escaped_comment}'"
            )
    except Exception as e:
        print(f"Error during update column metadata: {e}")

In [0]:
# Table names used by this notebook: two bronze inputs that are read and
# unioned, and the silver table that the result is written to.
# NOTE(review): the three-part names presumably follow catalog.schema.table,
# and the "202301" / "after_202301" suffixes presumably split the data by
# period — confirm against the bronze ingestion job.
source_table_name_after = "ifood_case.bronze.green_after_202301"
source_table_name = "ifood_case.bronze.green_202301"
silver_table_name = "ifood_case.silver.green"

In [0]:
# Load both bronze inputs as DataFrames (same read order as before).
df_after_202301, df_202301 = (
    spark.read.table(source_table_name_after),
    spark.read.table(source_table_name),
)

In [0]:
# Boolean rendering of the raw "store_and_fwd_flag" column:
#   "N" -> False, "Y" -> True, anything else (including null) -> null.
column_store_and_fwd_flag_expression = (
    F.when(F.col("store_and_fwd_flag") == F.lit("N"), F.lit(False))
    .when(F.col("store_and_fwd_flag") == F.lit("Y"), F.lit(True))
    .otherwise(F.lit(None))
)

In [0]:
# Projection from the bronze layout to the silver layout, declared once so
# both bronze inputs are guaranteed to produce the same columns in the same
# order (the union below is positional).
# Each entry is (source column, silver column, cast type or None to keep the
# source type).
_GREEN_SILVER_COLUMNS = [
    ("VendorID", "id_vendor", T.IntegerType()),
    ("lpep_pickup_datetime", "ts_pickup", None),
    ("lpep_dropoff_datetime", "ts_dropoff", None),
    ("passenger_count", "nb_passenger_count", None),
    ("trip_distance", "vl_trip_distance", None),
    ("RatecodeID", "id_rate_code", T.IntegerType()),
    ("fl_store_and_fwd", "fl_store_and_fwd", None),
    ("PULocationID", "id_pickup_location", T.IntegerType()),
    ("DOLocationID", "id_dropoff_location", T.IntegerType()),
    ("payment_type", "id_payment_type", T.IntegerType()),
    ("fare_amount", "vl_fare_amount", None),
    ("extra", "vl_extra", None),
    ("mta_tax", "vl_mta_tax", None),
    ("tip_amount", "vl_tip_amount", None),
    ("ehail_fee", "vl_ehail_fee", None),
    ("tolls_amount", "vl_tolls_amount", None),
    ("improvement_surcharge", "vl_improvement_surcharge", None),
    ("total_amount", "vl_total_amount", None),
    ("congestion_surcharge", "vl_congestion_surcharge", None),
    ("trip_type", "id_trip_type", T.IntegerType()),
]


def _silver_column(source_name, target_name, cast_type):
    """Build one output column: optional cast first, then the rename."""
    column = F.col(source_name)
    if cast_type is not None:
        column = column.cast(cast_type)
    # Alias last, so the silver name is attached to the final (cast)
    # expression instead of relying on Spark recovering the name through
    # the cast.
    if target_name != source_name:
        column = column.alias(target_name)
    return column


def _to_silver_layout(df):
    """Return ``df`` projected to the silver column names and types.

    Adds ``fl_store_and_fwd`` from ``column_store_and_fwd_flag_expression``
    and then selects exactly the columns listed in ``_GREEN_SILVER_COLUMNS``,
    in that order.

    Args:
        df: DataFrame that has the bronze source columns named in
            ``_GREEN_SILVER_COLUMNS`` plus ``store_and_fwd_flag``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    return (
        df
        .withColumn("fl_store_and_fwd", column_store_and_fwd_flag_expression)
        .select(*[_silver_column(*spec) for spec in _GREEN_SILVER_COLUMNS])
    )


# NOTE: both names are rebound from the bronze layout to the silver layout,
# so re-running this cell without re-running the load cell fails (the source
# column names such as "VendorID" no longer exist after the projection).
df_202301 = _to_silver_layout(df_202301)
df_after_202301 = _to_silver_layout(df_after_202301)

# NOTE(review): coalesce(1) collapses the unioned data into a single
# partition; confirm this is intended, since it limits the parallelism of the
# work that feeds it.
df_silver = df_after_202301.union(df_202301).coalesce(1)

In [0]:
# Remove exact duplicate rows, then rows in which every column is null.
df_silver = df_silver.dropDuplicates().dropna(how="all")

In [0]:
# Replace the contents of the silver table with the cleaned data (Delta format).
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(silver_table_name)
)

In [0]:
# Attach the column comments from the YAML file to the silver table.
# The path is relative, so it is resolved against the current working directory.
create_update_column_metadata(
    table_name=silver_table_name,
    file_path="./metadata/green.yaml",
)