In [0]:
import pyspark.sql.types as T
import pyspark.sql.functions as F
import yaml

In [0]:
def create_update_column_metadata(table_name, file_path):
    """Apply column comments from a YAML file to an existing table.

    The YAML document must contain a top-level ``columns`` mapping of
    column name -> comment text. One
    ``ALTER TABLE ... ALTER COLUMN ... COMMENT '...'`` statement is issued per
    entry through the notebook's global ``spark`` session.

    Args:
        table_name: Name of the table to alter. Interpolated into the SQL
            as-is, so it must come from a trusted source.
        file_path: Path to the YAML metadata file.

    Returns:
        None. Any failure (unreadable file, missing ``columns`` key, SQL
        error) is reported with ``print`` and not re-raised; processing stops
        at the first failure.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            column_comments = yaml.safe_load(file)["columns"]

        # The file is already closed here; no handle is held while SQL runs.
        for column, comment in column_comments.items():
            # The comment is embedded in a single-quoted SQL string literal.
            # Escape backslashes first, then single quotes, so text such as
            # "driver's pay" cannot terminate the literal early.
            escaped_comment = str(comment).replace("\\", "\\\\").replace("'", "\\'")
            spark.sql(
                f"ALTER TABLE {table_name} ALTER COLUMN {column} COMMENT '{escaped_comment}'"
            )
    except Exception as e:
        print(f"Error during update column metadata: {e}")

In [0]:
# Bronze inputs: two source tables that are combined into one silver table.
source_table_name = "ifood_case.bronze.fhvhv_202301"
source_table_name_after = "ifood_case.bronze.fhvhv_after_202301"

# Silver output table written at the end of the notebook.
silver_table_name = "ifood_case.silver.fhvhv"

In [0]:
# Read both bronze tables through the notebook's global `spark` session.
df_202301 = spark.read.table(source_table_name)
df_after_202301 = spark.read.table(source_table_name_after)

In [0]:
def _yn_flag_to_boolean(source_column):
    """Build a Column that maps a "Y"/"N" flag column to a boolean.

    Args:
        source_column: Name of the column holding the "Y"/"N" values.

    Returns:
        A Column expression that is True for "Y", False for "N" and null for
        any other value (including null).
    """
    flag = F.col(source_column)
    return (
        F.when(flag == "N", False)
         .when(flag == "Y", True)
         .otherwise(None)
    )


# One boolean expression per bronze flag column.
column_shared_request_flag = _yn_flag_to_boolean("shared_request_flag")
column_shared_match_flag = _yn_flag_to_boolean("shared_match_flag")
column_access_a_ride_flag = _yn_flag_to_boolean("access_a_ride_flag")
column_wav_request_flag = _yn_flag_to_boolean("wav_request_flag")
column_wav_match_flag = _yn_flag_to_boolean("wav_match_flag")

# New column name -> expression, in the shape expected by DataFrame.withColumns.
with_columns_dict = {
    "fl_shared_request": column_shared_request_flag,
    "fl_shared_match": column_shared_match_flag,
    "fl_access_a_ride": column_access_a_ride_flag,
    "fl_wav_request": column_wav_request_flag,
    "fl_wav_match": column_wav_match_flag,
}

In [0]:

# (bronze column, silver column, silver type) for every non-flag column, in
# the order the columns appear in the silver table.
SILVER_COLUMN_SPECS = [
    ("hvfhs_license_num", "ds_hvfhs_license_number", T.StringType()),
    ("dispatching_base_num", "ds_dispatching_base_number", T.StringType()),
    ("originating_base_num", "ds_originating_base_number", T.StringType()),
    ("request_datetime", "ts_request", T.TimestampType()),
    ("on_scene_datetime", "ts_on_scene", T.TimestampType()),
    ("pickup_datetime", "ts_pickup", T.TimestampType()),
    ("dropOff_datetime", "ts_dropoff", T.TimestampType()),
    ("PULocationID", "id_pickup_location", T.IntegerType()),
    ("DOLocationID", "id_dropoff_location", T.IntegerType()),
    ("trip_miles", "vl_trip_miles", T.DoubleType()),
    ("trip_time", "vl_trip_time", T.LongType()),
    ("base_passenger_fare", "vl_base_passenger_fare", T.DoubleType()),
    ("tolls", "vl_tolls", T.DoubleType()),
    ("bcf", "vl_bcf", T.DoubleType()),
    ("sales_tax", "vl_sales_tax", T.DoubleType()),
    ("congestion_surcharge", "vl_congestion_surcharge", T.DoubleType()),
    ("airport_fee", "vl_airport_fee", T.DoubleType()),
    ("tips", "vl_tips", T.DoubleType()),
    ("driver_pay", "vl_driver_pay", T.DoubleType()),
]

# Boolean flag columns added by `with_columns_dict`, kept last in the output.
SILVER_FLAG_COLUMNS = [
    "fl_shared_request",
    "fl_shared_match",
    "fl_access_a_ride",
    "fl_wav_request",
    "fl_wav_match",
]

# Number of partitions the combined DataFrame is shuffled into.
SILVER_PARTITION_COUNT = 10


def _to_silver_schema(df_bronze):
    """Project a bronze DataFrame onto the silver column names and types.

    Adds the boolean flag columns from `with_columns_dict`, then selects every
    column listed in `SILVER_COLUMN_SPECS` (cast and renamed) followed by the
    flag columns.

    Args:
        df_bronze: DataFrame holding the bronze columns named in
            `SILVER_COLUMN_SPECS` and the flag columns that
            `with_columns_dict` reads.

    Returns:
        A new DataFrame with the silver schema; `df_bronze` is not modified.
    """
    typed_columns = [
        # Cast first, then alias, so the alias is the outermost expression.
        F.col(bronze_name).cast(silver_type).alias(silver_name)
        for bronze_name, silver_name, silver_type in SILVER_COLUMN_SPECS
    ]
    flag_columns = [F.col(flag_name) for flag_name in SILVER_FLAG_COLUMNS]
    return df_bronze.withColumns(with_columns_dict).select(*typed_columns, *flag_columns)


# The results go to new names instead of rebinding the inputs, so this cell
# can be re-run: the bronze frames still carry their original columns.
df_202301_typed = _to_silver_schema(df_202301)
df_after_202301_typed = _to_silver_schema(df_after_202301)

# Both frames come from the same projection, so their columns line up for the
# positional union.
df_silver = df_202301_typed.union(df_after_202301_typed).repartition(SILVER_PARTITION_COUNT)

In [0]:
# Drop rows that are exact duplicates across all columns, then discard any
# row in which every column is null.
df_silver_deduplicated = df_silver.dropDuplicates()
df_silver = df_silver_deduplicated.dropna(how="all")

In [0]:
# Save the result as a Delta table, overwriting whatever the table held before.
silver_writer = df_silver.write.format("delta").mode("overwrite")
silver_writer.saveAsTable(silver_table_name)

In [0]:
# Attach the column comments from the YAML metadata file to the silver table.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory - confirm.
create_update_column_metadata(
    table_name=silver_table_name,
    file_path="./metadata/fhvhv.yaml",
)