## This notebook reads raw data from the drivers table in the bronze layer, does some cleansing and enrichment, and writes it to the silver layer

In [1]:
# Job parameters: these can be set by a workflow job.
# The second argument is presumably the fallback value when the parameter is
# not supplied - TODO confirm against the oidlUtils documentation.
target_type = oidlUtils.parameters.getParameter("TARGET_TYPE", "table")
target_format = oidlUtils.parameters.getParameter("TARGET_FORMAT", "delta")

# Source (bronze) catalog, schema and table names.
bronze_catalog = "f1_bronze"
bronze_schema = "bronze"
bronze_table_dlt = "f1_drivers_dlt"
bronze_table_par = "f1_drivers_par"

# Destination (silver) catalog, schema and table names.
silver_catalog = "f1_silver"
silver_schema = "silver"
silver_table_dlt = "f1_drivers_dlt"
silver_table_par = "f1_drivers_par"

# ----------
##### Step 1 - Read Bronze table
#  ----------

In [1]:
# Fully qualified name of the bronze drivers table.
# NOTE(review): the "_dlt" table is read regardless of TARGET_FORMAT - confirm this is intended.
bronze_drivers_table = f"{bronze_catalog}.{bronze_schema}.{bronze_table_dlt}"
drivers_df = spark.read.table(bronze_drivers_table)

# ----------
##### Step 2 - Rename columns and add new columns
#####  driverId renamed to driver_id  
#####  driverRef renamed to driver_ref  
#####  ingestion date added
#####  driver_full_name added as the concatenation of forename and surname
# ----------

In [1]:
from pyspark.sql.functions import col, concat, current_timestamp, lit, concat_ws, trim

In [1]:
# Full name: forename and surname joined by a single space, then stripped of
# surrounding whitespace.
full_name_expr = trim(concat_ws(" ", col("forename"), col("surname")))

# Rename the camelCase columns to snake_case, then append the derived full
# name and an ingestion timestamp.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("driver_full_name", full_name_expr)
    .withColumn("ingestion_date", current_timestamp())
)

# ----------
##### Step 3 - Drop the unwanted columns
# ----------

In [1]:
# Drop the columns that are not wanted in the silver output.
# Columns are given by name: passing several Column objects to drop() raises
# TypeError on PySpark releases before 3.4, which only accept multiple
# arguments as strings. String names work on every version.
drivers_final_df = drivers_with_columns_df.drop("url", "number", "forename", "surname")

# ----------
##### Step 4 - Write the output to the silver layer (as a file or a table, in the configured format)
# ----------

In [1]:
# Write the cleaned drivers data to the silver layer, overwriting any previous output.
# Supported combinations of (target_type, target_format):
#   ("file",  "parquet") -> parquet files under {silver_folder_path}/drivers
#   ("table", "parquet") -> managed table named by silver_table_par
#   ("table", "delta")   -> managed table named by silver_table_dlt
# Any other combination raises ValueError instead of silently writing nothing.
silver_table_by_format = {"parquet": silver_table_par, "delta": silver_table_dlt}

if target_type == "file" and target_format == "parquet":
    # NOTE(review): silver_folder_path is not defined in any cell visible in this
    # notebook; it must be defined before running with TARGET_TYPE=file,
    # otherwise this line raises NameError.
    drivers_final_df.write.mode("overwrite").parquet(f"{silver_folder_path}/drivers")
elif target_type == "table" and target_format in silver_table_by_format:
    silver_table_name = silver_table_by_format[target_format]
    (
        drivers_final_df.write
        .mode("overwrite")
        .format(target_format)
        .saveAsTable(f"{silver_catalog}.{silver_schema}.{silver_table_name}")
    )
else:
    raise ValueError(
        f"Unsupported target combination: TARGET_TYPE={target_type!r}, "
        f"TARGET_FORMAT={target_format!r}"
    )

In [1]:
# Read back what was written to the silver layer, using the same
# (target_type, target_format) combinations as the write step.
# An unsupported combination raises ValueError so that drivers_read_df is never
# left undefined (or stale from an earlier run of this cell).
silver_read_table_by_format = {"parquet": silver_table_par, "delta": silver_table_dlt}

if target_type == "file" and target_format == "parquet":
    # NOTE(review): silver_folder_path is not defined in any cell visible in this
    # notebook; it must be defined before running with TARGET_TYPE=file,
    # otherwise this line raises NameError.
    drivers_read_df = spark.read.parquet(f"{silver_folder_path}/drivers")
elif target_type == "table" and target_format in silver_read_table_by_format:
    silver_read_table_name = silver_read_table_by_format[target_format]
    drivers_read_df = spark.read.table(
        f"{silver_catalog}.{silver_schema}.{silver_read_table_name}"
    )
else:
    raise ValueError(
        f"Unsupported target combination: TARGET_TYPE={target_type!r}, "
        f"TARGET_FORMAT={target_format!r}"
    )