In [1]:
## This notebook ingests the raw "pit stops" data from the bronze layer into the silver layer
### Ingest f1_pit_stops_dlt

In [1]:
# Notebook parameters. TARGET_TYPE and TARGET_FORMAT can be set by a workflow job;
# the second argument to getParameter is presumably the fallback value used when
# the job does not supply one — confirm against the oidlUtils documentation.
target_type = oidlUtils.parameters.getParameter("TARGET_TYPE", "table")
target_format = oidlUtils.parameters.getParameter("TARGET_FORMAT", "delta")

# Source (bronze) location: catalog, schema and table names.
bronze_catalog = "f1_bronze"
bronze_schema = "bronze"
bronze_table_dlt = "f1_pit_stops_dlt"
bronze_table_par = "f1_pit_stops_par"

# Destination (silver) location: catalog, schema and table names.
silver_catalog = "f1_silver"
silver_schema = "silver"
silver_table_dlt = "f1_pit_stops_dlt"
silver_table_par = "f1_pit_stops_par"


# ----------
##### Step 1 - Read the bronze table into a DataFrame, then select, rename, and cast columns
# ----------

In [1]:
# Load the bronze pit-stops table, addressed as <catalog>.<schema>.<table>.
pit_stops_df = spark.read.table(
    f"{bronze_catalog}.{bronze_schema}.{bronze_table_dlt}"
)

In [1]:
# Preview the bronze rows; n=20 and truncate=True are show()'s defaults, spelled out.
pit_stops_df.show(n=20, truncate=True)

+------+--------+----+---+--------+--------+------------+
|RACEID|DRIVERID|STOP|LAP|    TIME|DURATION|MILLISECONDS|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
|   841|      22|   1| 13|17:24:29|  23.643|       23643|
|   841|      20|   1| 14|17:25:17|  22.603|       22603|
|   841|     814|   1| 14|17:26:03|  24.863|       24863|
|   841|     816|   1| 14|17:26:50|  25.259|       25259|
|   841|      67|   1| 15|17:27:34|  25.342|       25342|
|   841|       2|   1| 15|17:27:41|  22.994|       22994|
|   841|       1|   1| 16|17:28:24|  23.227|       23227|
|   841|     808|   1| 16|17:28:39|  24.535|       24535|
|   841|       3|   1| 16|17:29:00|  23.716|       23716|
|   841|     1

In [1]:
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql.types import IntegerType, StringType

# Column mapping for the silver table: (bronze column, target Spark type, silver column).
pit_stop_column_specs = [
    ("RACEID", IntegerType(), "race_id"),
    ("DRIVERID", IntegerType(), "driver_id"),
    ("STOP", StringType(), "stop"),
    ("LAP", IntegerType(), "lap"),
    ("TIME", StringType(), "time"),
    ("DURATION", StringType(), "duration"),
    ("MILLISECONDS", IntegerType(), "milliseconds"),
]

# Cast and rename each mapped column; columns not listed above are dropped.
typed_columns = [
    col(source_name).cast(spark_type).alias(target_name)
    for source_name, spark_type, target_name in pit_stop_column_specs
]

# Project the mapped columns and stamp every row with the ingestion timestamp.
pit_stops_selected_df = pit_stops_df.select(*typed_columns).withColumn(
    "ingestion_date", current_timestamp()
)

# ----------
##### Step 2 - Write the output to the processed container as a Delta/Parquet table or as Parquet files
# ----------

In [1]:
# Persist the cleaned pit-stops DataFrame, overwriting any previous output.
# Supported (target_type, target_format) combinations:
#   file  / parquet -> Parquet files under {silver_folder_path}/pit_stops
#   table / parquet -> Parquet-format table {silver_catalog}.{silver_schema}.{silver_table_par}
#   table / delta   -> Delta-format table   {silver_catalog}.{silver_schema}.{silver_table_dlt}
# Any other combination raises ValueError instead of silently writing nothing,
# so a mistyped parameter fails here rather than in a later cell.
if target_type == 'file' and target_format == 'parquet':
    # NOTE(review): silver_folder_path is not defined in the cells visible here —
    # confirm it is provided before running with TARGET_TYPE=file.
    pit_stops_selected_df.write.mode("overwrite").parquet(f"{silver_folder_path}/pit_stops")
elif target_type == 'table' and target_format == 'parquet':
    pit_stops_selected_df.write.mode("overwrite").format("parquet").saveAsTable(
        f"{silver_catalog}.{silver_schema}.{silver_table_par}"
    )
elif target_type == 'table' and target_format == 'delta':
    pit_stops_selected_df.write.mode("overwrite").format("delta").saveAsTable(
        f"{silver_catalog}.{silver_schema}.{silver_table_dlt}"
    )
else:
    raise ValueError(
        f"Unsupported target combination: TARGET_TYPE={target_type!r}, "
        f"TARGET_FORMAT={target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )

In [1]:
# Read the silver output back for verification, from the location that matches
# the configured target:
#   file  / parquet -> Parquet files under {silver_folder_path}/pit_stops
#   table / parquet -> {silver_catalog}.{silver_schema}.{silver_table_par}
#   table / delta   -> {silver_catalog}.{silver_schema}.{silver_table_dlt}
# Any other combination raises ValueError; otherwise pit_stops_read_df would be
# left undefined and the following cell would fail with an unrelated NameError.
if target_type == 'file' and target_format == 'parquet':
    # NOTE(review): silver_folder_path is not defined in the cells visible here —
    # confirm it is provided before running with TARGET_TYPE=file.
    pit_stops_read_df = spark.read.parquet(f"{silver_folder_path}/pit_stops")
elif target_type == 'table' and target_format == 'parquet':
    pit_stops_read_df = spark.read.table(
        f"{silver_catalog}.{silver_schema}.{silver_table_par}"
    )
elif target_type == 'table' and target_format == 'delta':
    pit_stops_read_df = spark.read.table(
        f"{silver_catalog}.{silver_schema}.{silver_table_dlt}"
    )
else:
    raise ValueError(
        f"Unsupported target combination: TARGET_TYPE={target_type!r}, "
        f"TARGET_FORMAT={target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )

In [1]:
# Preview the rows read back from silver; n=20 and truncate=True are show()'s defaults.
pit_stops_read_df.show(n=20, truncate=True)

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2025-11-14 10:23:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2025-11-14 10:23:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2025-11-14 10:23:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2025-11-14 10:23:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2025-11-14 10:23:...|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2025-11-14 10:23:...|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2025-11-14 10:23:...|
|    841|      814|   1| 14|17:26:03|  24.863|       24863|2025-11-14 10:23:...|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2025-11-14 10:23:...|
|    841|       67|   1| 15|