In [None]:
## This notebook ingests the raw "races" data from the bronze layer into the silver layer
### Ingest f1_races_dlt

In [None]:
# Run parameters (these can be set in a workflow job).
target_type = oidlUtils.parameters.getParameter("TARGET_TYPE", "table")
target_format = oidlUtils.parameters.getParameter("TARGET_FORMAT", "delta")

# Catalog and schema names of the source (bronze) and target (silver) layers.
bronze_catalog = "f1_bronze"
bronze_schema = "bronze"
silver_catalog = "f1_silver"
silver_schema = "silver"

# Table names per storage format: *_dlt is written with format "delta" and
# *_par with format "parquet" in Step 4.
bronze_table_dlt = "f1_races_dlt"
bronze_table_par = "f1_races_par"
silver_table_dlt = "f1_races_dlt"
silver_table_par = "f1_races_par"

# NOTE(review): the 'file' target in Step 4 reads `silver_folder_path`, which is
# not defined in any visible cell of this notebook; confirm where it should come from.


# ----------
##### Step 1 - Read the bronze table into a DataFrame, then select and rename columns
# ----------

In [None]:
# Load the bronze races table (the *_dlt variant) by its catalog.schema.table name.
bronze_races_table = ".".join([bronze_catalog, bronze_schema, bronze_table_dlt])
races_df = spark.read.table(bronze_races_table)

In [None]:
from pyspark.sql.functions import col, current_timestamp
# (bronze column, silver column) pairs, listed in the order they are projected.
races_column_renames = [
    ("RACEID", "race_id"),
    ("YEAR", "race_year"),
    ("ROUND", "round"),
    ("NAME", "name"),
    ("DATE", "race_date"),
    ("TIME", "race_time"),
    ("CIRCUITREF", "circuit_id"),
]
races_selected_df = races_df.select(
    *[col(source_name).alias(target_name) for source_name, target_name in races_column_renames]
)

# ----------
##### Step 2 - Normalize the time format, then add ingestion_date and race_timestamp to the DataFrame
# ----------

In [None]:
from pyspark.sql.functions import current_timestamp, to_timestamp, concat, col, lit, lpad, length, split, concat_ws, expr, when

In [None]:
# Normalise race_time into a zero-padded HH:mm:ss string (column clean_race_time).
# - NULL, empty or whitespace-only values fall back to "00:00:00"
#   (the regex ^\s*$ also matches the empty string, so no separate == "" check is needed).
# - Otherwise the value is split on ':' and every part is cast to int and zero-padded.
#   A part that comes back NULL (e.g. "13:00" has no seconds part, or the part is not
#   numeric) is replaced by 0. Without the coalesce, format_string renders a NULL
#   argument as the text "null", which no longer matches the HH:mm:ss pattern used
#   when the timestamp is parsed.
# NOTE(review): with spark.sql.ansi.enabled=true an out-of-range array index or a
# non-numeric cast raises instead of yielding NULL, so the fallback only applies in
# non-ANSI mode; confirm the session setting.
races_with_clean_time_df = races_selected_df.withColumn(
    "clean_race_time",
    when(col("race_time").isNull() | col("race_time").rlike(r"^\s*$"), lit("00:00:00"))
    .otherwise(
        expr("""
            format_string(
                '%02d:%02d:%02d',
                coalesce(int(split(race_time, ':')[0]), 0),
                coalesce(int(split(race_time, ':')[1]), 0),
                coalesce(int(split(race_time, ':')[2]), 0)
            )
        """)
    )
)

In [None]:
# Join the race date and the cleaned time with a single space, then parse the result
# with the 'dd-MMM-yy HH:mm:ss' pattern. ingestion_date is set to current_timestamp().
race_datetime_text = concat(col('race_date'), lit(' '), col('clean_race_time'))
races_with_timestamp_df = (
    races_with_clean_time_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, 'dd-MMM-yy HH:mm:ss'))
)

# ----------
##### Step 3 - Select final columns required
# ----------

In [None]:
# Keep only the columns written to silver; race_date, race_time and clean_race_time
# are dropped here.
# NOTE: this rebinds races_selected_df, so the Step 2 cells cannot be re-run after
# this one without first re-running Step 1 (race_time is no longer present).
final_race_columns = [
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "ingestion_date",
    "race_timestamp",
]
races_selected_df = races_with_timestamp_df.select(*final_race_columns)

# ----------
##### Step 4 - Write the output to the silver layer as a Delta/Parquet table or as Parquet files
# ----------

In [None]:
# Persist the silver races data according to target_type / target_format.
# Supported combinations: file + parquet, table + parquet, table + delta.
# Any other combination raises a ValueError instead of silently writing nothing.
silver_table_by_format = {"parquet": silver_table_par, "delta": silver_table_dlt}

if target_type == "file" and target_format == "parquet":
    # NOTE(review): silver_folder_path is not defined in any visible cell; confirm its source.
    races_selected_df.write.mode("overwrite").parquet(f"{silver_folder_path}/races")
elif target_type == "table" and target_format in silver_table_by_format:
    silver_table_name = f"{silver_catalog}.{silver_schema}.{silver_table_by_format[target_format]}"
    races_selected_df.write.mode("overwrite").format(target_format).saveAsTable(silver_table_name)
else:
    raise ValueError(
        f"Unsupported target: TARGET_TYPE={target_type!r}, TARGET_FORMAT={target_format!r}. "
        "Supported combinations: file/parquet, table/parquet, table/delta."
    )

In [None]:
# Read the silver races data back from the location selected by target_type /
# target_format so it can be inspected.
# Supported combinations: file + parquet, table + parquet, table + delta.
# Any other combination raises a ValueError rather than leaving races_read_df
# undefined, which would surface later as an unrelated NameError.
silver_table_by_format = {"parquet": silver_table_par, "delta": silver_table_dlt}

if target_type == "file" and target_format == "parquet":
    # NOTE(review): silver_folder_path is not defined in any visible cell; confirm its source.
    races_read_df = spark.read.parquet(f"{silver_folder_path}/races")
elif target_type == "table" and target_format in silver_table_by_format:
    silver_table_name = f"{silver_catalog}.{silver_schema}.{silver_table_by_format[target_format]}"
    races_read_df = spark.read.table(silver_table_name)
else:
    raise ValueError(
        f"Unsupported target: TARGET_TYPE={target_type!r}, TARGET_FORMAT={target_format!r}. "
        "Supported combinations: file/parquet, table/parquet, table/delta."
    )

In [None]:
# Preview the read-back data: first 20 rows, long values truncated (show()'s defaults, spelled out).
races_read_df.show(n=20, truncate=True)