In [None]:
## This notebook ingests the raw "results" data from the bronze layer into the silver layer
### Ingest f1_results_dlt

In [None]:
# Job-level parameters; a workflow job can override the defaults given here.
target_type = oidlUtils.parameters.getParameter("TARGET_TYPE", "table")
target_format = oidlUtils.parameters.getParameter("TARGET_FORMAT", "delta")

# Source (bronze) and destination (silver) locations, as catalog / schema pairs.
bronze_catalog, bronze_schema = "f1_bronze", "bronze"
silver_catalog, silver_schema = "f1_silver", "silver"

# Table names per layer. The silver `_dlt` / `_par` names are the targets used
# for the delta and parquet table formats respectively.
bronze_table_dlt, bronze_table_par = "f1_results_dlt", "f1_results_par"
silver_table_dlt, silver_table_par = "f1_results_dlt", "f1_results_par"


# ----------
##### Step 1 - Read the bronze table into a DataFrame; select, rename, and cast columns (if needed).
# ----------

In [None]:
# Load the raw results table from the bronze layer, addressed as catalog.schema.table.
results_df = spark.read.table(
    f"{bronze_catalog}.{bronze_schema}.{bronze_table_dlt}"
)

In [None]:
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql.types import IntegerType, StringType, DoubleType, FloatType

# Projection spec for the silver table, keyed by bronze column name.
# Each value is (silver column name, Spark type to cast to); a falsy type
# such as None keeps the column's existing type.
column_transformations = {
    "RESULTID":        ("result_id",         IntegerType()),
    "RACEID":          ("race_id",           IntegerType()),
    "DRIVERID":        ("driver_id",         IntegerType()),
    "CONSTRUCTORID":   ("constructor_id",    IntegerType()),
    "F1NUM":           ("number",            IntegerType()),  # switch to StringType() if needed
    "GRID":            ("grid",              IntegerType()),
    "POSITION":        ("position",          IntegerType()),
    "POSITIONTEXT":    ("position_text",     StringType()),
    "POSITIONORDER":   ("position_order",    IntegerType()),
    "POINTS":          ("points",            FloatType()),
    "LAPS":            ("laps",              IntegerType()),
    "TIME":            ("time",              StringType()),
    "MILLISECONDS":    ("milliseconds",      IntegerType()),
    "FASTESTLAP":      ("fastest_lap",       IntegerType()),
    "RANK":            ("rank",              IntegerType()),
    "FASTESTLAPTIME":  ("fastest_lap_time",  StringType()),
    "FASTESTLAPSPEED": ("fastest_lap_speed", StringType()),
    "STATUSID":        ("status_id",         IntegerType()),
}

# One output column per mapping entry: cast only when a type is given,
# then rename to the silver column name.
transformed_columns = [
    (col(old).cast(dtype) if dtype else col(old)).alias(new)
    for old, (new, dtype) in column_transformations.items()
]

# Project the bronze DataFrame and append the load timestamp as the last column.
results_selected_df = results_df.select(
    *transformed_columns,
    current_timestamp().alias("ingestion_date"),
)

# ----------
##### Step 2 - Write the output to the silver layer as a delta/parquet table or as parquet files
# ----------

In [None]:
# Persist the transformed results to the silver layer, overwriting any previous output.
# Supported (target_type, target_format) combinations:
#   file  + parquet -> parquet files under {silver_folder_path}/results
#   table + parquet -> parquet-format table {silver_catalog}.{silver_schema}.{silver_table_par}
#   table + delta   -> delta-format table   {silver_catalog}.{silver_schema}.{silver_table_dlt}
# NOTE(review): silver_folder_path is not defined in the cells visible here —
# confirm it is provided elsewhere before using target_type == 'file'.
if target_type == 'file' and target_format == 'parquet':
    results_selected_df.write.mode("overwrite").parquet(f"{silver_folder_path}/results")
elif target_type == 'table' and target_format == 'parquet':
    results_selected_df.write.mode("overwrite").format("parquet").saveAsTable(
        f"{silver_catalog}.{silver_schema}.{silver_table_par}"
    )
elif target_type == 'table' and target_format == 'delta':
    results_selected_df.write.mode("overwrite").format("delta").saveAsTable(
        f"{silver_catalog}.{silver_schema}.{silver_table_dlt}"
    )
else:
    # Fail loudly instead of silently writing nothing for an unsupported or
    # mistyped parameter combination.
    raise ValueError(
        f"Unsupported TARGET_TYPE/TARGET_FORMAT combination: "
        f"{target_type!r}/{target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )

In [None]:
# Read back what was written to the silver layer so it can be inspected.
# Supported (target_type, target_format) combinations:
#   file  + parquet -> parquet files under {silver_folder_path}/results
#   table + parquet -> table {silver_catalog}.{silver_schema}.{silver_table_par}
#   table + delta   -> table {silver_catalog}.{silver_schema}.{silver_table_dlt}
# NOTE(review): silver_folder_path is not defined in the cells visible here —
# confirm it is provided elsewhere before using target_type == 'file'.
if target_type == 'file' and target_format == 'parquet':
    results_read_df = spark.read.parquet(f"{silver_folder_path}/results")
elif target_type == 'table' and target_format == 'parquet':
    results_read_df = spark.read.table(
        f"{silver_catalog}.{silver_schema}.{silver_table_par}"
    )
elif target_type == 'table' and target_format == 'delta':
    results_read_df = spark.read.table(
        f"{silver_catalog}.{silver_schema}.{silver_table_dlt}"
    )
else:
    # Without this branch results_read_df would stay unbound (or keep a stale
    # value from an earlier run) for an unsupported parameter combination.
    raise ValueError(
        f"Unsupported TARGET_TYPE/TARGET_FORMAT combination: "
        f"{target_type!r}/{target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )

In [None]:
# Preview the first 20 rows of the silver output, truncating long cell values.
results_read_df.show(n=20, truncate=True)