In [1]:
## This notebook ingests the raw "results" oracle redbull file from the bronze volume into the bronze table
### Ingest results.csv file

In [1]:
# Run parameters (can be set from a workflow job).
# NOTE(review): the second argument to getParameter is presumably the fallback
# used when the job does not supply the parameter — confirm against oidlUtils.
target_type = oidlUtils.parameters.getParameter("TARGET_TYPE", "table")
target_format = oidlUtils.parameters.getParameter("TARGET_FORMAT", "delta")

# Folder in the bronze volume that holds results.csv (and any parquet file output).
bronze_folder_path = "/Volumes/f1_bronze/bronze/f1_bronze_volume"

# Parts of the three-part target table name: <catalog>.<schema>.<table>.
bronze_catalog, bronze_schema = "f1_bronze", "bronze"

# Table written when the target format is delta / parquet respectively.
bronze_table_dlt, bronze_table_par = "f1_results_dlt", "f1_results_par"


# ----------
##### Step 1 - Read the CSV file using the spark dataframe reader
#  ----------

In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, LongType

# Explicit schema for results.csv, declared as (column name, Spark type) pairs.
# Every column is nullable.
# NOTE(review): POINTS is read as an integer; if the source can carry fractional
# points they would not parse as integers — confirm against the data.
_results_columns = [
    ("RESULTID", IntegerType()),
    ("RACEID", IntegerType()),
    ("DRIVERID", IntegerType()),
    ("CONSTRUCTORID", IntegerType()),
    ("F1NUM", StringType()),
    ("GRID", IntegerType()),
    ("POSITION", StringType()),          # can be 'R' or a number
    ("POSITIONTEXT", StringType()),
    ("POSITIONORDER", IntegerType()),
    ("POINTS", IntegerType()),
    ("LAPS", IntegerType()),
    ("TIME", StringType()),              # can be null or a float string
    ("MILLISECONDS", LongType()),
    ("FASTESTLAP", IntegerType()),
    ("RANK", IntegerType()),
    ("FASTESTLAPTIME", StringType()),    # '01:45.6' format
    ("FASTESTLAPSPEED", DoubleType()),
    ("STATUSID", IntegerType()),
]

results_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _results_columns]
)


In [1]:
# Read results.csv from the bronze folder using the explicit results_schema.
results_csv_path = f"{bronze_folder_path}/results.csv"

results_reader = (
    spark.read
    .schema(results_schema)
    # The file has a header row; the literal \N is interpreted as null.
    .options(header=True, nullValue="\\N")
)

results_df = results_reader.csv(results_csv_path)

# ----------
##### Step 2 - Write the output to the bronze catalog as a parquet file or a delta/parquet table
#  ----------

In [1]:
# Persist results_df to the target selected by (target_type, target_format).
# Supported combinations:
#   file  + parquet -> parquet files at <bronze_folder_path>/results_par
#   table + parquet -> parquet table <bronze_catalog>.<bronze_schema>.<bronze_table_par>
#   table + delta   -> delta table   <bronze_catalog>.<bronze_schema>.<bronze_table_dlt>
# Every write overwrites the existing target.
results_writer = results_df.write.mode("overwrite")

if target_type == 'file' and target_format == 'parquet':
    results_writer.parquet(f"{bronze_folder_path}/results_par")
elif target_type == 'table' and target_format == 'parquet':
    results_writer.format("parquet").saveAsTable(f"{bronze_catalog}.{bronze_schema}.{bronze_table_par}")
elif target_type == 'table' and target_format == 'delta':
    results_writer.format("delta").saveAsTable(f"{bronze_catalog}.{bronze_schema}.{bronze_table_dlt}")
else:
    # Fail loudly: an unsupported combination (or a typo in a job parameter)
    # would otherwise let the run finish without writing anything.
    raise ValueError(
        f"Unsupported target: TARGET_TYPE={target_type!r}, TARGET_FORMAT={target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )

In [1]:
# Read the ingested results back from the location selected by
# (target_type, target_format):
#   file  + parquet -> <bronze_folder_path>/results_par
#   table + parquet -> <bronze_catalog>.<bronze_schema>.<bronze_table_par>
#   table + delta   -> <bronze_catalog>.<bronze_schema>.<bronze_table_dlt>
if target_type == 'file' and target_format == 'parquet':
    results_read_df = spark.read.parquet(f"{bronze_folder_path}/results_par")
elif target_type == 'table' and target_format == 'parquet':
    results_read_df = spark.read.table(f"{bronze_catalog}.{bronze_schema}.{bronze_table_par}")
elif target_type == 'table' and target_format == 'delta':
    results_read_df = spark.read.table(f"{bronze_catalog}.{bronze_schema}.{bronze_table_dlt}")
else:
    # Fail loudly: otherwise results_read_df would stay unbound (or keep a
    # stale value from an earlier run in the same kernel).
    raise ValueError(
        f"Unsupported target: TARGET_TYPE={target_type!r}, TARGET_FORMAT={target_format!r}. "
        "Supported: file/parquet, table/parquet, table/delta."
    )