### Ingest qualifying JSON files

In [0]:
# Expose "p_data_source" as a notebook text parameter (blank unless the caller
# supplies a value) and keep the supplied value for the cells below.
dbutils.widgets.text(name="p_data_source", defaultValue="")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Expose "p_file_date" as a notebook text parameter, defaulting to "2021-03-21".
# The value read back is used below both in the input path and as a column value.
dbutils.widgets.text(name="p_file_date", defaultValue="2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Run the shared configuration notebook inline so the names it defines are usable here.
# NOTE(review): presumably supplies raw_folder_path and processed_folder_path, which later cells use without defining — confirm.
%run "../includes/configuration"

In [0]:
# Run the shared common-functions notebook inline so its helpers are callable from this notebook.
# NOTE(review): presumably supplies add_ingestion_date and merge_delta_data, which later cells call without defining — confirm.
%run "../includes/common_functions"

##### Step 1 - Read the JSON files using the Spark DataFrame reader API

In [0]:
# Importing necessary classes from pyspark.sql.types to define schema for DataFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the qualifying records, built one column at a time.
# Only qualifyId is declared non-nullable; q1/q2/q3 are read as strings and
# every other column as an integer.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [0]:
# Load the qualifying JSON for the requested file date with the schema defined
# above. multiLine is enabled so that a record may span several lines.
qualifying_df = spark.read.json(
    f"{raw_folder_path}/{v_file_date}/qualifying",
    schema=qualifying_schema,
    multiLine=True,
)

##### Step 2 - Rename columns and add new columns
1. Rename qualifyId, driverId, constructorId and raceId to snake_case
1. Add ingestion_date with the current timestamp, plus data_source and file_date taken from the notebook widgets

In [0]:
# Pass the raw frame through the shared add_ingestion_date helper;
# DataFrame.transform calls the helper with qualifying_df and returns its result.
qualifying_with_ingestion_date_df = qualifying_df.transform(add_ingestion_date)

In [0]:
# Import the lit function from pyspark.sql.functions to add constant column values
from pyspark.sql.functions import lit

In [0]:
# current_timestamp is imported explicitly so this cell does not rely on the
# name leaking into scope from a %run'd notebook.
from pyspark.sql.functions import current_timestamp

# Rename the camelCase id columns to snake_case, then stamp every row with the
# ingestion time and with the data source and file date read from the widgets.
final_df = (
    qualifying_with_ingestion_date_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    # NOTE(review): add_ingestion_date already ran on the input frame; if it
    # writes "ingestion_date" too, this line just overwrites it — confirm
    # whether one of the two can be dropped.
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

##### Step 3 - Merge the output into the processed container as a Delta table

In [0]:
# Merge predicate: a target (tgt) row matches a source (src) row when both
# key columns, qualify_id and race_id, are equal.
merge_condition = " AND ".join(
    f"tgt.{key} = src.{key}" for key in ("qualify_id", "race_id")
)

# Hand the final frame to the shared merge helper.
# NOTE(review): the positional arguments look like database name, table name,
# base folder, merge predicate and partition column — confirm against the
# helper's definition.
merge_delta_data(
    final_df,
    "f1_processed",
    "qualifying",
    processed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
# End the notebook run here and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")