### Ingest lap_times folder

In [0]:
# Declare the "p_data_source" text widget (a notebook parameter); its default is an empty string.
dbutils.widgets.text("p_data_source", "")

# Read the widget's current value; it is written to the data_source column further down.
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the "p_file_date" text widget (a notebook parameter); its default is "2021-03-21".
dbutils.widgets.text("p_file_date", "2021-03-21")

# Read the widget's current value; it selects the raw sub-folder to read and is
# also written to the file_date column further down.
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Run the shared configuration notebook inline. Later cells use raw_folder_path and
# processed_folder_path, which are presumably defined there — TODO confirm.
%run "../includes/configuration"

In [0]:
# Run the shared helper notebook inline. Later cells call add_ingestion_date and
# merge_delta_data, which are presumably defined there — TODO confirm.
%run "../includes/common_functions"

##### Step 1 - Read the CSV files in the lap_times folder using the Spark DataFrame reader API

In [0]:
# Importing necessary classes for defining schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the lap_times CSV data: one StructField per column, in file order.
# Only raceId is declared non-nullable; every other column allows nulls.
lap_times_fields = [
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), True),
    StructField("lap", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
]
lap_times_schema = StructType(fields=lap_times_fields)

In [0]:
# Load every CSV under the lap_times folder for the requested file date,
# applying the explicit schema rather than letting Spark infer one.
lap_times_df = spark.read.csv(
    f"{raw_folder_path}/{v_file_date}/lap_times",
    schema=lap_times_schema,
)

##### Step 2 - Rename columns and add new columns
1. Rename driverId and raceId to driver_id and race_id
2. Add ingestion_date with the current timestamp
3. Add data_source and file_date from the notebook parameters

In [0]:
# Pass the raw DataFrame through the shared add_ingestion_date helper (defined outside
# this notebook); presumably it appends an ingestion_date column — TODO confirm.
lap_times_with_ingestion_date_df = add_ingestion_date(lap_times_df)

In [0]:
from pyspark.sql.functions import lit, current_timestamp

# Build the final DataFrame:
#   * rename the camelCase key columns to snake_case
#   * stamp ingestion_date, data_source and file_date audit columns
final_df = (
    lap_times_with_ingestion_date_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    # NOTE(review): add_ingestion_date was already applied in the previous cell, so this
    # looks like a redundant overwrite of ingestion_date — confirm against the helper.
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

##### Step 3 - Write the output to the processed container by merging into a Delta table

In [0]:
# A target row matches a source row when race, driver and lap all agree.
merge_condition = "tgt.race_id = src.race_id AND tgt.driver_id = src.driver_id AND tgt.lap = src.lap"

# Hand the DataFrame to the shared merge_delta_data helper (defined outside this notebook).
merge_delta_data(
    final_df,
    'f1_processed',         # presumably the target database name — TODO confirm
    'lap_times',            # presumably the target table name — TODO confirm
    processed_folder_path,
    merge_condition,
    'race_id',              # presumably the partition column — TODO confirm
)

In [0]:
# End the notebook run and return the string "Success" as its exit value.
dbutils.notebook.exit("Success")