In [0]:
# Expose "p_file_date" as a notebook text widget; "2021-03-21" is used when no value is supplied
dbutils.widgets.text(name="p_file_date", defaultValue="2021-03-21")

# Read the widget's current value; later cells use it to pick which file_date to process
v_file_date = dbutils.widgets.get("p_file_date")

##### Read all the data as required

In [0]:
# Running the configuration notebook to load necessary configurations and settings
%run "../includes/configuration"

In [0]:
# Run the common functions notebook to make its functions available in the current notebook
%run "../includes/common_functions"

In [0]:
# Load the drivers data from the Delta table and give its generic columns a 'driver_' prefix
# ('number', 'name' and 'nationality' become 'driver_number', 'driver_name', 'driver_nationality')
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/drivers")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# Read the processed constructors Delta table; its 'name' column is exposed as 'team'
constructors_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

In [0]:
# Read the processed circuits Delta table; its 'location' column is exposed as 'circuit_location'
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Read the processed races Delta table, exposing 'name' as 'race_name'
# and 'race_timestamp' as 'race_date'
races_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Load the results data from the Delta table.
results_processed_df = spark.read.format("delta").load(f"{processed_folder_path}/results")

# Keep only the rows for the requested file date. The predicate is a Column comparison rather
# than an interpolated SQL string, so the widget value is always treated as a plain literal
# (a quote character in the parameter can no longer break or alter the filter).
# The renames feed later cells: the join compares 'result_race_id' with the races' 'race_id',
# and the final select reads 'race_time' and 'result_file_date'.
results_df = (
    results_processed_df
    .filter(results_processed_df["file_date"] == v_file_date)
    .withColumnRenamed("time", "race_time")
    .withColumnRenamed("race_id", "result_race_id")
    .withColumnRenamed("file_date", "result_file_date")
)

##### Join circuits to races

In [0]:
# Inner-join races to circuits on circuit_id, keeping only the race attributes
# and the circuit location needed downstream
race_circuits_df = (
    races_df
    .join(circuits_df, on=races_df.circuit_id == circuits_df.circuit_id, how="inner")
    .select(
        races_df.race_id,
        races_df.race_year,
        races_df.race_name,
        races_df.race_date,
        circuits_df.circuit_location,
    )
)

##### Join results to all other dataframes

In [0]:
# Attach race/circuit, driver and constructor details to every result row.
# All three joins are inner joins (the default, spelled out here):
#   results -> race_circuits on result_race_id = race_id
#   results -> drivers       on driver_id
#   results -> constructors  on constructor_id
race_results_df = (
    results_df
    .join(race_circuits_df, on=results_df.result_race_id == race_circuits_df.race_id, how="inner")
    .join(drivers_df, on=results_df.driver_id == drivers_df.driver_id, how="inner")
    .join(constructors_df, on=results_df.constructor_id == constructors_df.constructor_id, how="inner")
)

In [0]:
# Import the current_timestamp function from pyspark.sql.functions to add the current timestamp to DataFrame columns
from pyspark.sql.functions import current_timestamp

In [0]:
# Project the presentation columns out of the joined data, stamp each row with the
# time of this run ('created_date'), and expose 'result_file_date' as 'file_date'
final_df = (
    race_results_df
    .select([
        "race_id",
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
        "result_file_date",
    ])
    .withColumn("created_date", current_timestamp())
    .withColumnRenamed("result_file_date", "file_date")
)

In [0]:
# Match condition for the merge: a row is the same record when driver_name and race_id
# agree on both sides ('tgt' / 'src' are presumably the aliases used inside the helper — confirm)
merge_condition = "tgt.driver_name = src.driver_name AND tgt.race_id = src.race_id"

# Pass the final DataFrame to merge_delta_data together with the database name, table name,
# presentation folder path, merge condition and 'race_id'.
# NOTE(review): merge_delta_data is not defined in this notebook (presumably it comes from
# ../includes/common_functions); confirm what the last argument is used for.
merge_delta_data(
    final_df,
    "f1_presentation",
    "race_results",
    presentation_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
%sql
-- Display every column of f1_presentation.race_results (the table merged into above)
SELECT *
FROM f1_presentation.race_results;