### Ingest circuits.csv file

In [0]:
# Declare the "p_data_source" text widget (a notebook parameter) with an empty-string default
dbutils.widgets.text("p_data_source", "")

# Read the widget's current value; it is written into the data_source column further down
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the "p_file_date" text widget (a notebook parameter); it defaults to "2021-03-21"
dbutils.widgets.text("p_file_date", "2021-03-21")

# Read the widget's current value; it selects the raw sub-folder to read from and is
# also written into the file_date column further down
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Run the configuration notebook inline (raw_folder_path, used when reading the CSV, is presumably defined there — confirm)
# NOTE(review): Databricks documents that %run must be in a cell by itself — verify these comment lines do not interfere
%run "../includes/configuration"

In [0]:
# Run the common functions notebook inline (add_ingestion_date, called later, is presumably defined there — confirm)
# NOTE(review): Databricks documents that %run must be in a cell by itself — verify these comment lines do not interfere
%run "../includes/common_functions"

##### Step 1 - Read the CSV file using the spark dataframe reader

In [0]:
# Importing necessary classes from pyspark.sql.types to define schema for DataFrames
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema handed to the CSV reader for circuits.csv.
# Each StructField is (column name, Spark data type, nullable flag).
circuits_schema = StructType(
    fields=[
        StructField("circuitId", IntegerType(), False),
        StructField("circuitRef", StringType(), True),
        StructField("name", StringType(), True),
        StructField("location", StringType(), True),
        StructField("country", StringType(), True),
        StructField("lat", DoubleType(), True),
        StructField("lng", DoubleType(), True),
        StructField("alt", IntegerType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Load circuits.csv from the raw folder for the requested file date.
# The first row is treated as a header and the explicit schema is applied.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/circuits.csv")
)

##### Step 2 - Select only the required columns

In [0]:
# Import the col function from pyspark.sql.functions to select and manipulate DataFrame columns
from pyspark.sql.functions import col

In [0]:
# Project the columns to keep, in order; the "url" column from the source
# schema is not selected.
circuits_selected_df = circuits_df.select(
    *[
        col(column_name)
        for column_name in (
            "circuitId",
            "circuitRef",
            "name",
            "location",
            "country",
            "lat",
            "lng",
            "alt",
        )
    ]
)

##### Step 3 - Rename the columns as required and add the data source and file date columns

In [0]:
# Import the lit function from pyspark.sql.functions to add constant values as columns in a DataFrame
from pyspark.sql.functions import lit

In [0]:
# Switch the camelCase / abbreviated source names to snake_case, then tag
# every row with the data source and file date taken from the notebook widgets.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

##### Step 4 - Add ingestion date to the dataframe

In [0]:
# Pass the renamed DataFrame through add_ingestion_date and keep the result as the final frame.
# NOTE(review): add_ingestion_date is not defined in this notebook — presumably it comes from
# ../includes/common_functions and appends an ingestion timestamp column; confirm.
circuits_final_df = add_ingestion_date(circuits_renamed_df)

##### Step 5 - Write data to the datalake as a Delta table

In [0]:
# Persist the final DataFrame as the Delta table f1_processed.circuits,
# replacing any existing contents (overwrite mode).
(
    circuits_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("f1_processed.circuits")
)

In [0]:
%sql
-- Query every column and row of f1_processed.circuits to inspect the table contents
SELECT * FROM f1_processed.circuits;

In [0]:
# End the notebook run here and hand back the string "Success" as its exit value
# (e.g. for a caller that launched this notebook with dbutils.notebook.run)
dbutils.notebook.exit("Success")