## Ingest circuits.csv file
Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
# Exploratory only: prints the help text for the dbutils.widgets API.
# No later cell uses anything from this call.
dbutils.widgets.help()

In [0]:
# Declare the "p_file_date" text widget (default "2021-03-21") and read its value.
# v_file_date picks the dated sub-folder of the raw data that is read below and is
# also stamped onto every row as the file_date column.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its value.
# v_data_source is written to every row as the data_source column further down.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Echo the widget value that was picked up (empty when no value is supplied).
v_data_source

Out[70]: ''

In [0]:
%run "../includes/configuration"


In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, StructField

In [0]:
# Explicit schema for circuits.csv: one entry per CSV column, in file order,
# given as (column name, Spark type, nullable).
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Read circuits.csv from the raw folder of the requested file date, treating the
# first line as a header and applying the explicit schema instead of inferring types.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option("header", True)
    .csv(f"{raw_folder_path}/{v_file_date}/circuits.csv")
)

In [0]:
# Preview the rows as read from circuits.csv (all nine source columns, including url).
display(circuits_df)

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,http://en.wikipedia.org/wiki/Hockenheimring


In [0]:
# Print the schema of the loaded DataFrame. In the recorded output circuitId is
# reported as nullable = true even though circuits_schema declares it non-nullable,
# so the declaration alone does not guarantee non-null ids.
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



### Step 2 - Select only the required columns

In [0]:
# Variant 1: select by column-name strings. Every source column except "url" is kept.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# Variant 2: the same projection using attribute-style column references.
circuits_selected_df = circuits_df.select(
    circuits_df.circuitId,
    circuits_df.circuitRef,
    circuits_df.name,
    circuits_df.location,
    circuits_df.country,
    circuits_df.lat,
    circuits_df.lng,
    circuits_df.alt,
)

In [0]:
# Variant 3: the same projection using bracket-style column references.
circuits_selected_df = circuits_df.select(
    circuits_df["circuitId"],
    circuits_df["circuitRef"],
    circuits_df["name"],
    circuits_df["location"],
    circuits_df["country"],
    circuits_df["lat"],
    circuits_df["lng"],
    circuits_df["alt"],
)

In [0]:
from pyspark.sql.functions import col

# Variant 4: the same projection using col() expressions. This is the assignment
# that is in effect for the following steps, since it runs last.
circuits_selected_df = circuits_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

In [0]:
# Preview the projected DataFrame (the url column is no longer present).
display(circuits_selected_df)

circuitId,circuitRef,name,location,country,lat,lng,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103


### Step 3 - Rename the columns as required and add the data source and file date

In [0]:
from pyspark.sql.functions import col, lit

In [0]:
# Convert the camelCase / abbreviated source column names to snake_case names, then
# tag every row with the widget-supplied data source and file date as literal columns.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Preview the renamed columns together with the new data_source and file_date columns.
display(circuits_renamed_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,,2021-03-21
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,,2021-03-21
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,,2021-03-21
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,,2021-03-21
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,,2021-03-21
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,,2021-03-21
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,,2021-03-21
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,,2021-03-21
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,,2021-03-21
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,,2021-03-21


### Step 4 - Add ingestion date to the dataframe

In [0]:
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import lit

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it is provided by
# ../includes/common_functions (pulled in via %run earlier) — verify there.
# The recorded output shows the result carries an extra ingestion_date column.
circuits_final_df = add_ingestion_date(circuits_renamed_df) 


In [0]:
# Preview the final DataFrame that will be written out.
display(circuits_final_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,,2021-03-21,2025-04-18T17:32:22.824+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,,2021-03-21,2025-04-18T17:32:22.824+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,,2021-03-21,2025-04-18T17:32:22.824+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,,2021-03-21,2025-04-18T17:32:22.824+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,,2021-03-21,2025-04-18T17:32:22.824+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,,2021-03-21,2025-04-18T17:32:22.824+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,,2021-03-21,2025-04-18T17:32:22.824+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,,2021-03-21,2025-04-18T17:32:22.824+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,,2021-03-21,2025-04-18T17:32:22.824+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,,2021-03-21,2025-04-18T17:32:22.824+0000


### Check the datalake mount points before writing

In [0]:
# List the current DBFS mount points (raw, processed, presentation, demo, ...).
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl2025practice/presentation,abfss://presentation@formula1dl2025practice.dfs.core.windows.net/,
/mnt/formula1dl2025practice/processed,abfss://processed@formula1dl2025practice.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/formula1dl2025practice/raw,abfss://raw@formula1dl2025practice.dfs.core.windows.net/,
/mnt/formula1dl2025practice/demo,abfss://demo@formula1dl2025practice.dfs.core.windows.net/,
/mnt/formula1/demo,abfss://demo@formula1dl2025practice.dfs.core.windows.net/,


##### Step 5 - Write data to the datalake as a Delta table

In [0]:
# Persist the result as the table f1_processed.circuits in Delta format, replacing
# any existing contents. An earlier revision of this cell wrote the same table with
# format("parquet").
(
    circuits_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("f1_processed.circuits")
)



In [0]:
%sql
-- Query the table by name to confirm the rows written by the previous cell.
SELECT * FROM f1_processed.circuits

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,,2021-03-21,2025-04-18T17:32:23.971+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,,2021-03-21,2025-04-18T17:32:23.971+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,,2021-03-21,2025-04-18T17:32:23.971+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,,2021-03-21,2025-04-18T17:32:23.971+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,,2021-03-21,2025-04-18T17:32:23.971+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,,2021-03-21,2025-04-18T17:32:23.971+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,,2021-03-21,2025-04-18T17:32:23.971+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,,2021-03-21,2025-04-18T17:32:23.971+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,,2021-03-21,2025-04-18T17:32:23.971+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,,2021-03-21,2025-04-18T17:32:23.971+0000


In [0]:
%fs
ls /mnt/formula1dl2025practice/processed/circuits

path,name,size,modificationTime
dbfs:/mnt/formula1dl2025practice/processed/circuits/_delta_log/,_delta_log/,0,1744151678000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-074ba5dd-227e-48bd-a24d-be67aeb0c95f-c000.snappy.parquet,part-00000-074ba5dd-227e-48bd-a24d-be67aeb0c95f-c000.snappy.parquet,8441,1744997544000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-0d7013d3-d03a-4973-96b3-3a4b02e9eaec-c000.snappy.parquet,part-00000-0d7013d3-d03a-4973-96b3-3a4b02e9eaec-c000.snappy.parquet,8484,1744382735000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-31a539aa-b5fe-43c0-9dcd-076a1f82be47-c000.snappy.parquet,part-00000-31a539aa-b5fe-43c0-9dcd-076a1f82be47-c000.snappy.parquet,8441,1744997412000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-49db253a-4f7f-40d7-b92f-63aa3e94c52e-c000.snappy.parquet,part-00000-49db253a-4f7f-40d7-b92f-63aa3e94c52e-c000.snappy.parquet,8484,1744394599000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-5c9a6beb-b347-44f7-b44e-71d97de7ef6e-c000.snappy.parquet,part-00000-5c9a6beb-b347-44f7-b44e-71d97de7ef6e-c000.snappy.parquet,8484,1744383767000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-7de4f529-8eb7-4028-be72-478a711011a9-c000.snappy.parquet,part-00000-7de4f529-8eb7-4028-be72-478a711011a9-c000.snappy.parquet,8441,1744151682000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-a87d257c-3297-4e4d-81d6-454b5708853f-c000.snappy.parquet,part-00000-a87d257c-3297-4e4d-81d6-454b5708853f-c000.snappy.parquet,8484,1744385523000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-c6c57890-5d89-4cfb-a29e-697b55782410-c000.snappy.parquet,part-00000-c6c57890-5d89-4cfb-a29e-697b55782410-c000.snappy.parquet,8484,1744384880000
dbfs:/mnt/formula1dl2025practice/processed/circuits/part-00000-ced812f4-5084-4d08-b5ed-17fa3cd632c0-c000.snappy.parquet,part-00000-ced812f4-5084-4d08-b5ed-17fa3cd632c0-c000.snappy.parquet,8484,1744391864000


In [0]:
#df = spark.read.delta("/mnt/formula1dl2025practice/processed/circuits")

In [0]:
# Read the Delta data back by storage path rather than by table name.
# NOTE(review): this assumes f1_processed.circuits is stored under
# {processed_folder_path}/circuits — confirm against the database location.
df = spark.read.format("delta").load(f"{processed_folder_path}/circuits")

In [0]:
# Show the rows loaded by path, for comparison with the table query result.
display(df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,ingestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,,2021-03-21,2025-04-18T17:32:23.971+0000
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,,2021-03-21,2025-04-18T17:32:23.971+0000
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,,2021-03-21,2025-04-18T17:32:23.971+0000
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,,2021-03-21,2025-04-18T17:32:23.971+0000
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,,2021-03-21,2025-04-18T17:32:23.971+0000
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,,2021-03-21,2025-04-18T17:32:23.971+0000
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,,2021-03-21,2025-04-18T17:32:23.971+0000
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,,2021-03-21,2025-04-18T17:32:23.971+0000
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,,2021-03-21,2025-04-18T17:32:23.971+0000
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,,2021-03-21,2025-04-18T17:32:23.971+0000


In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a caller that launched this notebook can check the outcome.
dbutils.notebook.exit("Success")