### Ingest races.csv file

##### Read the CSV file using the Spark DataFrame reader

In [0]:
# Declare a text widget named "p_data_source" (default: empty string) and read
# its current value into v_data_source.
# NOTE(review): v_data_source is not referenced by any other cell visible in
# this notebook — confirm whether it should be used or the widget removed.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema for races.csv, applied at read time instead of inferring types.
# Every column is declared nullable; date and time are read as plain strings.
races_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("raceId", IntegerType()),
        ("year", IntegerType()),
        ("round", IntegerType()),
        ("circuitId", IntegerType()),
        ("name", StringType()),
        ("date", StringType()),
        ("time", StringType()),
        ("url", StringType()),
    )
])
                                  

In [0]:
# Load the raw races file: the first row is treated as a header and
# races_schema supplies the column types.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv("/mnt/f1datalake2025/raw/races.csv")
)


In [0]:
# Preview the rows read from races.csv.
display(races_df)

raceId,year,round,circuitId,name,date,time,url
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_Grand_Prix
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Grand_Prix
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Grand_Prix
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Grand_Prix
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Grand_Prix
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00,http://en.wikipedia.org/wiki/2009_Monaco_Grand_Prix
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00,http://en.wikipedia.org/wiki/2009_Turkish_Grand_Prix
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00,http://en.wikipedia.org/wiki/2009_British_Grand_Prix
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00,http://en.wikipedia.org/wiki/2009_German_Grand_Prix
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00,http://en.wikipedia.org/wiki/2009_Hungarian_Grand_Prix


In [0]:
# Print the column names, types and nullability to confirm races_schema was applied.
races_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# Show per-column summary statistics (count, mean, stddev, min, ...) as a quick sanity check.
races_df.describe().show()


+-------+------------------+-----------------+------------------+------------------+--------------------+----------+--------+--------------------+
|summary|            raceId|             year|             round|         circuitId|                name|      date|    time|                 url|
+-------+------------------+-----------------+------------------+------------------+--------------------+----------+--------+--------------------+
|  count|              1058|             1058|              1058|              1058|                1058|      1058|    1058|                1058|
|   mean| 531.2315689981097|1990.780718336484| 8.382797731568997|22.089792060491494|                NULL|      NULL|    NULL|                NULL|
| stddev|308.16570918807656|19.73008802240494|5.0002806845260235|17.154605278616593|                NULL|      NULL|    NULL|                NULL|
|    min|                 1|             1950|                 1|                 1|70th Anniversary ...|1950-05-13|03

##### Add ingestion date and race timestamp


In [0]:
from pyspark.sql.functions import current_timestamp, to_timestamp, concat, lit, col

In [0]:
# Join the date and time strings with a single space so they can be parsed
# together as one "yyyy-MM-dd HH:mm:ss" value.
race_datetime_text = concat(col("date"), lit(" "), col("time"))

# Add the parsed race timestamp, then stamp each row with the current timestamp.
races_wdatetime_df = (
    races_df
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"))
    .withColumn("ingestion_date", current_timestamp())
)
display(races_wdatetime_df)


raceId,year,round,circuitId,name,date,time,url,race_timestamp,ingestion_date
1,2009,1,1,Australian Grand Prix,2009-03-29,06:00:00,http://en.wikipedia.org/wiki/2009_Australian_Grand_Prix,2009-03-29T06:00:00Z,2025-07-30T00:03:08.698Z
2,2009,2,2,Malaysian Grand Prix,2009-04-05,09:00:00,http://en.wikipedia.org/wiki/2009_Malaysian_Grand_Prix,2009-04-05T09:00:00Z,2025-07-30T00:03:08.698Z
3,2009,3,17,Chinese Grand Prix,2009-04-19,07:00:00,http://en.wikipedia.org/wiki/2009_Chinese_Grand_Prix,2009-04-19T07:00:00Z,2025-07-30T00:03:08.698Z
4,2009,4,3,Bahrain Grand Prix,2009-04-26,12:00:00,http://en.wikipedia.org/wiki/2009_Bahrain_Grand_Prix,2009-04-26T12:00:00Z,2025-07-30T00:03:08.698Z
5,2009,5,4,Spanish Grand Prix,2009-05-10,12:00:00,http://en.wikipedia.org/wiki/2009_Spanish_Grand_Prix,2009-05-10T12:00:00Z,2025-07-30T00:03:08.698Z
6,2009,6,6,Monaco Grand Prix,2009-05-24,12:00:00,http://en.wikipedia.org/wiki/2009_Monaco_Grand_Prix,2009-05-24T12:00:00Z,2025-07-30T00:03:08.698Z
7,2009,7,5,Turkish Grand Prix,2009-06-07,12:00:00,http://en.wikipedia.org/wiki/2009_Turkish_Grand_Prix,2009-06-07T12:00:00Z,2025-07-30T00:03:08.698Z
8,2009,8,9,British Grand Prix,2009-06-21,12:00:00,http://en.wikipedia.org/wiki/2009_British_Grand_Prix,2009-06-21T12:00:00Z,2025-07-30T00:03:08.698Z
9,2009,9,20,German Grand Prix,2009-07-12,12:00:00,http://en.wikipedia.org/wiki/2009_German_Grand_Prix,2009-07-12T12:00:00Z,2025-07-30T00:03:08.698Z
10,2009,10,11,Hungarian Grand Prix,2009-07-26,12:00:00,http://en.wikipedia.org/wiki/2009_Hungarian_Grand_Prix,2009-07-26T12:00:00Z,2025-07-30T00:03:08.698Z


##### Select the required columns and rename them


In [0]:

from pyspark.sql.functions import col

# Project the output columns: date, time and url are dropped, and the
# camelCase id columns plus year are renamed to snake_case.
races_selected_df = races_wdatetime_df.select(
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("race_timestamp"),
    col("ingestion_date"),
)
display(races_selected_df)


race_id,race_year,round,circuit_id,name,race_timestamp,ingestion_date
1,2009,1,1,Australian Grand Prix,2009-03-29T06:00:00Z,2025-07-30T00:03:12.5Z
2,2009,2,2,Malaysian Grand Prix,2009-04-05T09:00:00Z,2025-07-30T00:03:12.5Z
3,2009,3,17,Chinese Grand Prix,2009-04-19T07:00:00Z,2025-07-30T00:03:12.5Z
4,2009,4,3,Bahrain Grand Prix,2009-04-26T12:00:00Z,2025-07-30T00:03:12.5Z
5,2009,5,4,Spanish Grand Prix,2009-05-10T12:00:00Z,2025-07-30T00:03:12.5Z
6,2009,6,6,Monaco Grand Prix,2009-05-24T12:00:00Z,2025-07-30T00:03:12.5Z
7,2009,7,5,Turkish Grand Prix,2009-06-07T12:00:00Z,2025-07-30T00:03:12.5Z
8,2009,8,9,British Grand Prix,2009-06-21T12:00:00Z,2025-07-30T00:03:12.5Z
9,2009,9,20,German Grand Prix,2009-07-12T12:00:00Z,2025-07-30T00:03:12.5Z
10,2009,10,11,Hungarian Grand Prix,2009-07-26T12:00:00Z,2025-07-30T00:03:12.5Z


##### Write the data as Parquet to the cleaned-and-processed container

In [0]:
# Write the selected columns as Parquet, replacing anything already at this path.
# NOTE(review): a later cell overwrites this same path with a write partitioned
# by race_year, so this unpartitioned output does not survive a full run.
(
    races_selected_df.write
    .mode("overwrite")
    .parquet("dbfs:/mnt/f1datalake2025/cleaned-and-processed/races")
)

In [0]:
# Read the Parquet output back from storage and display it to verify the write.
display(spark.read.parquet("/mnt/f1datalake2025/cleaned-and-processed/races"))

race_id,race_year,round,circuit_id,name,race_timestamp,ingestion_date
1,2009,1,1,Australian Grand Prix,2009-03-29T06:00:00Z,2025-07-30T00:03:15.175Z
2,2009,2,2,Malaysian Grand Prix,2009-04-05T09:00:00Z,2025-07-30T00:03:15.175Z
3,2009,3,17,Chinese Grand Prix,2009-04-19T07:00:00Z,2025-07-30T00:03:15.175Z
4,2009,4,3,Bahrain Grand Prix,2009-04-26T12:00:00Z,2025-07-30T00:03:15.175Z
5,2009,5,4,Spanish Grand Prix,2009-05-10T12:00:00Z,2025-07-30T00:03:15.175Z
6,2009,6,6,Monaco Grand Prix,2009-05-24T12:00:00Z,2025-07-30T00:03:15.175Z
7,2009,7,5,Turkish Grand Prix,2009-06-07T12:00:00Z,2025-07-30T00:03:15.175Z
8,2009,8,9,British Grand Prix,2009-06-21T12:00:00Z,2025-07-30T00:03:15.175Z
9,2009,9,20,German Grand Prix,2009-07-12T12:00:00Z,2025-07-30T00:03:15.175Z
10,2009,10,11,Hungarian Grand Prix,2009-07-26T12:00:00Z,2025-07-30T00:03:15.175Z


In [0]:
# Write the output again, this time partitioned by race_year; overwrite mode
# replaces whatever is already stored at this path.
(
    races_selected_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet("dbfs:/mnt/f1datalake2025/cleaned-and-processed/races")
)

In [0]:
# End the notebook run, passing "Success" as the exit value.
dbutils.notebook.exit("Success")