#### Ingest drivers file

In [0]:
# Notebook parameter: register a text widget named "p_data_source" with an
# empty default, then read its current value into v_data_source.
# NOTE(review): v_data_source is not referenced by any other cell visible in
# this notebook — confirm whether it should be written to the output
# (e.g. as a column) or whether the widget can be removed.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# Schema of the nested "name" object in each driver record:
# two nullable string fields, forename and surname.
name_schema = StructType(
    [
        StructField("forename", StringType(), nullable=True),
        StructField("surname", StringType(), nullable=True),
    ]
)

In [0]:
# Explicit schema for the drivers JSON file, applied at read time instead of
# relying on schema inference. "name" is a nested struct described by
# name_schema.
#
# The fields are passed as a list (matching name_schema and the documented
# StructType signature) rather than a tuple: a tuple-backed StructType cannot
# be extended with .add(), and is expected to compare unequal to an otherwise
# identical schema built from a list.
driver_schema = StructType(fields=[
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), True),
    StructField("number", IntegerType(), True),
    StructField("code", StringType(), True),
    # nullable spelled out for consistency with the other fields
    # (True is also the StructField default).
    StructField("name", name_schema, True),
    StructField("dob", DateType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True),
])

In [0]:
# Load the raw drivers JSON from the mounted data lake, applying the
# explicit driver_schema rather than letting Spark infer one.
driver_df = (
    spark.read
    .schema(driver_schema)
    .json("/mnt/f1datalake2025/raw/drivers.json")
)

##### Rename the columns and add the ingestion date

In [0]:
from pyspark.sql.functions import current_timestamp, concat, lit

In [0]:
# Build the cleaned frame from driver_df:
#   * driverId  -> driver_id
#   * driverRef -> driver_ref
#   * add ingestion_date holding the current timestamp
#   * replace the nested name struct with a single string column
#     "<forename> <surname>"
driver_renamed_df = (
    driver_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("name", concat("name.forename", lit(" "), "name.surname"))
)

In [0]:
# Preview the renamed/derived columns before dropping "url" and writing out.
display(driver_renamed_df)

driver_id,driver_ref,number,code,name,dob,nationality,url,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton,2025-07-26T00:02:39.772Z
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld,2025-07-26T00:02:39.772Z
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg,2025-07-26T00:02:39.772Z
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso,2025-07-26T00:02:39.772Z
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen,2025-07-26T00:02:39.772Z
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,http://en.wikipedia.org/wiki/Kazuki_Nakajima,2025-07-26T00:02:39.772Z
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais,2025-07-26T00:02:39.772Z
8,raikkonen,7.0,RAI,Kimi Räikkönen,1979-10-17,Finnish,http://en.wikipedia.org/wiki/Kimi_R%C3%A4ikk%C3%B6nen,2025-07-26T00:02:39.772Z
9,kubica,88.0,KUB,Robert Kubica,1984-12-07,Polish,http://en.wikipedia.org/wiki/Robert_Kubica,2025-07-26T00:02:39.772Z
10,glock,,GLO,Timo Glock,1982-03-18,German,http://en.wikipedia.org/wiki/Timo_Glock,2025-07-26T00:02:39.772Z


In [0]:
# Final output frame: everything from driver_renamed_df except the "url" column.
driver_final_df = driver_renamed_df.drop("url")

In [0]:
# Write the final frame as Parquet to the cleaned-and-processed area;
# "overwrite" mode replaces whatever is already at the target path.
(
    driver_final_df.write
    .mode("overwrite")
    .parquet("/mnt/f1datalake2025/cleaned-and-processed/drivers")
)

In [0]:
# End the notebook run, passing "Success" as the exit value.
dbutils.notebook.exit("Success")