In [0]:
#Step 1 - Read the JSON file using the spark dataframe reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:

name_schema = StructType(fields=[StructField("forename", StringType(), True),
                                 StructField("surname", StringType(), True)
  
])

In [0]:

drivers_schema = StructType(fields=[StructField("driverId", IntegerType(), False),
                                    StructField("driverRef", StringType(), True),
                                    StructField("number", IntegerType(), True),
                                    StructField("code", StringType(), True),
                                    StructField("name", name_schema),
                                    StructField("dob", DateType(), True),
                                    StructField("nationality", StringType(), True),
                                    StructField("url", StringType(), True)  
])

In [0]:
drivers_df = spark.read.json("dbfs:/FileStore/drivers.json" , schema = drivers_schema)

In [0]:
#Step 2 - Rename columns and add new columns
#driverId renamed to driver_id
#driverRef renamed to driver_ref
#ingestion date added
#name added with concatenation of forename and surname

In [0]:
from pyspark.sql.functions import col, concat, lit , current_timestamp

In [0]:
drivers_with_ingestion_date_df = drivers_df.withColumn("ingestion_date",current_timestamp())

In [0]:
drivers_with_columns_df = drivers_with_ingestion_date_df.withColumnRenamed("driverId", "driver_id") \
                                    .withColumnRenamed("driverRef", "driver_ref") \
                                    .withColumn("name", concat(col("name.forename"), lit(" "), col("name.surname"))) 

In [0]:
#Step 3 - Drop the unwanted columns
#name.forename
#name.surname
#url

In [0]:
drivers_final_df = drivers_with_columns_df.drop(col("url"))

In [0]:
#Step 4 - Write to output to processed container in parquet format

In [0]:
drivers_final_df.write.parquet("dbfs:/FileStore/Formula1/processed/f1_processed.drivers") 

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2747915330769742>:1[0m
[0;32m----> 1[0m [43mdrivers_final_df[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mparquet[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mdbfs:/FileStore/Formula1/processed/f1_processed.drivers[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;

In [0]:
drivers_final_df.show(truncate=False)

+---------+----------+------+----+------------------+----------+-----------+-----------------------+
|driver_id|driver_ref|number|code|name              |dob       |nationality|ingestion_date         |
+---------+----------+------+----+------------------+----------+-----------+-----------------------+
|1        |hamilton  |44    |HAM |Lewis Hamilton    |1985-01-07|British    |2024-04-22 18:07:10.948|
|2        |heidfeld  |null  |HEI |Nick Heidfeld     |1977-05-10|German     |2024-04-22 18:07:10.948|
|3        |rosberg   |6     |ROS |Nico Rosberg      |1985-06-27|German     |2024-04-22 18:07:10.948|
|4        |alonso    |14    |ALO |Fernando Alonso   |1981-07-29|Spanish    |2024-04-22 18:07:10.948|
|5        |kovalainen|null  |KOV |Heikki Kovalainen |1981-10-19|Finnish    |2024-04-22 18:07:10.948|
|6        |nakajima  |null  |NAK |Kazuki Nakajima   |1985-01-11|Japanese   |2024-04-22 18:07:10.948|
|7        |bourdais  |null  |BOU |Sébastien Bourdais|1979-02-28|French     |2024-04-22 18:0