In [0]:
dbutils.fs.mkdirs("dbfs:/FileStore/Formula1/qualifying")

Out[1]: True

In [0]:
#Step 1 - Read the JSON file using the spark dataframe reader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:

qualifying_schema = StructType(fields=[StructField("qualifyId", IntegerType(), False),
                                      StructField("raceId", IntegerType(), True),
                                      StructField("driverId", IntegerType(), True),
                                      StructField("constructorId", IntegerType(), True),
                                      StructField("number", IntegerType(), True),
                                      StructField("position", IntegerType(), True),
                                      StructField("q1", StringType(), True),
                                      StructField("q2", StringType(), True),
                                      StructField("q3", StringType(), True),
                                     ])

In [0]:
qualifying_df = spark.read.json("dbfs:/FileStore/Formula1/qualifying",schema =qualifying_schema , multiLine= True)

In [0]:
qualifying_df.show()

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
|        3|    18|       5|            1|    23|       3|1:25.664|1:25.452|1:27.079|
|        4|    18|      13|            6|     2|       4|1:25.994|1:25.691|1:27.178|
|        5|    18|       2|            2|     3|       5|1:25.960|1:25.518|1:27.236|
|        6|    18|      15|            7|    11|       6|1:26.427|1:26.101|1:28.527|
|        7|    18|       3|            3|     7|       7|1:26.295|1:26.059|1:28.687|
|        8|    18|      14|            9|     9|       8|1:26.381|1:26.063|1:29.041|
|        9|    18|      10|            7|    12|       9|1:26.919

In [0]:
#Step 2 - Rename columns and add new columns
#Rename qualifyingId, driverId, constructorId and raceId
#Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import lit , current_timestamp

In [0]:
final_df = qualifying_df.withColumnRenamed("qualifyId", "qualify_id") \
.withColumnRenamed("driverId", "driver_id") \
.withColumnRenamed("raceId", "race_id") \
.withColumnRenamed("constructorId", "constructor_id") \
.withColumn("ingestion_date", current_timestamp()) 

In [0]:
final_df.show(truncate=False)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|q1      |q2      |q3      |ingestion_date         |
+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|1         |18     |1        |1             |22    |1       |1:26.572|1:25.187|1:26.714|2024-04-23 04:00:32.158|
|2         |18     |9        |2             |4     |2       |1:26.103|1:25.315|1:26.869|2024-04-23 04:00:32.158|
|3         |18     |5        |1             |23    |3       |1:25.664|1:25.452|1:27.079|2024-04-23 04:00:32.158|
|4         |18     |13       |6             |2     |4       |1:25.994|1:25.691|1:27.178|2024-04-23 04:00:32.158|
|5         |18     |2        |2             |3     |5       |1:25.960|1:25.518|1:27.236|2024-04-23 04:00:32.158|
|6         |18     |15       |7             |11    |6       |1:26.427|1:26.101|1:28.527|2024-04-

In [0]:
#Step 3 - Write to output to processed container in parquet format

In [0]:
final_df.write.parquet("dbfs:/FileStore/Formula1/processed/f1_processed.qualifying") 

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3417363049632157>:1[0m
[0;32m----> 1[0m [43mfinal_df[49m[38;5;241;43m.[39;49m[43mwrite[49m[38;5;241;43m.[39;49m[43mparquet[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mdbfs:/FileStore/Formula1/processed/f1_processed.qualifying[39;49m[38;5;124;43m"[39;49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.

In [0]:
final_df.show(truncate=False)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|q1      |q2      |q3      |ingestion_date         |
+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|1         |18     |1        |1             |22    |1       |1:26.572|1:25.187|1:26.714|2024-04-23 04:01:10.743|
|2         |18     |9        |2             |4     |2       |1:26.103|1:25.315|1:26.869|2024-04-23 04:01:10.743|
|3         |18     |5        |1             |23    |3       |1:25.664|1:25.452|1:27.079|2024-04-23 04:01:10.743|
|4         |18     |13       |6             |2     |4       |1:25.994|1:25.691|1:27.178|2024-04-23 04:01:10.743|
|5         |18     |2        |2             |3     |5       |1:25.960|1:25.518|1:27.236|2024-04-23 04:01:10.743|
|6         |18     |15       |7             |11    |6       |1:26.427|1:26.101|1:28.527|2024-04-