In [0]:
#Step 1 - Read the JSON file using the Spark DataFrameReader API

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema for the pit stops JSON file, so Spark does not infer types.
# Each entry is (column name, Spark type, nullable); only raceId is declared
# non-nullable.
# NOTE(review): Spark may relax nullability for file-based sources; confirm
# before relying on a non-null guarantee for raceId.
pit_stops_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            ("raceId", IntegerType(), False),
            ("driverId", IntegerType(), True),
            ("stop", StringType(), True),
            ("lap", IntegerType(), True),
            ("time", StringType(), True),
            ("duration", StringType(), True),
            ("milliseconds", IntegerType(), True),
        )
    ]
)

In [0]:
# Load the pit stops file with the explicit schema; multiLine is enabled so
# that JSON records spanning several lines are parsed.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiLine", True)
    .json("dbfs:/FileStore/pit_stops.json")
)

In [0]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
pit_stops_df.show(n=20, truncate=True)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
|   841|      22|   1| 13|17:24:29|  23.643|       23643|
|   841|      20|   1| 14|17:25:17|  22.603|       22603|
|   841|     814|   1| 14|17:26:03|  24.863|       24863|
|   841|     816|   1| 14|17:26:50|  25.259|       25259|
|   841|      67|   1| 15|17:27:34|  25.342|       25342|
|   841|       2|   1| 15|17:27:41|  22.994|       22994|
|   841|       1|   1| 16|17:28:24|  23.227|       23227|
|   841|     808|   1| 16|17:28:39|  24.535|       24535|
|   841|       3|   1| 16|17:29:00|  23.716|       23716|
|   841|     1

In [0]:
#Step 2 - Rename columns and add new columns
#Rename driverId and raceId
#Add ingestion_date with current timestamp

In [0]:
from pyspark.sql.functions import lit , current_timestamp

In [0]:
# Append an ingestion_date column holding the current timestamp.
pit_stops_with_ingestion_date_df = pit_stops_df.withColumn(
    colName="ingestion_date",
    col=current_timestamp(),
)

In [0]:
# Rename the camelCase id columns to snake_case.
final_df = (
    pit_stops_with_ingestion_date_df
    .withColumnRenamed(existing="driverId", new="driver_id")
    .withColumnRenamed(existing="raceId", new="race_id")
)

In [0]:
# Preview the first 20 rows without truncating cell values, so the full
# ingestion_date timestamp is visible.
final_df.show(n=20, truncate=False)

+-------+---------+----+---+--------+--------+------------+-----------------------+
|race_id|driver_id|stop|lap|time    |duration|milliseconds|ingestion_date         |
+-------+---------+----+---+--------+--------+------------+-----------------------+
|841    |153      |1   |1  |17:05:23|26.898  |26898       |2024-04-22 18:17:35.691|
|841    |30       |1   |1  |17:05:52|25.021  |25021       |2024-04-22 18:17:35.691|
|841    |17       |1   |11 |17:20:48|23.426  |23426       |2024-04-22 18:17:35.691|
|841    |4        |1   |12 |17:22:34|23.251  |23251       |2024-04-22 18:17:35.691|
|841    |13       |1   |13 |17:24:10|23.842  |23842       |2024-04-22 18:17:35.691|
|841    |22       |1   |13 |17:24:29|23.643  |23643       |2024-04-22 18:17:35.691|
|841    |20       |1   |14 |17:25:17|22.603  |22603       |2024-04-22 18:17:35.691|
|841    |814      |1   |14 |17:26:03|24.863  |24863       |2024-04-22 18:17:35.691|
|841    |816      |1   |14 |17:26:50|25.259  |25259       |2024-04-22 18:17:

In [0]:
#Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Save the result as Parquet, replacing any existing data at the target path.
final_df.write.parquet(
    "dbfs:/FileStore/Formula1/processed/f1_processed.pit_stops",
    mode="overwrite",
)