### Ingest pit_stops.json file

In [0]:
# Declare a text widget for the data source (empty string as its default)
# and read back the value currently set on it.
_data_source_widget = 'p_data_source'
dbutils.widgets.text(_data_source_widget, '')
v_data_source = dbutils.widgets.get(_data_source_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the JSON file using the Spark DataFrameReader API

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [0]:
# Explicit schema applied when reading pit_stops.json.
# Only raceId is declared non-nullable; every other field keeps the default
# nullable=True.
# NOTE(review): 'stop' is typed as a string although the sample output shows
# integer-like values - confirm this is intentional before changing it.
pit_stops_schema = StructType([
    StructField('raceId', IntegerType(), nullable=False),
    StructField('driverId', IntegerType()),
    StructField('stop', StringType()),
    StructField('lap', IntegerType()),
    StructField('time', StringType()),
    StructField('duration', StringType()),
    StructField('milliseconds', IntegerType()),
])

In [0]:
# Load pit_stops.json from under raw_folder_path with the multiline option
# enabled, using the explicit schema instead of schema inference.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option('multiline', True)
    .json(f"{raw_folder_path}/pit_stops.json")
)

In [0]:
# Preview the first 10 rows of the raw pit stops data.
pit_stops_df.show(n=10)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
|   841|      22|   1| 13|17:24:29|  23.643|       23643|
|   841|      20|   1| 14|17:25:17|  22.603|       22603|
|   841|     814|   1| 14|17:26:03|  24.863|       24863|
|   841|     816|   1| 14|17:26:50|  25.259|       25259|
|   841|      67|   1| 15|17:27:34|  25.342|       25342|
+------+--------+----+---+--------+--------+------------+
only showing top 10 rows



##### Step 2 - Rename columns and add new columns

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Rename the camelCase id columns to snake_case, then add two audit columns:
# the ingestion timestamp and the data source read from the notebook widget.
pit_stops_final_df = (
    pit_stops_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumn('ingestion_date', current_timestamp())
    .withColumn('data_source', lit(v_data_source))
)

In [0]:
# Preview the first 10 rows after the renames and added columns.
pit_stops_final_df.show(n=10)

+-------+---------+----+---+--------+--------+------------+--------------------+-----------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|data_source|
+-------+---------+----+---+--------+--------+------------+--------------------+-----------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2023-09-30 18:02:...| Ergast API|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2023-09-30 18:02:...| Ergast API|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2023-09-30 18:02:...| Ergast API|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2023-09-30 18:02:...| Ergast API|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2023-09-30 18:02:...| Ergast API|
|    841|       22|   1| 13|17:24:29|  23.643|       23643|2023-09-30 18:02:...| Ergast API|
|    841|       20|   1| 14|17:25:17|  22.603|       22603|2023-09-30 18:02:...| Ergast API|
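|    841|      814|   1| 14|17:26:03|  24.863|       24863|2023-09-30 18:02:...| Ergast API|
|    841|      816|   1| 14|17:26:50|  25.259|       25259|2023-09-30 18:02:...| Ergast API|
|    841|       67|   1| 15|17:27:34|  25.342|       25342|2023-09-30 18:02:...| Ergast API|
+-------+---------+----+---+--------+--------+------------+--------------------+-----------+
only showing top 10 rows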

##### Step 3 - Write the output to processed container in parquet format

In [0]:
# Persist the final DataFrame as parquet under processed_folder_path,
# replacing any output already present at that location.
pit_stops_final_df.write.parquet(
    f"{processed_folder_path}/pit_stops",
    mode='overwrite',
)

In [0]:
# End the notebook run, handing 'Success' back as its exit value.
dbutils.notebook.exit('Success')