## Ingest pit_stops.json file
#### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
# Notebook parameter: free-text label for the origin of this load (empty string by default).
# Its value is written to the `data_source` column when the final DataFrame is built.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")


In [0]:
# Notebook parameter: date of the raw drop to ingest, defaulting to "2021-03-28".
# It selects the dated sub-folder under raw_folder_path from which pit_stops.json is read.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, StructField, TimestampType, DateType, FloatType
from pyspark.sql.functions import col, struct,current_timestamp , concat, lit

In [0]:
# Explicit schema for pit_stops.json, so that Spark does not have to infer column types.
# `stop`, `time` and `duration` are deliberately kept as strings.
# NOTE(review): raceId is declared non-nullable here, yet the recorded printSchema() output
# reports it as nullable = true, so this flag is not reflected in the DataFrame that is read.
pit_stop_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("stop", StringType(), True)
    .add("lap", IntegerType(), True)
    .add("time", StringType(), True)
    .add("duration", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

In [0]:
# Load the JSON file
# Read the multi-line JSON file for the requested file date, applying the explicit schema.
pit_stops_path = f"{raw_folder_path}/{v_file_date}/pit_stops.json"
pit_stops_df = (
    spark.read
    .schema(pit_stop_schema)
    .option("multiLine", True)
    .json(pit_stops_path)
)

In [0]:
# Preview the raw pit-stop rows as parsed with the explicit schema.
display(pit_stops_df)

raceId,driverId,stop,lap,time,duration,milliseconds
1053,839,1,1,15:05:16,30.866,30866
1053,20,1,3,15:10:09,32.024,32024
1053,854,1,5,15:15:11,51.007,51007
1053,853,1,12,15:27:20,31.168,31168
1053,842,1,14,15:30:10,31.068,31068
1053,20,2,20,15:39:11,31.184,31184
1053,854,2,21,15:41:24,32.479,32479
1053,20,3,22,15:42:52,39.502,39502
1053,853,2,23,15:45:20,31.5,31500
1053,852,1,25,15:46:39,30.696,30696


In [0]:
# Confirm the column types that were applied on read.
# NOTE(review): the recorded output lists raceId as nullable = true although the schema
# declares it with nullable=False.
pit_stops_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- milliseconds: integer (nullable = true)



#### Step 2 - Rename columns and add new columns

In [0]:
# Switch the camelCase id columns to snake_case and tag every row with the
# data source passed in through the p_data_source widget.
pit_stops_final_df = (
    pit_stops_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumn("data_source", lit(v_data_source))
)
                                           

In [0]:
# Apply the shared add_ingestion_date helper (not defined in this notebook; presumably loaded by
# the %run of ../includes/common_functions). The recorded output below shows an
# `ingestion_date` timestamp column appended to the DataFrame.
# NOTE(review): the result is assigned back to the same name, so re-running this cell
# re-applies the helper to its own output.
pit_stops_final_df = add_ingestion_date(pit_stops_final_df)

In [0]:
# Preview the final DataFrame (renamed ids, data_source and ingestion_date) before writing it.
display(pit_stops_final_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,data_source,ingestion_date
1053,839,1,1,15:05:16,30.866,30866,test,2025-04-10T22:34:22.938+0000
1053,20,1,3,15:10:09,32.024,32024,test,2025-04-10T22:34:22.938+0000
1053,854,1,5,15:15:11,51.007,51007,test,2025-04-10T22:34:22.938+0000
1053,853,1,12,15:27:20,31.168,31168,test,2025-04-10T22:34:22.938+0000
1053,842,1,14,15:30:10,31.068,31068,test,2025-04-10T22:34:22.938+0000
1053,20,2,20,15:39:11,31.184,31184,test,2025-04-10T22:34:22.938+0000
1053,854,2,21,15:41:24,32.479,32479,test,2025-04-10T22:34:22.938+0000
1053,20,3,22,15:42:52,39.502,39502,test,2025-04-10T22:34:22.938+0000
1053,853,2,23,15:45:20,31.5,31500,test,2025-04-10T22:34:22.938+0000
1053,852,1,25,15:46:39,30.696,30696,test,2025-04-10T22:34:22.938+0000



#### Step 3 - Write the output to the processed container in Delta format

In [0]:
# Superseded write strategies, kept for reference only (the Delta merge below is the active one):
#pit_stops_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/pit_stops")

#pit_stops_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.pit_stops")

In [0]:
# Superseded partition-overwrite approach, kept for reference only:
#overwrite_partition(pit_stops_final_df, "f1_processed", "pit_stops", "race_id")

In [0]:
# Upsert this load into f1_processed.pit_stops through the shared merge_delta_data helper
# (not defined in this notebook; presumably loaded by the %run of ../includes/common_functions).
# A pit stop is matched on race_id + driver_id + stop. The condition previously repeated the
# race_id predicate a second time at the end; the redundant copy is removed here, which leaves
# the match logic unchanged.
# NOTE(review): the trailing "race_id" argument is presumably the partition column (the table
# directory is laid out as race_id=<n>/) — confirm against the helper's signature.
merge_condition = (
    "tgt.race_id = src.race_id "
    "AND tgt.driver_id = src.driver_id "
    "AND tgt.stop = src.stop"
)
merge_delta_data(
    pit_stops_final_df,
    "f1_processed",
    "pit_stops",
    processed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
# List the pit_stops table directory to check the _delta_log and race_id partitions.
# The location is built from processed_folder_path (the same variable handed to
# merge_delta_data) instead of a hard-coded mount path, so the check keeps working if the
# storage account or mount point changes.
# NOTE(review): assumes processed_folder_path resolves to /mnt/formula1dl2025practice/processed
# — confirm in ../includes/configuration.
display(dbutils.fs.ls(f"{processed_folder_path}/pit_stops"))

path,name,size,modificationTime
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/_delta_log/,_delta_log/,0,1744324257000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1000/,race_id=1000/,0,1744324271000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1001/,race_id=1001/,0,1744324271000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1002/,race_id=1002/,0,1744324271000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1003/,race_id=1003/,0,1744324272000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1004/,race_id=1004/,0,1744324272000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1005/,race_id=1005/,0,1744324272000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1006/,race_id=1006/,0,1744324272000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1007/,race_id=1007/,0,1744324272000
dbfs:/mnt/formula1dl2025practice/processed/pit_stops/race_id=1008/,race_id=1008/,0,1744324272000


In [0]:
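# Superseded check, kept for reference only. NOTE(review): the directory listing shows a
# _delta_log folder, so this location now presumably needs a Delta read rather than a parquet read.
#display(spark.read.parquet("/mnt/formula1dl2025practice/processed/pit_stops"))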

In [0]:
# End the notebook run and hand the string "Success" back to whatever invoked it.
# NOTE(review): cells placed after this call are presumably skipped during a full run, so the
# SQL check below appears intended for manual execution only — confirm.
dbutils.notebook.exit("Success")

In [0]:
%sql
-- Verification query: number of pit-stop rows stored per race, highest race_id first.
SELECT race_id, count(1)
FROM f1_processed.pit_stops
GROUP BY race_id
ORDER BY race_id DESC