## Ingest pit_stops.json file
#### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its current value.
# v_data_source is later written to every row as the "data_source" column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")


In [0]:
# Declare the "p_file_date" text widget (default: "2021-03-28") and read its current value.
# v_file_date selects the dated sub-folder under raw_folder_path that the JSON file is read from.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, StructField, TimestampType, DateType, FloatType
from pyspark.sql.functions import col, struct,current_timestamp , concat, lit

In [0]:
# Explicit schema for pit_stops.json so Spark does not have to infer column types.
# Only raceId is declared non-nullable; every other field may be null.
pit_stop_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("stop", StringType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)

In [0]:
# Read the multi-line JSON file for the requested file date, applying the explicit schema.
pit_stops_df = (
    spark.read
    .schema(pit_stop_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/{v_file_date}/pit_stops.json")
)

In [0]:
# Preview the rows parsed from the JSON file.
display(pit_stops_df)

In [0]:
# Show the column names and types of the loaded DataFrame.
pit_stops_df.printSchema()

#### Step 2 - Rename columns and add new columns

In [0]:
# Switch the camelCase id columns to snake_case and tag each row with the
# data source supplied through the p_data_source widget.
pit_stops_final_df = (
    pit_stops_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumn("data_source", lit(v_data_source))
)
                                           

In [0]:
# add_ingestion_date is not defined in this notebook (presumably it comes from the
# %run includes above and appends an ingestion timestamp column — TODO confirm).
pit_stops_final_df = add_ingestion_date(pit_stops_final_df)

In [0]:
# Preview the transformed DataFrame before it is written out.
display(pit_stops_final_df)


#### Step 3 - Write the output to the processed container in parquet format

In [0]:
##pit_stops_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/pit_stops")

#pit_stops_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.pit_stops")

In [0]:
#overwrite_partition(pit_stops_final_df, "f1_processed", "pit_stops", "race_id")

In [0]:
# Merge this run's pit stops into f1_processed.pit_stops via merge_delta_data.
# Target (tgt) and source (src) rows are matched on race_id, driver_id and stop.
# The previous condition listed the race_id predicate twice; the redundant copy
# is removed here, which leaves the match logic unchanged.
# merge_delta_data is not defined in this notebook (presumably provided by the
# %run includes); the final "race_id" argument is presumably the partition
# column — TODO confirm against the helper.
merge_condition = "tgt.race_id = src.race_id AND tgt.driver_id = src.driver_id AND tgt.stop = src.stop"
merge_delta_data(pit_stops_final_df, "f1_processed", "pit_stops", processed_folder_path, merge_condition, "race_id")

In [0]:
# List the files under the processed pit_stops folder.
# Uses the configured processed_folder_path rather than a hard-coded mount path,
# so the check follows the same location the rest of the notebook uses.
# NOTE(review): assumes processed_folder_path resolves to the
# /mnt/formula1dl2025practice/processed mount that was listed before — confirm.
display(dbutils.fs.ls(f"{processed_folder_path}/pit_stops"))

In [0]:
#display(spark.read.parquet("/mnt/formula1dl2025practice/processed/pit_stops"))

In [0]:
# Exit the notebook with the value "Success".
# NOTE(review): the %sql check below sits after this exit call, so it is presumably
# skipped on Run All / job runs — confirm it is meant to be run manually only.
dbutils.notebook.exit("Success")

In [0]:
%sql
-- Row count per race_id in the processed pit_stops table, highest race_id first.
SELECT
  race_id,
  COUNT(1)
FROM f1_processed.pit_stops
GROUP BY race_id
ORDER BY race_id DESC