# Ingest Pitstops JSON File

1. Read Data
2. Transform Data
3. Write Data

In [0]:
# Import Modules
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# File Date Parameter
# Expose a "FileDate" text widget (default 2021-03-28) and read its current value.
# file_date selects the raw sub-folder to ingest and is stamped onto every row later on.
dbutils.widgets.text("FileDate", "2021-03-28")
file_date = dbutils.widgets.get("FileDate")

In [0]:
%run "../../01-Setup/09-Global-Variables"

In [0]:
%run "../../01-Setup/10-Global-Functions"

### Read Data

In [0]:
# Explicit schema for pit_stops.json (nothing is inferred on read).
# Each entry is (column name, Spark type, nullable).
# NOTE: nullable=False on raceId is not reflected in the loaded DataFrame —
# the printSchema() output further down reports every column as nullable = true.
pitstops_columns = [
    ('raceId', IntegerType(), False),
    ('driverId', IntegerType(), True),
    ('stop', StringType(), True),
    ('lap', IntegerType(), True),
    ('time', StringType(), True),
    ('duration', StringType(), True),
    ('milliseconds', IntegerType(), True),
]
pitstops_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in pitstops_columns
])

# Read the multi-line JSON file for the selected file date using that schema
pitstops_source_path = f'{raw_inc_folder_path}/{file_date}/pit_stops.json'
pitstops_reader = spark.read.schema(pitstops_schema).option('multiLine', True)
pitstops_df = pitstops_reader.json(pitstops_source_path)

# Display Data
display(pitstops_df)

raceId,driverId,stop,lap,time,duration,milliseconds
1053,839,1,1,15:05:16,30.866,30866
1053,20,1,3,15:10:09,32.024,32024
1053,854,1,5,15:15:11,51.007,51007
1053,853,1,12,15:27:20,31.168,31168
1053,842,1,14,15:30:10,31.068,31068
1053,20,2,20,15:39:11,31.184,31184
1053,854,2,21,15:41:24,32.479,32479
1053,20,3,22,15:42:52,39.502,39502
1053,853,2,23,15:45:20,31.5,31500
1053,852,1,25,15:46:39,30.696,30696


In [0]:
# Print Schema
# Show the column names, types and nullability Spark actually applied to pitstops_df
pitstops_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- milliseconds: integer (nullable = true)



In [0]:
# Describe Data
# Summary statistics (count, mean, stddev, min, max) for every column.
# `time` and `duration` are read as strings, so their min/max are string comparisons
# rather than numeric ones.
pitstops_df.describe().show()

+-------+------+-----------------+------------------+------------------+--------+-----------------+-----------------+
|summary|raceId|         driverId|              stop|               lap|    time|         duration|     milliseconds|
+-------+------+-----------------+------------------+------------------+--------+-----------------+-----------------+
|  count|    56|               56|                56|                56|      56|               56|               56|
|   mean|1053.0|            618.0|2.2142857142857144|27.946428571428573|    null|          36.6055|       490435.125|
| stddev|   0.0|371.6344733985019|1.1554501006382465| 7.706949975319544|    null|9.322011969884322|689062.9786592821|
|    min|  1053|                1|                 1|                 1|15:05:16|         1:00.172|            29742|
|    max|  1053|              854|                 5|                40|16:36:13|           57.601|          1621361|
+-------+------+-----------------+------------------+---

### Transform Data

In [0]:
# Convert the camelCase id columns to snake_case and tag every row with the
# file date this run is ingesting.
pitstops_renamed_df = (
    pitstops_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumn('file_date', lit(file_date))
)

# Display Data
display(pitstops_renamed_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,file_date
1053,839,1,1,15:05:16,30.866,30866,2021-04-18
1053,20,1,3,15:10:09,32.024,32024,2021-04-18
1053,854,1,5,15:15:11,51.007,51007,2021-04-18
1053,853,1,12,15:27:20,31.168,31168,2021-04-18
1053,842,1,14,15:30:10,31.068,31068,2021-04-18
1053,20,2,20,15:39:11,31.184,31184,2021-04-18
1053,854,2,21,15:41:24,32.479,32479,2021-04-18
1053,20,3,22,15:42:52,39.502,39502,2021-04-18
1053,853,2,23,15:45:20,31.5,31500,2021-04-18
1053,852,1,25,15:46:39,30.696,30696,2021-04-18


In [0]:
# Add Ingestion Date Column
# add_ingestion_date is not defined in this notebook — presumably it comes from the
# 10-Global-Functions notebook pulled in via %run (confirm). Per the displayed output it
# adds an `ingestion_date` timestamp column; it stands in for the inline
# .withColumn('ingestion_date', current_timestamp()) approach.
pitstops_transformed_df = add_ingestion_date(pitstops_renamed_df)

# Display Data
display(pitstops_transformed_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,file_date,ingestion_date
1053,839,1,1,15:05:16,30.866,30866,2021-04-18,2023-08-15T16:49:52.704+0000
1053,20,1,3,15:10:09,32.024,32024,2021-04-18,2023-08-15T16:49:52.704+0000
1053,854,1,5,15:15:11,51.007,51007,2021-04-18,2023-08-15T16:49:52.704+0000
1053,853,1,12,15:27:20,31.168,31168,2021-04-18,2023-08-15T16:49:52.704+0000
1053,842,1,14,15:30:10,31.068,31068,2021-04-18,2023-08-15T16:49:52.704+0000
1053,20,2,20,15:39:11,31.184,31184,2021-04-18,2023-08-15T16:49:52.704+0000
1053,854,2,21,15:41:24,32.479,32479,2021-04-18,2023-08-15T16:49:52.704+0000
1053,20,3,22,15:42:52,39.502,39502,2021-04-18,2023-08-15T16:49:52.704+0000
1053,853,2,23,15:45:20,31.5,31500,2021-04-18,2023-08-15T16:49:52.704+0000
1053,852,1,25,15:46:39,30.696,30696,2021-04-18,2023-08-15T16:49:52.704+0000


In [0]:
# Final DataFrame
# Nothing needs dropping for pit stops: the explicit read schema has no `url` field and
# none of the transform steps add one, so a .drop('url') here would be a no-op.
# The transformed frame is therefore already the final one.
pitstops_final_df = pitstops_transformed_df

# Display Data
display(pitstops_final_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,file_date,ingestion_date
1053,839,1,1,15:05:16,30.866,30866,2021-04-18,2023-08-15T16:49:52.985+0000
1053,20,1,3,15:10:09,32.024,32024,2021-04-18,2023-08-15T16:49:52.985+0000
1053,854,1,5,15:15:11,51.007,51007,2021-04-18,2023-08-15T16:49:52.985+0000
1053,853,1,12,15:27:20,31.168,31168,2021-04-18,2023-08-15T16:49:52.985+0000
1053,842,1,14,15:30:10,31.068,31068,2021-04-18,2023-08-15T16:49:52.985+0000
1053,20,2,20,15:39:11,31.184,31184,2021-04-18,2023-08-15T16:49:52.985+0000
1053,854,2,21,15:41:24,32.479,32479,2021-04-18,2023-08-15T16:49:52.985+0000
1053,20,3,22,15:42:52,39.502,39502,2021-04-18,2023-08-15T16:49:52.985+0000
1053,853,2,23,15:45:20,31.5,31500,2021-04-18,2023-08-15T16:49:52.985+0000
1053,852,1,25,15:46:39,30.696,30696,2021-04-18,2023-08-15T16:49:52.985+0000


### Write Data

In [0]:
# Write DataFrame to FileSystem in Parquet Format
# NOTE: no partitionBy is applied on this write, and mode 'overwrite' replaces whatever
# already exists at the target path.
pitstops_parquet_path = f'{processed_inc_folder_path}/pitstops'
pitstops_final_df.write.mode('overwrite').parquet(pitstops_parquet_path)

In [0]:
# INCREMENTAL LOAD METHOD 2 W/ FUNCTIONS
# Re-arrange the DataFrame for the partitioned table (input_df, partition_column).
# NOTE(review): both helpers are defined outside this notebook; presumably
# re_arrange_partition_column moves race_id to the last column position — confirm
# in 10-Global-Functions.
output_df = re_arrange_partition_column(pitstops_final_df, 'race_id')

# Overwrite Partitions (input_df, db_name, table_name, partition_column)
# Pass the re-arranged frame so the result of the step above is actually used;
# handing over pitstops_final_df would leave output_df computed and ignored.
# NOTE(review): if overwrite_partition re-arranges internally as well, this should be
# harmless provided the re-arrangement is idempotent — confirm.
overwrite_partition(output_df, 'formula1_processed_inc', 'pitstops_inc', 'race_id')

In [0]:
# Read File from File System (Test)
# Read back from the same path expression the Parquet write uses, instead of a hardcoded
# storage URL, so this check always inspects what this notebook just wrote — even if the
# configured processed folder changes.
df = spark.read.parquet(f'{processed_inc_folder_path}/pitstops')

# Display Data
display(df)

race_id,driver_id,stop,lap,time,duration,milliseconds,file_date,ingestion_date
1053,839,1,1,15:05:16,30.866,30866,2021-04-18,2023-08-15T16:49:53.218+0000
1053,20,1,3,15:10:09,32.024,32024,2021-04-18,2023-08-15T16:49:53.218+0000
1053,854,1,5,15:15:11,51.007,51007,2021-04-18,2023-08-15T16:49:53.218+0000
1053,853,1,12,15:27:20,31.168,31168,2021-04-18,2023-08-15T16:49:53.218+0000
1053,842,1,14,15:30:10,31.068,31068,2021-04-18,2023-08-15T16:49:53.218+0000
1053,20,2,20,15:39:11,31.184,31184,2021-04-18,2023-08-15T16:49:53.218+0000
1053,854,2,21,15:41:24,32.479,32479,2021-04-18,2023-08-15T16:49:53.218+0000
1053,20,3,22,15:42:52,39.502,39502,2021-04-18,2023-08-15T16:49:53.218+0000
1053,853,2,23,15:45:20,31.5,31500,2021-04-18,2023-08-15T16:49:53.218+0000
1053,852,1,25,15:46:39,30.696,30696,2021-04-18,2023-08-15T16:49:53.218+0000


In [0]:
%sql
-- Row count per race in the incrementally loaded pit stops table
SELECT race_id, COUNT(*)
FROM formula1_processed_inc.pitstops_inc
GROUP BY race_id;

race_id,count(1)
883,71
879,69
970,82
950,66
847,75
842,59
914,79
844,82
861,76
855,65


In [0]:
# Notebook Exit Output
# End the notebook run, handing this status string back as its exit value
dbutils.notebook.exit("Pitstops Successful")