# Ingest Lap Times CSV Files (Folder)
1. Read Data
2. Transform Data
3. Write Data

In [0]:
# Import Modules
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# File Date Parameter
dbutils.widgets.text("FileDate", "2021-03-21")
file_date = dbutils.widgets.get('FileDate')

In [0]:
%run "../../01-Setup/09-Global-Variables"

In [0]:
%run "../../01-Setup/10-Global-Functions"

### Read Data

In [0]:
# Write Schema (Explicitly)
# StructType Represents Rows, StructField Represents Columns
laptimes_schema = StructType(fields = [
    StructField('raceId', IntegerType(), False),
    StructField('driverId', IntegerType(), True),
    StructField('lap', IntegerType(), True),
    StructField('position', IntegerType(), True),
    StructField('time', StringType(), True),
    StructField('milliseconds', DateType(), True)
])

# Apply Schema
laptimes_df = spark.read \
    .option('header', True) \
    .schema(laptimes_schema) \
    .csv(f'{raw_delta_folder_path}/{file_date}/lap_times/*.csv')

# Display Data
display(laptimes_df)

raceId,driverId,lap,position,time,milliseconds
1053,830,2,1,2:29.163,�163-01-01
1053,830,3,1,2:23.247,�247-01-01
1053,830,4,1,2:20.332,�332-01-01
1053,830,5,1,2:25.691,�691-01-01
1053,830,6,1,2:20.804,�804-01-01
1053,830,7,1,1:36.303,�303-01-01
1053,830,8,1,1:32.925,�925-01-01
1053,830,9,1,1:30.953,�953-01-01
1053,830,10,1,1:30.130,�130-01-01
1053,830,11,1,1:29.168,�168-01-01


In [0]:
# Print Schema
laptimes_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- lap: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: date (nullable = true)



In [0]:
# Describe Data
laptimes_df.describe().show()

+-------+------+------------------+------------------+-----------------+--------+
|summary|raceId|          driverId|               lap|         position|    time|
+-------+------+------------------+------------------+-----------------+--------+
|  count|  1123|              1123|              1123|             1123|    1123|
|   mean|1053.0| 653.6411398040962| 30.95547640249332|9.472840605520926|    null|
| stddev|   0.0|345.75568685200244|18.067110332743095| 5.21344088004138|    null|
|    min|  1053|                 1|                 1|                1|1:16.702|
|    max|  1053|               854|                63|               19|3:10.646|
+-------+------+------------------+------------------+-----------------+--------+



### Transform Data

In [0]:
# Rename Columns Using withColumnRenamed
laptimes_renamed_df = laptimes_df \
    .withColumnRenamed('driverId', 'driver_id') \
    .withColumnRenamed('raceId', 'race_id') \
    .withColumn('file_date', lit(file_date))

# Display Data
display(laptimes_renamed_df)

race_id,driver_id,lap,position,time,milliseconds,file_date
1053,830,2,1,2:29.163,�163-01-01,2021-04-18
1053,830,3,1,2:23.247,�247-01-01,2021-04-18
1053,830,4,1,2:20.332,�332-01-01,2021-04-18
1053,830,5,1,2:25.691,�691-01-01,2021-04-18
1053,830,6,1,2:20.804,�804-01-01,2021-04-18
1053,830,7,1,1:36.303,�303-01-01,2021-04-18
1053,830,8,1,1:32.925,�925-01-01,2021-04-18
1053,830,9,1,1:30.953,�953-01-01,2021-04-18
1053,830,10,1,1:30.130,�130-01-01,2021-04-18
1053,830,11,1,1:29.168,�168-01-01,2021-04-18


In [0]:
# Add Ingested Date Column w/ Current Timestamp
# laptimes_final_df = laptimes_renamed_df \
#     .withColumn('ingestion_date', current_timestamp())

# Add Ingested Date Column w/ Current Timestamp
laptimes_final_df = add_ingestion_date(laptimes_renamed_df)

# Display Data
display(laptimes_final_df)

race_id,driver_id,lap,position,time,milliseconds,file_date,ingestion_date
1053,830,2,1,2:29.163,�163-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,3,1,2:23.247,�247-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,4,1,2:20.332,�332-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,5,1,2:25.691,�691-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,6,1,2:20.804,�804-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,7,1,1:36.303,�303-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,8,1,1:32.925,�925-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,9,1,1:30.953,�953-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,10,1,1:30.130,�130-01-01,2021-04-18,2023-08-22T16:10:59.322+0000
1053,830,11,1,1:29.168,�168-01-01,2021-04-18,2023-08-22T16:10:59.322+0000


### Write Data

In [0]:
# Write DataFrame to File System in Delta Format
laptimes_final_df.write \
    .mode('append') \
    .format('delta') \
    .partitionBy('race_id') \
    .save(f'{processed_delta_folder_path}/laptimes_delta')

In [0]:
# Merge Delta Data and Write DataFrame to Database
merge_condition = 'tgt.driver_id = src.driver_id and tgt.race_id = src.race_id and tgt.lap = src.lap'
merge_partitioned_delta_data(laptimes_final_df, 'formula1_processed_delta', 'laptimes_delta', processed_delta_database_folder_path, 'race_id', merge_condition)

In [0]:
# Read File
df = spark.read.format('delta').load(f'{processed_delta_database_folder_path}/laptimes_delta')

# Display Data
display(df)

race_id,driver_id,lap,position,time,milliseconds,file_date,ingestion_date
847,20,1,1,2:18.174,�174-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,2,1,2:06.919,�919-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,3,1,2:05.303,�303-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,4,1,2:05.715,�715-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,5,1,1:36.175,�175-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,6,1,1:34.827,�827-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,7,1,1:35.452,�452-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,8,1,1:41.724,�724-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,9,1,2:13.172,�172-01-01,2021-03-21,2023-08-22T16:08:04.319+0000
847,20,10,1,2:08.183,�183-01-01,2021-03-21,2023-08-22T16:08:04.319+0000


In [0]:
%sql
-- Read Table
select race_id, count(*)
from formula1_processed_delta.laptimes_delta
group by race_id;

race_id,count(1)
148,957
1025,900
858,1172
897,1145
31,1013
137,733
85,1045
65,1098
879,1395
883,1206


In [0]:
# Notbook Exit Output
dbutils.notebook.exit("Lap Times Successful")