# Ingest Lap Times CSV Files (Folder)
1. Read Data
2. Transform Data
3. Write Data

In [0]:
# Import Modules
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [0]:
# File Date Parameter
dbutils.widgets.text("FileDate", "2021-03-28")
file_date = dbutils.widgets.get('FileDate')

In [0]:
%run "../../01-Setup/09-Global-Variables"

In [0]:
%run "../../01-Setup/10-Global-Functions"

### Read Data

In [0]:
# Write Schema (Explicitly)
# StructType Represents Rows, StructField Represents Columns
laptimes_schema = StructType(fields = [
    StructField('raceId', IntegerType(), False),
    StructField('driverId', IntegerType(), True),
    StructField('lap', IntegerType(), True),
    StructField('position', IntegerType(), True),
    StructField('time', StringType(), True),
    StructField('milliseconds', DateType(), True)
])

# Apply Schema
laptimes_df = spark.read \
    .option('header', True) \
    .schema(laptimes_schema) \
    .csv(f'{raw_inc_folder_path}/{file_date}/lap_times/*.csv')

# Display Data
display(laptimes_df)

raceId,driverId,lap,position,time,milliseconds
1053,830,2,1,2:29.163,�163-01-01
1053,830,3,1,2:23.247,�247-01-01
1053,830,4,1,2:20.332,�332-01-01
1053,830,5,1,2:25.691,�691-01-01
1053,830,6,1,2:20.804,�804-01-01
1053,830,7,1,1:36.303,�303-01-01
1053,830,8,1,1:32.925,�925-01-01
1053,830,9,1,1:30.953,�953-01-01
1053,830,10,1,1:30.130,�130-01-01
1053,830,11,1,1:29.168,�168-01-01


In [0]:
# Print Schema
laptimes_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- lap: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: date (nullable = true)



In [0]:
# Describe Data
laptimes_df.describe().show()

+-------+------+------------------+------------------+-----------------+--------+
|summary|raceId|          driverId|               lap|         position|    time|
+-------+------+------------------+------------------+-----------------+--------+
|  count|  1123|              1123|              1123|             1123|    1123|
|   mean|1053.0| 653.6411398040962| 30.95547640249332|9.472840605520926|    null|
| stddev|   0.0|345.75568685200244|18.067110332743095| 5.21344088004138|    null|
|    min|  1053|                 1|                 1|                1|1:16.702|
|    max|  1053|               854|                63|               19|3:10.646|
+-------+------+------------------+------------------+-----------------+--------+



### Transform Data

In [0]:
# Rename Columns Using withColumnRenamed
laptimes_renamed_df = laptimes_df \
    .withColumnRenamed('driverId', 'driver_id') \
    .withColumnRenamed('raceId', 'race_id') \
    .withColumn('file_date', lit(file_date))

# Display Data
display(laptimes_renamed_df)

race_id,driver_id,lap,position,time,milliseconds,file_date
1053,830,2,1,2:29.163,�163-01-01,2021-04-18
1053,830,3,1,2:23.247,�247-01-01,2021-04-18
1053,830,4,1,2:20.332,�332-01-01,2021-04-18
1053,830,5,1,2:25.691,�691-01-01,2021-04-18
1053,830,6,1,2:20.804,�804-01-01,2021-04-18
1053,830,7,1,1:36.303,�303-01-01,2021-04-18
1053,830,8,1,1:32.925,�925-01-01,2021-04-18
1053,830,9,1,1:30.953,�953-01-01,2021-04-18
1053,830,10,1,1:30.130,�130-01-01,2021-04-18
1053,830,11,1,1:29.168,�168-01-01,2021-04-18


In [0]:
# Add Ingested Date Column w/ Current Timestamp
# laptimes_final_df = laptimes_renamed_df \
#     .withColumn('ingestion_date', current_timestamp())

# Add Ingested Date Column w/ Current Timestamp
laptimes_final_df = add_ingestion_date(laptimes_renamed_df)

# Display Data
display(laptimes_final_df)

race_id,driver_id,lap,position,time,milliseconds,file_date,ingestion_date
1053,830,2,1,2:29.163,�163-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,3,1,2:23.247,�247-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,4,1,2:20.332,�332-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,5,1,2:25.691,�691-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,6,1,2:20.804,�804-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,7,1,1:36.303,�303-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,8,1,1:32.925,�925-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,9,1,1:30.953,�953-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,10,1,1:30.130,�130-01-01,2021-04-18,2023-08-15T16:57:05.536+0000
1053,830,11,1,1:29.168,�168-01-01,2021-04-18,2023-08-15T16:57:05.536+0000


### Write Data

In [0]:
# Write DataFrame to FileSystem in Parquet Format
laptimes_final_df.write \
    .mode('overwrite') \
    .partitionBy('race_id') \
    .parquet(f'{processed_inc_folder_path}/lap-times')

In [0]:
# INCREMENTAL LOAD METHOD 2 W/ FUNCTIONS
# Re-Arrange DF (input_df, partition_column)
output_df = re_arrange_partition_column(laptimes_final_df, 'race_id')

# Overwrite Partitions (input_df, db_name, table_name, partition_column)
overwrite_partition(laptimes_final_df, 'formula1_processed_inc', 'laptimes_inc', 'race_id')

In [0]:
# Read File
df = spark.read.parquet('abfss://processed-incremental@dbcourselakehouse.dfs.core.windows.net/lap-times')

# Display Data
display(df)

driver_id,lap,position,time,milliseconds,file_date,ingestion_date,race_id
847,1,1,1:08.922,�922-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,2,1,1:38.421,�421-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,3,1,1:29.830,�830-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,4,1,1:32.256,�256-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,5,1,1:30.133,�133-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,6,1,1:28.698,�698-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,7,1,0:58.160,�160-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,8,1,0:58.236,�236-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,9,1,0:58.059,�059-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046
847,10,1,0:58.329,�329-01-01,2021-03-21,2023-08-15T16:54:40.837+0000,1046


In [0]:
%sql
-- Read Table
select race_id, count(*)
from formula1_processed_inc.laptimes_inc
group by race_id;

race_id,count(1)
879,1395
1016,1310
974,1414
847,1488
183,1286
27,1263
1046,1531
223,1345
861,1270
185,1458


In [0]:
# Notbook Exit Output
dbutils.notebook.exit("Lap Times Successful")