In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('circuits').getOrCreate()

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
lap_times_schema = StructType([StructField("raceId", IntegerType(), False),
                             StructField("driverId", IntegerType(), True),
                             StructField("lap", IntegerType(), True),
                             StructField("time", StringType(), True),
                             StructField("duration", StringType(), True),
                             StructField("milliseconds", IntegerType(), True)])

In [4]:
lap_times_df = spark.read.csv('../../raw/lap_times/', schema=lap_times_schema)

In [5]:
lap_times_df.show(5)

+------+--------+---+----+--------+------------+
|raceId|driverId|lap|time|duration|milliseconds|
+------+--------+---+----+--------+------------+
|   841|      20|  1|   1|1:38.109|       98109|
|   841|      20|  2|   1|1:33.006|       93006|
|   841|      20|  3|   1|1:32.713|       92713|
|   841|      20|  4|   1|1:32.803|       92803|
|   841|      20|  5|   1|1:32.342|       92342|
+------+--------+---+----+--------+------------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import current_timestamp

In [8]:
lap_times_df = lap_times_df.withColumn('ingestion_date', current_timestamp()) \
                            .withColumnRenamed('raceId', 'race_id') \
                            .withColumnRenamed('driverId', 'driver_id')

In [9]:
lap_times_df.show(5)

+-------+---------+---+----+--------+------------+--------------------+
|race_id|driver_id|lap|time|duration|milliseconds|      ingestion_date|
+-------+---------+---+----+--------+------------+--------------------+
|    841|       20|  1|   1|1:38.109|       98109|2022-03-16 14:41:...|
|    841|       20|  2|   1|1:33.006|       93006|2022-03-16 14:41:...|
|    841|       20|  3|   1|1:32.713|       92713|2022-03-16 14:41:...|
|    841|       20|  4|   1|1:32.803|       92803|2022-03-16 14:41:...|
|    841|       20|  5|   1|1:32.342|       92342|2022-03-16 14:41:...|
+-------+---------+---+----+--------+------------+--------------------+
only showing top 5 rows



In [10]:
lap_times_df.write.mode('overwrite').parquet('../../processed/lap_times')

In [11]:
# Test
lap_times = spark.read.parquet('../../processed/lap_times/', header=True)
lap_times.show(5)

+-------+---------+---+----+--------+------------+--------------------+
|race_id|driver_id|lap|time|duration|milliseconds|      ingestion_date|
+-------+---------+---+----+--------+------------+--------------------+
|     67|       14| 26|  13|1:25.802|       85802|2022-03-16 14:42:...|
|     67|       14| 27|  13|1:25.338|       85338|2022-03-16 14:42:...|
|     67|       14| 28|  13|1:25.395|       85395|2022-03-16 14:42:...|
|     67|       14| 29|  12|1:26.191|       86191|2022-03-16 14:42:...|
|     67|       14| 30|  11|1:25.439|       85439|2022-03-16 14:42:...|
+-------+---------+---+----+--------+------------+--------------------+
only showing top 5 rows

