In [9]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start one named 'pitstops'.
spark = (
    SparkSession.builder
    .appName('pitstops')
    .getOrCreate()
)

In [10]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Explicit schema for the pit-stop records, so Spark does not have to infer types.
# Every field is declared non-nullable; `stop`, `time` and `duration` are kept as strings.
pitstops_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), False)
    .add("stop", StringType(), False)
    .add("lap", IntegerType(), False)
    .add("time", StringType(), False)
    .add("duration", StringType(), False)
    .add("milliseconds", IntegerType(), False)
)

In [11]:
# Load the raw pit-stop JSON with the explicit schema.
# multiLine is enabled, presumably because records span several lines in the file -- confirm against the raw data.
pitstops_df = (
    spark.read
    .schema(pitstops_schema)
    .option('multiLine', True)
    .json('../../raw/pit_stops.json')
)

In [12]:
# Preview the first five rows to sanity-check the parsed columns.
pitstops_df.show(n=5)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|   841|     153|   1|  1|17:05:23|  26.898|       26898|
|   841|      30|   1|  1|17:05:52|  25.021|       25021|
|   841|      17|   1| 11|17:20:48|  23.426|       23426|
|   841|       4|   1| 12|17:22:34|  23.251|       23251|
|   841|      13|   1| 13|17:24:10|  23.842|       23842|
+------+--------+----+---+--------+--------+------------+
only showing top 5 rows



In [13]:
from pyspark.sql.functions import current_timestamp

In [14]:
# Switch the id columns to snake_case and stamp each row with the ingestion time.
pitstops_df = (
    pitstops_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumn('ingestion_date', current_timestamp())
)

In [15]:
# Persist the transformed data as Parquet, replacing any previous output at this path.
pitstops_df.write.parquet('../../processed/pitstops', mode='overwrite')

# Read the files back and preview five rows to confirm the write succeeded.
pitstops = spark.read.parquet('../../processed/pitstops')
pitstops.show(n=5)

+-------+---------+----+---+--------+--------+------------+--------------------+
|race_id|driver_id|stop|lap|    time|duration|milliseconds|      ingestion_date|
+-------+---------+----+---+--------+--------+------------+--------------------+
|    841|      153|   1|  1|17:05:23|  26.898|       26898|2022-03-14 17:51:...|
|    841|       30|   1|  1|17:05:52|  25.021|       25021|2022-03-14 17:51:...|
|    841|       17|   1| 11|17:20:48|  23.426|       23426|2022-03-14 17:51:...|
|    841|        4|   1| 12|17:22:34|  23.251|       23251|2022-03-14 17:51:...|
|    841|       13|   1| 13|17:24:10|  23.842|       23842|2022-03-14 17:51:...|
+-------+---------+----+---+--------+--------+------------+--------------------+
only showing top 5 rows

