In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already
# running; the application name identifies the job as 'qualifying'.
spark = (
    SparkSession.builder
    .appName('qualifying')
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [5]:
# Explicit schema for the raw qualifying JSON records.
#   * qualifyId is the only field declared non-nullable.
#   * the remaining id / number / position fields are nullable integers.
#   * q1, q2 and q3 are kept as strings (values look like "1:26.572").
qualifying_schema = StructType(
    [StructField("qualifyId", IntegerType(), False)]
    + [
        StructField(field_name, IntegerType(), True)
        for field_name in ("raceId", "driverId", "constructorId", "number", "position")
    ]
    + [
        StructField(field_name, StringType(), True)
        for field_name in ("q1", "q2", "q3")
    ]
)

In [9]:
# Load every JSON file under the raw qualifying folder using the explicit
# schema. multiLine is enabled so that a JSON record may span several lines.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option('multiLine', True)
    .json('../../raw/qualifying/')
)

In [10]:
# Preview the first five rows to check that the schema was applied as expected.
qualifying_df.show(n=5)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
|        3|    18|       5|            1|    23|       3|1:25.664|1:25.452|1:27.079|
|        4|    18|      13|            6|     2|       4|1:25.994|1:25.691|1:27.178|
|        5|    18|       2|            2|     3|       5|1:25.960|1:25.518|1:27.236|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
only showing top 5 rows



In [11]:
from pyspark.sql.functions import current_timestamp

In [13]:
# Rename the camelCase id columns to snake_case and add an audit column.
# NOTE: current_timestamp() is a column expression that Spark evaluates when a
# query actually executes, so the ingestion_date printed by a show() call can
# differ from the value that ends up in the written output.
qualifying_df = (
    qualifying_df
    .withColumnRenamed('qualifyId', 'qualify_id')
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('constructorId', 'constructor_id')
    .withColumn('ingestion_date', current_timestamp())
)

In [14]:
# Preview the renamed columns together with the new ingestion_date column.
qualifying_df.show(n=5)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2022-03-16 14:54:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2022-03-16 14:54:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2022-03-16 14:54:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2022-03-16 14:54:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2022-03-16 14:54:...|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
only showing top 5 rows

In [15]:
# Persist the processed data as Parquet; 'overwrite' replaces any output that
# already exists at the target path.
qualifying_df.write.parquet('../../processed/qualifying', mode='overwrite')

In [16]:
# Sanity check: read the Parquet output back and preview the first five rows.
# Parquet files store their own schema, so no reader options are required;
# `header` is a CSV reader option and does not apply to Parquet.
qualifying = spark.read.parquet('../../processed/qualifying/')
qualifying.show(5)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2022-03-16 14:56:...|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2022-03-16 14:56:...|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2022-03-16 14:56:...|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2022-03-16 14:56:...|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2022-03-16 14:56:...|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+
only showing top 5 rows