In [79]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one.
# The application name is the label this job carries in the Spark UI and logs;
# it is set to 'races' because every cell in this notebook ingests the races
# dataset (the earlier 'circuits' label looked like a copy-paste leftover).
spark = SparkSession.builder.appName('races').getOrCreate()

In [80]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

In [81]:
# Explicit schema for the races data: one (column name, Spark type) pair per
# field, each wrapped in a StructField declared with nullable=False.
# NOTE(review): the processed output further down shows a null race_timestamp,
# so `date` or `time` is evidently missing/unparseable for at least one row --
# confirm whether nullable=False is really the intent for those fields.
races_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("raceId", IntegerType()),
        ("year", IntegerType()),
        ("round", IntegerType()),
        ("circuitId", IntegerType()),
        ("name", StringType()),
        ("date", DateType()),
        ("time", StringType()),
        ("url", StringType()),
    )
])

In [82]:
# Load the raw races CSV using the explicit schema; header=True makes Spark
# treat the first line of the file as column names instead of data.
races_df = (
    spark.read
    .schema(races_schema)
    .option('header', True)
    .csv('../../raw/races.csv')
)

In [83]:
# Preview the first five rows of the freshly loaded frame.
races_df.show(n=5)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
only showing top 5 rows



In [84]:
# Give the id and year columns snake_case names; all other columns keep
# their original names.
races_df = (
    races_df
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('year', 'race_year')
    .withColumnRenamed('circuitId', 'circuit_id')
)

In [85]:
from pyspark.sql.functions import current_timestamp, lit, concat, to_timestamp, col

In [86]:
# Add two columns:
#   race_timestamp - `date` and `time` joined with a single space and parsed
#                    as a timestamp;
#   ingestion_date - the current timestamp, recording when the data was ingested.
# NOTE(review): the processed output contains a row with a null
# race_timestamp, presumably because one of the two parts was missing or not
# parseable -- confirm that losing the race date for such rows is acceptable.
races_df = (
    races_df
    .withColumn(
        'race_timestamp',
        to_timestamp(concat(col('date'), lit(' '), col('time'))),
    )
    .withColumn('ingestion_date', current_timestamp())
)

In [87]:
# Remove the original date and time columns from the frame.
races_df = races_df.drop(*('date', 'time'))

In [88]:
# Preview the first five rows of the transformed frame.
races_df.show(n=5)

+-------+---------+-----+----------+--------------------+--------------------+-------------------+--------------------+
|race_id|race_year|round|circuit_id|                name|                 url|     race_timestamp|      ingestion_date|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|http://en.wikiped...|2009-03-29 06:00:00|2022-03-14 14:29:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|http://en.wikiped...|2009-04-05 09:00:00|2022-03-14 14:29:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|http://en.wikiped...|2009-04-19 07:00:00|2022-03-14 14:29:...|
|      4|     2009|    4|         3|  Bahrain Grand Prix|http://en.wikiped...|2009-04-26 12:00:00|2022-03-14 14:29:...|
|      5|     2009|    5|         4|  Spanish Grand Prix|http://en.wikiped...|2009-05-10 12:00:00|2022-03-14 14:29:...|
+-------+---------+-----+----------+----

In [89]:
# Write the frame as Parquet, partitioned by race_year; 'overwrite' mode
# replaces whatever already exists at the target path.
(
    races_df.write
    .mode('overwrite')
    .partitionBy('race_year')
    .parquet('../../processed/races')
)

In [90]:
# Read the Parquet output back from the processed location.
races = (
    spark.read
    .parquet('../../processed/races')
)

In [91]:
# Preview the first five rows of the data read back from Parquet.
races.show(n=5)

+-------+-----+----------+--------------------+--------------------+-------------------+--------------------+---------+
|race_id|round|circuit_id|                name|                 url|     race_timestamp|      ingestion_date|race_year|
+-------+-----+----------+--------------------+--------------------+-------------------+--------------------+---------+
|   1053|    2|        21|Emilia Romagna Gr...|http://en.wikiped...|2021-04-18 13:00:00|2022-03-14 14:29:...|     2021|
|   1052|    1|         3|  Bahrain Grand Prix|http://en.wikiped...|2021-03-28 15:00:00|2022-03-14 14:29:...|     2021|
|   1051|   21|         1|Australian Grand ...|http://en.wikiped...|2021-11-21 06:00:00|2022-03-14 14:29:...|     2021|
|   1054|    3|        20|                 TBC|http://en.wikiped...|               null|2022-03-14 14:29:...|     2021|
|   1055|    4|         4|  Spanish Grand Prix|http://en.wikiped...|2021-05-09 13:00:00|2022-03-14 14:29:...|     2021|
+-------+-----+----------+--------------