In [27]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (created on first use, reused
# afterwards), registered under the application name 'drivers'.
spark = (
    SparkSession.builder
    .appName('drivers')
    .getOrCreate()
)

In [28]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [41]:
# Nested struct describing the `name` object of each driver record.
name_schema = StructType([
    StructField("forename", StringType(), False),
    StructField("surname", StringType(), False),
])

# Explicit schema for the raw drivers JSON file.
# NOTE: the printSchema() output of the loaded frame in this notebook reports
# every field as `nullable = true`, so the nullable flags below document the
# expected data contract rather than something enforced at read time.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), False),
    # Some drivers have no number (it shows up as null in the loaded data),
    # so this field must be declared nullable.
    StructField("number", IntegerType(), True),
    StructField("code", StringType(), False),
    # Nullable is left at the StructField default for the nested name struct.
    StructField("name", name_schema),
    StructField("dob", DateType(), False),
    StructField("nationality", StringType(), False),
    StructField("url", StringType(), False),
])

In [42]:
# Load the raw drivers file using the explicit schema defined above
# instead of letting Spark infer one.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json('../../raw/drivers.json')
)

In [43]:
# Preview the first five rows of the freshly loaded frame.
drivers_df.show(n=5)

+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|number|code|                name|       dob|nationality|                 url|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|  null| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|  null| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
only showing top 5 rows



In [44]:
# Print the schema Spark actually applied to the loaded frame, including the
# nullability of each field.
drivers_df.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [45]:
from pyspark.sql.functions import current_timestamp, concat, lit, col

In [46]:
# Switch the id/ref columns to snake_case, stamp each row with the time of
# ingestion, and discard the `url` column.
drivers_df = (
    drivers_df
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('driverRef', 'driver_ref')
    .withColumn('ingestion_date', current_timestamp())
    .drop('url')
)

In [49]:
# Flatten the nested `name` struct into a single "forename surname" string.
# This overwrites the `name` column with a plain string, so the cell is meant
# to run once per load: on an already-flattened frame the nested fields
# `name.forename` / `name.surname` no longer exist.
# NOTE(review): concat is expected to yield null when either part is null -
# confirm that is acceptable for this dataset.
drivers_df = drivers_df.withColumn(
    'name',
    concat(col('name.forename'), lit(' '), col('name.surname')),
)

In [50]:
# Preview the first five rows of the transformed frame.
drivers_df.show(n=5)

+---------+----------+------+----+-----------------+----------+-----------+--------------------+
|driver_id|driver_ref|number|code|             name|       dob|nationality|      ingestion_date|
+---------+----------+------+----+-----------------+----------+-----------+--------------------+
|        1|  hamilton|    44| HAM|   Lewis Hamilton|1985-01-07|    British|2022-03-14 15:30:...|
|        2|  heidfeld|  null| HEI|    Nick Heidfeld|1977-05-10|     German|2022-03-14 15:30:...|
|        3|   rosberg|     6| ROS|     Nico Rosberg|1985-06-27|     German|2022-03-14 15:30:...|
|        4|    alonso|    14| ALO|  Fernando Alonso|1981-07-29|    Spanish|2022-03-14 15:30:...|
|        5|kovalainen|  null| KOV|Heikki Kovalainen|1981-10-19|    Finnish|2022-03-14 15:30:...|
+---------+----------+------+----+-----------------+----------+-----------+--------------------+
only showing top 5 rows



In [51]:
# Persist the transformed frame as parquet, replacing any previous output
# at the target location.
drivers_df.write.parquet('../../processed/drivers', mode='overwrite')

In [54]:
# Read the written parquet output back in to verify the result.
drivers = (
    spark.read
    .format('parquet')
    .load('../../processed/drivers/')
)

In [None]:
# Preview the first five rows read back from the parquet output.
drivers.show(n=5)

+---------+----------+------+----+-----------------+----------+-----------+--------------------+
|driver_id|driver_ref|number|code|             name|       dob|nationality|      ingestion_date|
+---------+----------+------+----+-----------------+----------+-----------+--------------------+
|        1|  hamilton|    44| HAM|   Lewis Hamilton|1985-01-07|    British|2022-03-14 15:35:...|
|        2|  heidfeld|  null| HEI|    Nick Heidfeld|1977-05-10|     German|2022-03-14 15:35:...|
|        3|   rosberg|     6| ROS|     Nico Rosberg|1985-06-27|     German|2022-03-14 15:35:...|
|        4|    alonso|    14| ALO|  Fernando Alonso|1981-07-29|    Spanish|2022-03-14 15:35:...|
|        5|kovalainen|  null| KOV|Heikki Kovalainen|1981-10-19|    Finnish|2022-03-14 15:35:...|
+---------+----------+------+----+-----------------+----------+-----------+--------------------+
only showing top 5 rows

