### Ingest drivers.json file

In [0]:
# Declare the text widget "p_data_source" (empty default) so the notebook can be
# parameterised, then read whatever value it currently holds.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the JSON file using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,DateType,IntegerType

In [0]:
# Schema of the nested `name` object: two nullable string fields.
name_schema = StructType(
    [
        StructField("forename", StringType(), True),
        StructField("surname", StringType(), True),
    ]
)

In [0]:
# Explicit schema for drivers.json so the reader does not have to infer types.
# `name` is a nested struct described by `name_schema`.
drivers_schema = StructType(
    [
        StructField("driverId", IntegerType(), False),
        StructField("driverRef", StringType(), True),
        StructField("number", IntegerType(), True),
        StructField("code", StringType(), True),
        StructField("name", name_schema, True),
        StructField("dob", DateType(), True),
        StructField("nationality", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Load the raw drivers file with the explicit schema defined above.
# NOTE(review): `raw_folder_path` is presumably set by ../includes/configuration - confirm.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json(f"{raw_folder_path}/drivers.json")
)

In [0]:
# Preview the first 10 parsed rows.
drivers_df.show(n=10)

+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|number|code|                name|       dob|nationality|                 url|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|  null| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|  null| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
|       6|  nakajima|  null| NAK|  {Kazuki, Nakajima}|1985-01-11|   Japanese|http://en.wikiped...|
|       7|  bourdais|  null| BOU|{Sébastien, Bourd...|1979-02-28|     French|http://en.wikiped...|
|       8|

##### Step 2 - Rename columns and add new columns

In [0]:
from pyspark.sql.functions import current_timestamp,col,lit,concat

In [0]:
# Rename the camelCase id/ref columns to snake_case, stamp each row with the
# ingestion time, flatten the nested `name` struct into "<forename> <surname>",
# and tag each row with the data source passed in through the widget.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Preview the first 10 rows after the renames and added columns.
drivers_with_columns_df.show(n=10)

+---------+----------+------+----+------------------+----------+-----------+--------------------+--------------------+-----------+
|driver_id|driver_ref|number|code|              name|       dob|nationality|                 url|      ingestion_date|data_source|
+---------+----------+------+----+------------------+----------+-----------+--------------------+--------------------+-----------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|1985-01-07|    British|http://en.wikiped...|2023-09-30 18:01:...| Ergast API|
|        2|  heidfeld|  null| HEI|     Nick Heidfeld|1977-05-10|     German|http://en.wikiped...|2023-09-30 18:01:...| Ergast API|
|        3|   rosberg|     6| ROS|      Nico Rosberg|1985-06-27|     German|http://en.wikiped...|2023-09-30 18:01:...| Ergast API|
|        4|    alonso|    14| ALO|   Fernando Alonso|1981-07-29|    Spanish|http://en.wikiped...|2023-09-30 18:01:...| Ergast API|
|        5|kovalainen|  null| KOV| Heikki Kovalainen|1981-10-19|    Finnish|http://

##### Step 3 - Drop the unwanted columns

In [0]:
# Remove the `url` column from the output.
drivers_final_df = drivers_with_columns_df.drop("url")

In [0]:
# Preview the first 10 rows of the final frame.
drivers_final_df.show(n=10)

+---------+----------+------+----+------------------+----------+-----------+--------------------+-----------+
|driver_id|driver_ref|number|code|              name|       dob|nationality|      ingestion_date|data_source|
+---------+----------+------+----+------------------+----------+-----------+--------------------+-----------+
|        1|  hamilton|    44| HAM|    Lewis Hamilton|1985-01-07|    British|2023-09-30 18:01:...| Ergast API|
|        2|  heidfeld|  null| HEI|     Nick Heidfeld|1977-05-10|     German|2023-09-30 18:01:...| Ergast API|
|        3|   rosberg|     6| ROS|      Nico Rosberg|1985-06-27|     German|2023-09-30 18:01:...| Ergast API|
|        4|    alonso|    14| ALO|   Fernando Alonso|1981-07-29|    Spanish|2023-09-30 18:01:...| Ergast API|
|        5|kovalainen|  null| KOV| Heikki Kovalainen|1981-10-19|    Finnish|2023-09-30 18:01:...| Ergast API|
|        6|  nakajima|  null| NAK|   Kazuki Nakajima|1985-01-11|   Japanese|2023-09-30 18:01:...| Ergast API|
|        7

##### Step 4 - Write the output to the processed container in Parquet format

In [0]:
# Write the final frame as Parquet, replacing any existing output at the target path.
# NOTE(review): `processed_folder_path` is presumably set by ../includes/configuration - confirm.
drivers_final_df.write.parquet(
    f"{processed_folder_path}/drivers",
    mode="overwrite",
)

In [0]:
# End the notebook run, returning the string 'Success' as its exit value.
dbutils.notebook.exit("Success")