In [1]:
import os

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, to_timestamp, concat
from config_file import raw_path, processed_path
from common_func import add_timestamp

In [2]:
# Obtain the Spark session for this notebook (getOrCreate hands back an
# already-running session if one exists) and echo it as the cell output.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)
spark

## Drivers Data Ingestion

In [3]:
# Load the raw drivers JSON file into a DataFrame and preview the first rows.
# The path is assembled with os.path.join rather than a hand-written
# backslash: the earlier literal contained "\d", which is not a valid string
# escape (Python emits a warning for it) and tied the notebook to the Windows
# path separator.
driver_df = spark.read.json(os.path.join(raw_path, "drivers.json"))
driver_df.show()

+----+----------+--------+----------+--------------------+-----------+------+--------------------+
|code|       dob|driverId| driverRef|                name|nationality|number|                 url|
+----+----------+--------+----------+--------------------+-----------+------+--------------------+
| HAM|1985-01-07|       1|  hamilton|   [Lewis, Hamilton]|    British|    44|http://en.wikiped...|
| HEI|1977-05-10|       2|  heidfeld|    [Nick, Heidfeld]|     German|    \N|http://en.wikiped...|
| ROS|1985-06-27|       3|   rosberg|     [Nico, Rosberg]|     German|     6|http://en.wikiped...|
| ALO|1981-07-29|       4|    alonso|  [Fernando, Alonso]|    Spanish|    14|http://en.wikiped...|
| KOV|1981-10-19|       5|kovalainen|[Heikki, Kovalainen]|    Finnish|    \N|http://en.wikiped...|
| NAK|1985-01-11|       6|  nakajima|  [Kazuki, Nakajima]|   Japanese|    \N|http://en.wikiped...|
| BOU|1979-02-28|       7|  bourdais|[Sébastien, Bourd...|     French|    \N|http://en.wikiped...|
| RAI|1979

In [4]:
# Print the schema Spark inferred for the raw JSON, to see which fields are
# nested and which were read as strings before reshaping the frame.
driver_df.printSchema()

root
 |-- code: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- driverId: long (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- number: string (nullable = true)
 |-- url: string (nullable = true)



In [5]:
# Keep only the columns needed downstream: rename the id/ref fields, pull the
# forename and surname out of the nested `name` struct into flat columns, and
# pass the remaining fields through unchanged. `url` is not selected, so it is
# dropped here.
driver_df = driver_df.select(
    col('driverId').alias('driver_Id'),
    col('driverRef').alias('driver_Ref'),
    col('name.forename').alias('first_Name'),
    col('name.surname').alias('last_Name'),
    'dob',
    'code',
    'number',
    'nationality',
)

In [6]:
# Preview the frame after the select/rename step to confirm the flattened
# name columns and the new column names.
driver_df.show()

+---------+----------+----------+----------+----------+----+------+-----------+
|driver_Id|driver_Ref|first_Name| last_Name|       dob|code|number|nationality|
+---------+----------+----------+----------+----------+----+------+-----------+
|        1|  hamilton|     Lewis|  Hamilton|1985-01-07| HAM|    44|    British|
|        2|  heidfeld|      Nick|  Heidfeld|1977-05-10| HEI|    \N|     German|
|        3|   rosberg|      Nico|   Rosberg|1985-06-27| ROS|     6|     German|
|        4|    alonso|  Fernando|    Alonso|1981-07-29| ALO|    14|    Spanish|
|        5|kovalainen|    Heikki|Kovalainen|1981-10-19| KOV|    \N|    Finnish|
|        6|  nakajima|    Kazuki|  Nakajima|1985-01-11| NAK|    \N|   Japanese|
|        7|  bourdais| Sébastien|  Bourdais|1979-02-28| BOU|    \N|     French|
|        8| raikkonen|      Kimi| Räikkönen|1979-10-17| RAI|     7|    Finnish|
|        9|    kubica|    Robert|    Kubica|1984-12-07| KUB|    88|     Polish|
|       10|     glock|      Timo|     Gl

In [7]:
# Convert `number` from string to int and stamp each row with the current
# timestamp. Values that are not numeric text (the source uses "\N" for
# missing numbers) come out of the cast as null, as the preview below shows.
driver_df = (
    driver_df
    .withColumn("number", col("number").cast('int'))
    .withColumn('ingestion_Date', current_timestamp())
)

In [8]:
# Pass the frame through the shared add_timestamp helper from common_func
# (its implementation is not visible in this notebook).
# NOTE(review): the previous cell already adds `ingestion_Date`, and the
# preview below shows only one such column, so this call may be redundant
# with that step - confirm against common_func.
driver_df = add_timestamp(driver_df)

In [9]:
# Preview the frame after the cast and timestamp steps; `number` now shows
# null where the source held "\N".
driver_df.show()

+---------+----------+----------+----------+----------+----+------+-----------+--------------------+
|driver_Id|driver_Ref|first_Name| last_Name|       dob|code|number|nationality|      ingestion_Date|
+---------+----------+----------+----------+----------+----+------+-----------+--------------------+
|        1|  hamilton|     Lewis|  Hamilton|1985-01-07| HAM|    44|    British|2022-06-13 10:35:...|
|        2|  heidfeld|      Nick|  Heidfeld|1977-05-10| HEI|  null|     German|2022-06-13 10:35:...|
|        3|   rosberg|      Nico|   Rosberg|1985-06-27| ROS|     6|     German|2022-06-13 10:35:...|
|        4|    alonso|  Fernando|    Alonso|1981-07-29| ALO|    14|    Spanish|2022-06-13 10:35:...|
|        5|kovalainen|    Heikki|Kovalainen|1981-10-19| KOV|  null|    Finnish|2022-06-13 10:35:...|
|        6|  nakajima|    Kazuki|  Nakajima|1985-01-11| NAK|  null|   Japanese|2022-06-13 10:35:...|
|        7|  bourdais| Sébastien|  Bourdais|1979-02-28| BOU|  null|     French|2022-06-13 1

In [10]:
# Replace null driver numbers with 0; only the `number` column is touched.
driver_df = driver_df.fillna(0, subset=['number'])

In [11]:
# Preview the frame after the null fill; previously-null `number` values
# should now read 0.
driver_df.show()

+---------+----------+----------+----------+----------+----+------+-----------+--------------------+
|driver_Id|driver_Ref|first_Name| last_Name|       dob|code|number|nationality|      ingestion_Date|
+---------+----------+----------+----------+----------+----+------+-----------+--------------------+
|        1|  hamilton|     Lewis|  Hamilton|1985-01-07| HAM|    44|    British|2022-06-13 10:35:...|
|        2|  heidfeld|      Nick|  Heidfeld|1977-05-10| HEI|     0|     German|2022-06-13 10:35:...|
|        3|   rosberg|      Nico|   Rosberg|1985-06-27| ROS|     6|     German|2022-06-13 10:35:...|
|        4|    alonso|  Fernando|    Alonso|1981-07-29| ALO|    14|    Spanish|2022-06-13 10:35:...|
|        5|kovalainen|    Heikki|Kovalainen|1981-10-19| KOV|     0|    Finnish|2022-06-13 10:35:...|
|        6|  nakajima|    Kazuki|  Nakajima|1985-01-11| NAK|     0|   Japanese|2022-06-13 10:35:...|
|        7|  bourdais| Sébastien|  Bourdais|1979-02-28| BOU|     0|     French|2022-06-13 1

In [12]:
# Write the cleaned drivers frame as Parquet under the processed location,
# replacing any output left by an earlier run ('overwrite' mode).
# The target path is assembled with os.path.join rather than a hand-written
# backslash: the earlier literal contained "\D", which is not a valid string
# escape (Python emits a warning for it) and tied the notebook to the Windows
# path separator.
driver_df.write.mode('overwrite').parquet(os.path.join(processed_path, "Drivers"))