### Ingest constructors.json file

In [0]:
# Declare the notebook text widget "p_data_source" (empty default) and
# read its current value so later cells can tag rows with it.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the JSON file using the Spark DataFrame reader

In [0]:
# DDL-style schema for constructors.json, one column per line.
# Adjacent string literals are concatenated into a single DDL string.
constructors_schema = (
    "constructorId INT,"
    "constructorRef STRING,"
    "name STRING,"
    "nationality STRING,"
    "url STRING"
)

In [0]:
# Read the raw constructors file with the explicit schema applied
# (no schema inference pass over the data).
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json(f"{raw_folder_path}/constructors.json")
)

In [0]:
# Preview the first 10 rows of the raw dataframe.
constructors_df.show(n=10)

+-------------+--------------+-----------+-----------+--------------------+
|constructorId|constructorRef|       name|nationality|                 url|
+-------------+--------------+-----------+-----------+--------------------+
|            1|       mclaren|    McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber| BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|   Williams|    British|http://en.wikiped...|
|            4|       renault|    Renault|     French|http://en.wikiped...|
|            5|    toro_rosso| Toro Rosso|    Italian|http://en.wikiped...|
|            6|       ferrari|    Ferrari|    Italian|http://en.wikiped...|
|            7|        toyota|     Toyota|   Japanese|http://en.wikiped...|
|            8|   super_aguri|Super Aguri|   Japanese|http://en.wikiped...|
|            9|      red_bull|   Red Bull|   Austrian|http://en.wikiped...|
|           10|   force_india|Force India|     Indian|http://en.wikiped...|
+-----------

##### Step 2 - Drop unwanted columns from the dataframe

In [0]:
from pyspark.sql.functions import col

In [0]:
# Remove the "url" column; every other column is carried forward unchanged.
constructors_dropped_df = constructors_df.drop("url")

In [0]:
# Preview the first 10 rows after dropping the unwanted column.
constructors_dropped_df.show(n=10)

+-------------+--------------+-----------+-----------+
|constructorId|constructorRef|       name|nationality|
+-------------+--------------+-----------+-----------+
|            1|       mclaren|    McLaren|    British|
|            2|    bmw_sauber| BMW Sauber|     German|
|            3|      williams|   Williams|    British|
|            4|       renault|    Renault|     French|
|            5|    toro_rosso| Toro Rosso|    Italian|
|            6|       ferrari|    Ferrari|    Italian|
|            7|        toyota|     Toyota|   Japanese|
|            8|   super_aguri|Super Aguri|   Japanese|
|            9|      red_bull|   Red Bull|   Austrian|
|           10|   force_india|Force India|     Indian|
+-------------+--------------+-----------+-----------+
only showing top 10 rows



##### Step 3 - Rename columns and add ingestion date

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Rename the camelCase columns to snake_case, then append two audit columns:
# the ingestion timestamp and the data source passed in via the widget.
constructors_final_df = (
    constructors_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Preview the first 10 rows of the final dataframe.
constructors_final_df.show(n=10)

+--------------+---------------+-----------+-----------+--------------------+-----------+
|constructor_id|constructor_ref|       name|nationality|      ingestion_date|data_source|
+--------------+---------------+-----------+-----------+--------------------+-----------+
|             1|        mclaren|    McLaren|    British|2023-09-30 17:59:...| Ergast API|
|             2|     bmw_sauber| BMW Sauber|     German|2023-09-30 17:59:...| Ergast API|
|             3|       williams|   Williams|    British|2023-09-30 17:59:...| Ergast API|
|             4|        renault|    Renault|     French|2023-09-30 17:59:...| Ergast API|
|             5|     toro_rosso| Toro Rosso|    Italian|2023-09-30 17:59:...| Ergast API|
|             6|        ferrari|    Ferrari|    Italian|2023-09-30 17:59:...| Ergast API|
|             7|         toyota|     Toyota|   Japanese|2023-09-30 17:59:...| Ergast API|
|             8|    super_aguri|Super Aguri|   Japanese|2023-09-30 17:59:...| Ergast API|
|         

##### Step 4 - Write the output to a Parquet file

In [0]:
# Persist the final dataframe as Parquet, replacing any existing output
# at the target path.
constructors_final_df.write.parquet(
    f"{processed_folder_path}/constructors",
    mode="overwrite",
)

In [0]:
# Return the exit value "Success" to whatever invoked this notebook.
dbutils.notebook.exit("Success")