### Ingest qualifying JSON files

In [0]:
# Declare the `p_data_source` text widget (empty default) and read its current
# value; the value is later stamped onto every row as the `data_source` column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Step 1 - Read the JSON files using the Spark DataFrame reader API

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

In [0]:
# Explicit schema for the qualifying JSON input, handed to the reader below.
# Each entry is (source column name, Spark type, nullable).
qualifying_schema = StructType(fields=[
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ('qualifyId', IntegerType(), False),
        ('raceId', IntegerType(), True),
        ('driverId', IntegerType(), True),
        ('constructorId', IntegerType(), True),
        ('number', IntegerType(), True),
        ('position', IntegerType(), True),
        ('q1', StringType(), True),
        ('q2', StringType(), True),
        ('q3', StringType(), True),
    )
])

In [0]:
# Read everything under the raw `qualifying` folder as multi-line JSON,
# applying the explicit schema defined above.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option('multiline', True)
    .json(f"{raw_folder_path}/qualifying")
)

In [0]:
# Preview the first 10 rows of the raw qualifying data.
qualifying_df.show(n=10)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
|        3|    18|       5|            1|    23|       3|1:25.664|1:25.452|1:27.079|
|        4|    18|      13|            6|     2|       4|1:25.994|1:25.691|1:27.178|
|        5|    18|       2|            2|     3|       5|1:25.960|1:25.518|1:27.236|
|        6|    18|      15|            7|    11|       6|1:26.427|1:26.101|1:28.527|
|        7|    18|       3|            3|     7|       7|1:26.295|1:26.059|1:28.687|
|        8|    18|      14|            9|     9|       8|1:26.381|1:26.063|1:29.041|
|        9|    18|      10|            7|    12|       9|1:26.919

##### Step 2 - Rename columns and add new columns

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# Convert the camelCase id columns to snake_case, then append two audit columns:
#   ingestion_date - timestamp taken when the job runs
#   data_source    - the value supplied through the `p_data_source` widget
qualifying_final_df = (
    qualifying_df
    .withColumnRenamed('qualifyId', 'qualify_id')
    .withColumnRenamed('raceId', 'race_id')
    .withColumnRenamed('driverId', 'driver_id')
    .withColumnRenamed('constructorId', 'constructor_id')
    .withColumn('ingestion_date', current_timestamp())
    .withColumn('data_source', lit(v_data_source))
)

In [0]:
# Preview the first 10 rows after renaming and adding the audit columns.
qualifying_final_df.show(n=10)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+-----------+
|qualify_id|race_id|driver_id|constructor_id|number|position|      q1|      q2|      q3|      ingestion_date|data_source|
+----------+-------+---------+--------------+------+--------+--------+--------+--------+--------------------+-----------+
|         1|     18|        1|             1|    22|       1|1:26.572|1:25.187|1:26.714|2023-09-30 18:06:...| Ergast API|
|         2|     18|        9|             2|     4|       2|1:26.103|1:25.315|1:26.869|2023-09-30 18:06:...| Ergast API|
|         3|     18|        5|             1|    23|       3|1:25.664|1:25.452|1:27.079|2023-09-30 18:06:...| Ergast API|
|         4|     18|       13|             6|     2|       4|1:25.994|1:25.691|1:27.178|2023-09-30 18:06:...| Ergast API|
|         5|     18|        2|             2|     3|       5|1:25.960|1:25.518|1:27.236|2023-09-30 18:06:...| Ergast API|
|         6|     18|    

##### Step 3 - Write the output to the processed container in Parquet format

In [0]:
# Persist the result as Parquet under the processed `qualifying` folder.
# 'overwrite' mode replaces whatever a previous run left at that path.
(
    qualifying_final_df.write
    .mode('overwrite')
    .parquet(f"{processed_folder_path}/qualifying")
)

In [0]:
# End the notebook run, handing the string 'Success' back as its exit value.
dbutils.notebook.exit("Success")