### **Paso 2.6 - Ingesta del archivo "pit_stops.json"**

Los *widgets* de Databricks (`dbutils.widgets`) nos permiten crear parámetros e indicar su valor en tiempo de ejecución

<center><img src="https://i.postimg.cc/J4Zr2JJy/db73.png"></center>

In [None]:
# Declare the "p_data_source" text widget (empty default) and read its current
# value, so the data source name can be supplied at run time.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
# Display the value read from the "p_data_source" widget.
v_data_source

Out[2]: 'Ergast'

In [None]:
# Declare the "p_file_date" text widget (default "2021-03-28") and read its
# current value, so the file date can be supplied at run time.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [None]:
# Display the value read from the "p_file_date" widget.
v_file_date

Out[4]: '2023-06-11'

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo JSON

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [None]:
# Explicit schema applied when reading pit_stops.json.
# NOTE: raceId is declared with nullable=False, yet the printSchema() output
# recorded in this notebook reports it as "nullable = true", so do not rely on
# this flag to reject records with a missing raceId.
pit_stops_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("driverId", IntegerType(), True),
        StructField("stop", StringType(), True),
        StructField("lap", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)

In [None]:
# "raw_folder_path" is expected to come from the "configuration" notebook.
# "multiLine" is enabled so that JSON records spanning several lines are parsed.
# NOTE: "v_file_date" is not used by this read; the date-partitioned
# alternative would be f"{raw_folder_path}/{v_file_date}/pit_stops.json".
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/pit_stops.json")
)

In [None]:
# Preview the rows that were read, without truncating the cell values.
pit_stops_df.show(truncate=False)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|time    |duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|841   |153     |1   |1  |17:05:23|26.898  |26898       |
|841   |30      |1   |1  |17:05:52|25.021  |25021       |
|841   |17      |1   |11 |17:20:48|23.426  |23426       |
|841   |4       |1   |12 |17:22:34|23.251  |23251       |
|841   |13      |1   |13 |17:24:10|23.842  |23842       |
|841   |22      |1   |13 |17:24:29|23.643  |23643       |
|841   |20      |1   |14 |17:25:17|22.603  |22603       |
|841   |814     |1   |14 |17:26:03|24.863  |24863       |
|841   |816     |1   |14 |17:26:50|25.259  |25259       |
|841   |67      |1   |15 |17:27:34|25.342  |25342       |
|841   |2       |1   |15 |17:27:41|22.994  |22994       |
|841   |1       |1   |16 |17:28:24|23.227  |23227       |
|841   |808     |1   |16 |17:28:39|24.535  |24535       |
|841   |3       |1   |16 |17:29:00|23.716  |23716       |
|841   |155   

In [None]:
# Print the column names, types and nullability of the DataFrame that was read.
pit_stops_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- milliseconds: integer (nullable = true)



#### Paso 2 - Renombrar columnas y añadir nuevas columnas
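1. Renombrar driverId y raceId a driver_id y race_id
2. Añadir las columnas data_source y file_date con los valores de los parámetros
3. Añadir ingestion_date con current timestamp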

In [None]:
from pyspark.sql.functions import lit

In [None]:
# Rename the camelCase id columns to snake_case and tag every row with the
# widget-supplied data source and file date as constant (literal) columns.
pit_stops_new_df = (
    pit_stops_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [None]:
# "add_ingestion_date()" is provided by the "common_functions" notebook; judging
# by the schema printed further down, it appends an "ingestion_date" timestamp column.
pit_stops_with_ingestion_date_df = add_ingestion_date(pit_stops_new_df)

In [None]:
# Preview the renamed and enriched rows, without truncating the cell values.
pit_stops_with_ingestion_date_df.show(truncate=False)

+-------+---------+----+---+--------+--------+------------+-----------+----------+-----------------------+
|race_id|driver_id|stop|lap|time    |duration|milliseconds|data_source|file_date |ingestion_date         |
+-------+---------+----+---+--------+--------+------------+-----------+----------+-----------------------+
|841    |153      |1   |1  |17:05:23|26.898  |26898       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |30       |1   |1  |17:05:52|25.021  |25021       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |17       |1   |11 |17:20:48|23.426  |23426       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |4        |1   |12 |17:22:34|23.251  |23251       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |13       |1   |13 |17:24:10|23.842  |23842       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |22       |1   |13 |17:24:29|23.643  |23643       |Ergast     |2023-06-11|2023-06-11 14:49:41.461|
|841    |20       |1   |14 |17:25:17|

In [None]:
# Print the schema after the renames and the added columns.
pit_stops_with_ingestion_date_df.printSchema()

root
 |-- race_id: integer (nullable = true)
 |-- driver_id: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- milliseconds: integer (nullable = true)
 |-- data_source: string (nullable = false)
 |-- file_date: string (nullable = false)
 |-- ingestion_date: timestamp (nullable = false)



In [None]:
# Alias the enriched DataFrame as the final result that gets written out below.
final_df = pit_stops_with_ingestion_date_df

#### Paso 3 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Persist the final DataFrame as Parquet under "<processed_folder_path>/pit_stops",
# replacing whatever a previous run wrote there ("overwrite" mode).
# "processed_folder_path" is expected to come from the "configuration" notebook.
(
    final_df.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/pit_stops")
)

In [None]:
# Visualizamos los objetos que se encuentran en el directorio "pit_stops"
%fs
ls /mnt/formula1dl/processed/pit_stops

In [None]:
# End the notebook run, handing "Success" back as the exit value to the caller.
dbutils.notebook.exit("Success")