### **Paso 5.2.5 - Ingesta del archivo "results.json" para el directorio "2021-03-28"**

Los *widgets* de Databricks (`dbutils.widgets`) nos permiten definir parámetros del notebook e indicar su valor en tiempo de ejecución.
<center><img src="https://i.postimg.cc/pTcSSHJQ/db153.png"></center>

In [None]:
# Declare a text widget named "p_data_source" (default: empty string) so the data
# source label can be supplied at run time, then read its current value.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
# Display the value read from the "p_data_source" widget.
v_data_source

Out[33]: 'Ergast'

In [None]:
# Declare a text widget named "p_file_date" (default: "2021-03-28") that selects which
# dated folder is ingested, then read its current value.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [None]:
# Display the value read from the "p_file_date" widget.
v_file_date

Out[35]: '2021-03-28'

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo JSON

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [None]:
# Explicit schema for results.json. Each entry is (field name, Spark type, nullable) and
# is expanded into a StructField; only "resultId" is declared non-nullable.
results_schema = StructType(fields=[
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ("resultId", IntegerType(), False),
        ("raceId", IntegerType(), True),
        ("driverId", IntegerType(), True),
        ("constructorId", IntegerType(), True),
        ("number", IntegerType(), True),
        ("grid", IntegerType(), True),
        ("position", IntegerType(), True),
        ("positionText", StringType(), True),
        ("positionOrder", IntegerType(), True),
        ("points", FloatType(), True),
        ("laps", IntegerType(), True),
        ("time", StringType(), True),
        ("milliseconds", IntegerType(), True),
        ("fastestLap", IntegerType(), True),
        ("rank", IntegerType(), True),
        ("fastestLapTime", StringType(), True),
        ("fastestLapSpeed", FloatType(), True),
        ("statusId", StringType(), True),
    ]
])

In [None]:
# "raw_folder_path" is defined in the "configuration" notebook; "v_file_date" is the
# widget value supplied at run time.
# Example of the resulting path: /mnt/formula1dl/raw/2021-03-28/results.json
results_json_path = f"{raw_folder_path}/{v_file_date}/results.json"

# Read the JSON file applying the explicit schema instead of inferring one.
results_df = spark.read.schema(results_schema).json(results_json_path)

In [None]:
# Preview the rows just read, without truncating long cell values.
results_df.show(truncate=False)

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|time       |milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|24966   |1052  |1       |131          |44    |2   |1       |1           |1            |25.0  |56  |1:32:03.897|5523897     |44        |4   |1:34.015      |207.235        |1       |
|24967   |1052  |830     |9            |33    |1   |2       |2           |2            |18.0  |56  |+0.745     |5524642     |41        |2   |1:33.228      |208.984        |1       |
|24968   |1052  |822     |131          |77    |3   |3       |3           |3            |16

#### Paso 2 - Renombrar columnas y añadir columnas nuevas

In [None]:
from pyspark.sql.functions import lit

In [None]:
# Source (camelCase) column -> target (snake_case) column.
# "statusId" is not renamed here; it is dropped in a later step.
column_renames = {
    "resultId": "result_id",
    "raceId": "race_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
    "positionText": "position_text",
    "positionOrder": "position_order",
    "fastestLap": "fastest_lap",
    "fastestLapTime": "fastest_lap_time",
    "fastestLapSpeed": "fastest_lap_speed",
}

# Start again from results_df on every run so the cell stays idempotent.
results_with_columns_df = results_df
for source_name, target_name in column_renames.items():
    results_with_columns_df = results_with_columns_df.withColumnRenamed(source_name, target_name)

# Tag every row with the run-time parameters as constant (literal) columns.
results_with_columns_df = (
    results_with_columns_df
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [None]:
# Preview the renamed columns plus the new "data_source" and "file_date" columns.
results_with_columns_df.show(truncate=False)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+-----------+----------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|time       |milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|statusId|data_source|file_date |
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+-----------+----------+
|24966    |1052   |1        |131           |44    |2   |1       |1            |1             |25.0  |56  |1:32:03.897|5523897     |44         |4   |1:34.015        |207.235          |1       |Ergast     |2021-03-28|
|24967    |1052   |830      |9             |33    |1   |2       |2            |2             |18.0  |56  |+0.745     |5524642     |41   

In [None]:
# add_ingestion_date() is defined in the "common_functions" notebook.
# NOTE(review): judging by the output below it adds an "ingestion_date" timestamp
# column; its implementation is not visible here — confirm in common_functions.
results_with_ingestion_date_df = add_ingestion_date(results_with_columns_df)

In [None]:
# Preview the DataFrame returned by add_ingestion_date().
results_with_ingestion_date_df.show(truncate=False)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+-----------+----------+-----------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|time       |milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|statusId|data_source|file_date |ingestion_date         |
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+-----------+----------+-----------------------+
|24966    |1052   |1        |131           |44    |2   |1       |1            |1             |25.0  |56  |1:32:03.897|5523897     |44         |4   |1:34.015        |207.235          |1       |Ergast     |2021-03-28|2023-06-14 16:52:08.785|
|24967    |1052   |830      |9          

#### Paso 3 - Eliminar la columna no deseada

In [None]:
from pyspark.sql.functions import col

In [None]:
# "statusId" is the only column removed before the data is written.
unwanted_column = col("statusId")
results_final_df = results_with_ingestion_date_df.drop(unwanted_column)

In [None]:
# Preview the final DataFrame; "statusId" should no longer be present.
results_final_df.show(truncate=False)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+-----------+----------+-----------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|time       |milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|data_source|file_date |ingestion_date         |
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+-----------+----------+-----------------------+
|24966    |1052   |1        |131           |44    |2   |1       |1            |1             |25.0  |56  |1:32:03.897|5523897     |44         |4   |1:34.015        |207.235          |Ergast     |2021-03-28|2023-06-14 16:52:09.581|
|24967    |1052   |830      |9             |33    |1   |2       |2          

#### Paso 4 - Escribir datos en el datalake como parquet y crear la tabla **results** en la base de datos **f1_processed**

Si ejecutamos dos veces el notebook especificando el mismo valor para el parámetro **p_file_date**, DUPLICAREMOS los datos dado que estamos utilizando el **modo "append"**. ESE ES UN PROBLEMA DE ESTE MÉTODO.

In [None]:
# Append this run's rows, in Parquet format, to the table "results" of the database
# "f1_processed". With mode "append" the rows already in the table are kept, so
# running this cell again for the same file date adds the same rows a second time.
results_writer = results_final_df.write.mode("append").format("parquet")
results_writer.saveAsTable("f1_processed.results")

In [None]:
# Read the Parquet files back from the processed folder and preview them.
# NOTE(review): presumably this is the storage location of f1_processed.results — confirm.
# The path is hard-coded, unlike the raw path, which is built from a variable of the
# "configuration" notebook; consider a shared variable if one exists there.
spark.read.parquet("/mnt/formula1dl/processed/results").show(truncate=False)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+-----------+----------+-----------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|time       |milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|data_source|file_date |ingestion_date         |
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+-----------+----------+-----------------------+
|1        |18     |1        |1             |22    |1   |1       |1            |1             |10.0  |58  |1:34:50.616|5690616     |39         |2   |1:27.452        |218.3            |Ergast     |2021-03-21|2023-06-14 15:26:01.731|
|2        |18     |2        |2             |3     |5   |2       |2          

In [None]:
%sql
-- Query the table by name to confirm it is readable through SQL.
SELECT * FROM f1_processed.results;

result_id,race_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,time,milliseconds,fastest_lap,rank,fastest_lap_time,fastest_lap_speed,data_source,file_date,ingestion_date
1,18,1,1,22,1,1.0,1,1,10.0,58,1:34:50.616,5690616.0,39.0,2.0,1:27.452,218.3,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
2,18,2,2,3,5,2.0,2,2,8.0,58,+5.478,5696094.0,41.0,3.0,1:27.739,217.586,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
3,18,3,3,7,7,3.0,3,3,6.0,58,+8.163,5698779.0,41.0,5.0,1:28.090,216.719,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
4,18,4,4,5,11,4.0,4,4,5.0,58,+17.181,5707797.0,58.0,7.0,1:28.603,215.464,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
5,18,5,1,23,3,5.0,5,5,4.0,58,+18.014,5708630.0,43.0,1.0,1:27.418,218.385,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
6,18,6,3,8,13,6.0,6,6,3.0,57,\N,,50.0,14.0,1:29.639,212.974,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
7,18,7,5,14,17,7.0,7,7,2.0,55,\N,,22.0,12.0,1:29.534,213.224,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
8,18,8,6,1,15,8.0,8,8,1.0,53,\N,,20.0,4.0,1:27.903,217.18,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
9,18,9,2,4,2,,R,9,0.0,47,\N,,15.0,9.0,1:28.753,215.1,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000
10,18,10,7,12,18,,R,10,0.0,43,\N,,23.0,13.0,1:29.558,213.166,Ergast,2021-03-21,2023-06-14T15:26:01.731+0000


In [None]:
%sql
-- Row count per race, highest race_id first.
-- race_id = 1052 is the data added by this run.
SELECT race_id, COUNT(1)
FROM f1_processed.results
GROUP BY race_id
ORDER BY race_id DESC

race_id,count(1)
1052,20
1047,20
1046,20
1045,20
1044,20
1043,20
1042,20
1041,20
1040,20
1039,20


In [None]:
%sql
-- Show the table's columns and data types along with its detailed table information.
DESCRIBE FORMATTED f1_processed.results;

col_name,data_type,comment
result_id,int,
race_id,int,
driver_id,int,
constructor_id,int,
number,int,
grid,int,
position,int,
position_text,string,
position_order,int,
points,float,


<center><img src="https://i.postimg.cc/yN6K6vLt/db114.png"></center>

In [None]:
# End the notebook run and hand the string "Success" back as its exit value
# (what a caller using dbutils.notebook.run() receives).
dbutils.notebook.exit("Success")

Success