### **Paso 2.3 - Ingesta del archivo "constructors.json"**

In [None]:
# Declare the "p_data_source" text widget (empty default) and read its current value.
# NOTE(review): v_data_source is only referenced by a commented-out line further down
# in this notebook, so the parameter currently has no effect on the output.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
# Declare the "p_file_date" text widget (default "2021-03-21") and read its current value.
# NOTE(review): v_file_date is only referenced by commented-out lines further down
# in this notebook, so the parameter currently has no effect on the output.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo JSON

In [None]:
# DDL-style schema string for constructors.json, one column per line for readability.
# Adjacent string literals are concatenated by Python, so the resulting value is a
# single comma-separated schema definition.
constructors_schema = (
    "constructorId INT, "
    "constructorRef STRING, "
    "name STRING, "
    "nationality STRING, "
    "url STRING"
)

In [None]:
# Load constructors.json from the raw mount path, applying the explicit schema
# instead of letting Spark infer the column types.
# A date-partitioned variant of the path is kept here for reference (disabled):
#   .json(f"{raw_folder_path}/{v_file_date}/constructors.json")
constructor_df = (
    spark.read
    .schema(constructors_schema)
    .json("/mnt/formula1dl/raw/constructors.json")
)

In [None]:
# Print the DataFrame schema to confirm the column names and types that were applied on read.
constructor_df.printSchema()

root
 |-- constructorId: integer (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [None]:
# Preview the loaded rows; truncate=False prints full cell values (e.g. the complete url).
constructor_df.show(truncate=False)

+-------------+--------------+-----------+-----------+------------------------------------------------------------+
|constructorId|constructorRef|name       |nationality|url                                                         |
+-------------+--------------+-----------+-----------+------------------------------------------------------------+
|1            |mclaren       |McLaren    |British    |http://en.wikipedia.org/wiki/McLaren                        |
|2            |bmw_sauber    |BMW Sauber |German     |http://en.wikipedia.org/wiki/BMW_Sauber                     |
|3            |williams      |Williams   |British    |http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering|
|4            |renault       |Renault    |French     |http://en.wikipedia.org/wiki/Renault_in_Formula_One         |
|5            |toro_rosso    |Toro Rosso |Italian    |http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso            |
|6            |ferrari       |Ferrari    |Italian    |http://en.wikipedi

#### Paso 2 - Eliminar las columnas no deseadas

In [None]:
from pyspark.sql.functions import col

In [None]:
# Remove the "url" column, which is not carried into the processed data set.
# The column is referenced by name; no Column wrapper is needed for a plain drop.
constructor_dropped_df = constructor_df.drop("url")

In [None]:
# Preview the rows after dropping "url"; truncate=False prints full cell values.
constructor_dropped_df.show(truncate=False)

+-------------+--------------+-----------+-----------+
|constructorId|constructorRef|name       |nationality|
+-------------+--------------+-----------+-----------+
|1            |mclaren       |McLaren    |British    |
|2            |bmw_sauber    |BMW Sauber |German     |
|3            |williams      |Williams   |British    |
|4            |renault       |Renault    |French     |
|5            |toro_rosso    |Toro Rosso |Italian    |
|6            |ferrari       |Ferrari    |Italian    |
|7            |toyota        |Toyota     |Japanese   |
|8            |super_aguri   |Super Aguri|Japanese   |
|9            |red_bull      |Red Bull   |Austrian   |
|10           |force_india   |Force India|Indian     |
|11           |honda         |Honda      |Japanese   |
|12           |spyker        |Spyker     |Dutch      |
|13           |mf1           |MF1        |Russian    |
|14           |spyker_mf1    |Spyker MF1 |Dutch      |
|15           |sauber        |Sauber     |Swiss      |
|16       

#### Paso 3 - Cambiar el nombre de las columnas y añadir "ingestion date"

In [None]:
from pyspark.sql.functions import lit, current_timestamp

In [None]:
# Rename the camelCase source columns to snake_case and add an "ingestion_date"
# column holding the timestamp at which this query is evaluated.
# Disabled for now (kept for reference):
#   .withColumn("data_source", lit(v_data_source))
#   .withColumn("file_date", lit(v_file_date))
constructor_renamed_df = (
    constructor_dropped_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("ingestion_date", current_timestamp())
)

In [None]:
# Preview the renamed columns and the new ingestion_date column; truncate=False prints full values.
constructor_renamed_df.show(truncate=False)

+--------------+---------------+-----------+-----------+-----------------------+
|constructor_id|constructor_ref|name       |nationality|ingestion_date         |
+--------------+---------------+-----------+-----------+-----------------------+
|1             |mclaren        |McLaren    |British    |2023-06-09 22:11:31.546|
|2             |bmw_sauber     |BMW Sauber |German     |2023-06-09 22:11:31.546|
|3             |williams       |Williams   |British    |2023-06-09 22:11:31.546|
|4             |renault        |Renault    |French     |2023-06-09 22:11:31.546|
|5             |toro_rosso     |Toro Rosso |Italian    |2023-06-09 22:11:31.546|
|6             |ferrari        |Ferrari    |Italian    |2023-06-09 22:11:31.546|
|7             |toyota         |Toyota     |Japanese   |2023-06-09 22:11:31.546|
|8             |super_aguri    |Super Aguri|Japanese   |2023-06-09 22:11:31.546|
|9             |red_bull       |Red Bull   |Austrian   |2023-06-09 22:11:31.546|
|10            |force_india 

In [None]:
#constructor_final_df = add_ingestion_date(constructor_renamed_df)

#### Paso 4 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Write the result as PARQUET into the "constructors" directory of the "processed"
# mount, replacing whatever a previous run left there.
(
    constructor_renamed_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1dl/processed/constructors")
)

In [None]:
# List the objects found in the "constructors" directory
%fs
ls /mnt/formula1dl/processed/constructors

In [None]:
# Also persist the result as a Delta TABLE named "constructors" in the "f1_processed"
# database, replacing the table contents on each run.
# NOTE(review): if the f1_processed database is located under /mnt/formula1dl/processed,
# this table would share its directory with the parquet files written earlier in this
# notebook -- confirm the database location.
(
    constructor_renamed_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.constructors")
)

In [None]:
%sql
-- Read back the rows of the f1_processed.constructors table to verify the write.
SELECT * FROM f1_processed.constructors;

In [None]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")