### **Paso 2.4 - Ingesta del archivo "drivers.json"**

In [None]:
# Notebook parameter "p_data_source": declared as a text widget with an empty
# default value, then read into v_data_source.
# NOTE(review): in the cells visible here v_data_source is referenced only from
# commented-out code, so it currently has no effect on the output.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
# Notebook parameter "p_file_date": declared as a text widget defaulting to
# "2021-03-21", then read into v_file_date.
# NOTE(review): in the cells visible here v_file_date is referenced only from
# commented-out code, so it currently has no effect on the output.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo JSON

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [None]:
# Schema of the nested "name" object in drivers.json: two nullable string
# fields, forename and surname, in that order.
name_schema = StructType(
    [StructField(part, StringType(), True) for part in ("forename", "surname")]
)

In [None]:
# Explicit schema for drivers.json so Spark does not have to infer the types.
# "name" is a nested struct described by name_schema.
# driverId is declared non-nullable here, but the printSchema() output later in
# this notebook reports it as "nullable = true", so do not rely on this flag to
# reject records with a missing driverId.
drivers_schema = StructType(
    fields=[
        StructField("driverId", IntegerType(), False),
        StructField("driverRef", StringType(), True),
        StructField("number", IntegerType(), True),
        StructField("code", StringType(), True),
        StructField("name", name_schema, True),
        StructField("dob", DateType(), True),
        StructField("nationality", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [None]:
# Read the raw drivers file with the explicit schema defined above.
# NOTE(review): the path is hard-coded; a date-partitioned variant,
# f"{raw_folder_path}/{v_file_date}/drivers.json", was left disabled by the
# author, which is why the p_file_date widget does not influence this read.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json("/mnt/formula1dl/raw/drivers.json")
)

In [None]:
# Preview the freshly read DataFrame; truncate=False keeps long values
# (such as the url column) fully visible.
drivers_df.show(truncate=False)

+--------+----------+------+----+---------------------+----------+-----------+-----------------------------------------------------+
|driverId|driverRef |number|code|name                 |dob       |nationality|url                                                  |
+--------+----------+------+----+---------------------+----------+-----------+-----------------------------------------------------+
|1       |hamilton  |44    |HAM |{Lewis, Hamilton}    |1985-01-07|British    |http://en.wikipedia.org/wiki/Lewis_Hamilton          |
|2       |heidfeld  |null  |HEI |{Nick, Heidfeld}     |1977-05-10|German     |http://en.wikipedia.org/wiki/Nick_Heidfeld           |
|3       |rosberg   |6     |ROS |{Nico, Rosberg}      |1985-06-27|German     |http://en.wikipedia.org/wiki/Nico_Rosberg            |
|4       |alonso    |14    |ALO |{Fernando, Alonso}   |1981-07-29|Spanish    |http://en.wikipedia.org/wiki/Fernando_Alonso         |
|5       |kovalainen|null  |KOV |{Heikki, Kovalainen} |1981-10-19|Fin

In [None]:
# Print the schema Spark actually applied, to compare it against drivers_schema.
drivers_df.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



#### Paso 2 - Renombrar columnas y añadir nuevas columnas
1. **driverId** renombrado a **driver_id**
2. **driverRef** renombrado a **driver_ref**
3. Se añade la columna **ingestion_date**
4. Se reemplaza la columna **name** (de tipo struct) por la concatenación de **forename** y **surname**, separados por un espacio.

In [None]:
from pyspark.sql.functions import col, concat, lit, current_timestamp

In [None]:
# Rename the camelCase identifiers to snake_case, stamp each row with an
# ingestion timestamp, and replace the nested "name" struct with a single
# "forename surname" string.
# NOTE(review): the data_source / file_date columns (from v_data_source and
# v_file_date) were left disabled by the author, so neither widget value is
# written to the output.
# NOTE(review): the two show() outputs in this notebook display different
# ingestion_date values, which suggests the timestamp is taken when an action
# runs rather than when this cell is executed - confirm if exact values matter.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)

In [None]:
# Check the renamed columns, the flattened name and the new ingestion_date column.
drivers_with_columns_df.show(truncate=False)

+---------+----------+------+----+------------------+----------+-----------+-----------------------------------------------------+-----------------------+
|driver_id|driver_ref|number|code|name              |dob       |nationality|url                                                  |ingestion_date         |
+---------+----------+------+----+------------------+----------+-----------+-----------------------------------------------------+-----------------------+
|1        |hamilton  |44    |HAM |Lewis Hamilton    |1985-01-07|British    |http://en.wikipedia.org/wiki/Lewis_Hamilton          |2023-06-10 00:55:40.646|
|2        |heidfeld  |null  |HEI |Nick Heidfeld     |1977-05-10|German     |http://en.wikipedia.org/wiki/Nick_Heidfeld           |2023-06-10 00:55:40.646|
|3        |rosberg   |6     |ROS |Nico Rosberg      |1985-06-27|German     |http://en.wikipedia.org/wiki/Nico_Rosberg            |2023-06-10 00:55:40.646|
|4        |alonso    |14    |ALO |Fernando Alonso   |1981-07-29|Spanis

In [None]:
#drivers_with_ingestion_date_df = add_ingestion_date(drivers_df)

#### Paso 3 - Eliminar las columnas no deseadas

In [None]:
# Remove the url column by name; every other column is carried over unchanged.
drivers_final_df = drivers_with_columns_df.drop("url")

In [None]:
# Confirm that the url column is gone from the final DataFrame.
drivers_final_df.show(truncate=False)

+---------+----------+------+----+------------------+----------+-----------+-----------------------+
|driver_id|driver_ref|number|code|name              |dob       |nationality|ingestion_date         |
+---------+----------+------+----+------------------+----------+-----------+-----------------------+
|1        |hamilton  |44    |HAM |Lewis Hamilton    |1985-01-07|British    |2023-06-10 00:58:31.433|
|2        |heidfeld  |null  |HEI |Nick Heidfeld     |1977-05-10|German     |2023-06-10 00:58:31.433|
|3        |rosberg   |6     |ROS |Nico Rosberg      |1985-06-27|German     |2023-06-10 00:58:31.433|
|4        |alonso    |14    |ALO |Fernando Alonso   |1981-07-29|Spanish    |2023-06-10 00:58:31.433|
|5        |kovalainen|null  |KOV |Heikki Kovalainen |1981-10-19|Finnish    |2023-06-10 00:58:31.433|
|6        |nakajima  |null  |NAK |Kazuki Nakajima   |1985-01-11|Japanese   |2023-06-10 00:58:31.433|
|7        |bourdais  |null  |BOU |Sébastien Bourdais|1979-02-28|French     |2023-06-10 00:5

#### Paso 4 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Write the final DataFrame as Parquet to the "drivers" directory of the
# "processed" container, replacing whatever a previous run left there.
(
    drivers_final_df.write
    .format("parquet")
    .mode("overwrite")
    .save("/mnt/formula1dl/processed/drivers")
)

In [None]:
# List the objects written to the "drivers" directory of the "processed"
# container (the original comment said "constructors", a copy-paste slip).
# dbutils.fs.ls is used rather than the %fs magic so that the cell does not
# depend on the magic command being the very first line of the cell.
display(dbutils.fs.ls("/mnt/formula1dl/processed/drivers"))

In [None]:
# Persist the same DataFrame as a Delta table named "drivers" in the
# "f1_processed" database, overwriting the table's previous contents.
# NOTE(review): the Parquet write earlier in this notebook targets
# /mnt/formula1dl/processed/drivers; if the f1_processed database is located
# under /mnt/formula1dl/processed, this table may resolve to that same
# directory - confirm that the two outputs do not collide.
(
    drivers_final_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.drivers")
)

In [None]:
%sql
-- Query the f1_processed.drivers table to check the rows that were written.
SELECT * FROM f1_processed.drivers

In [None]:
# End the notebook run and hand the string "Success" back to whatever invoked it.
dbutils.notebook.exit("Success")