### **Paso 2.8 - Ingesta del directorio "qualifying"**

#### Paso 1 - Leer el directorio **qualifying**, el cual contiene múltiples archivos JSON multilínea

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [None]:
# Explicit schema for the qualifying JSON files. Only qualifyId is declared
# non-nullable; q1-q3 are kept as strings (sample values look like "1:26.572").
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [None]:
# Load everything under the raw "qualifying" folder using the explicit schema.
# The multiLine option is enabled because each JSON record spans several lines.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json("/mnt/formula1dl/raw/qualifying")
)

In [None]:
# Preview the first 20 rows of the raw DataFrame without truncating cell values.
qualifying_df.show(n=20, truncate=False)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|q1      |q2      |q3      |
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|1        |18    |1       |1            |22    |1       |1:26.572|1:25.187|1:26.714|
|2        |18    |9       |2            |4     |2       |1:26.103|1:25.315|1:26.869|
|3        |18    |5       |1            |23    |3       |1:25.664|1:25.452|1:27.079|
|4        |18    |13      |6            |2     |4       |1:25.994|1:25.691|1:27.178|
|5        |18    |2       |2            |3     |5       |1:25.960|1:25.518|1:27.236|
|6        |18    |15      |7            |11    |6       |1:26.427|1:26.101|1:28.527|
|7        |18    |3       |3            |7     |7       |1:26.295|1:26.059|1:28.687|
|8        |18    |14      |9            |9     |8       |1:26.381|1:26.063|1:29.041|
|9        |18    |10      |7            |12    |9       |1:26.919

#### Paso 2 - Renombrar columnas y añadir nuevas columnas
1. Renombrar qualifyId, driverId, constructorId y raceId
2. Añadir ingestion_date con current timestamp

In [None]:
from pyspark.sql.functions import lit, current_timestamp

In [None]:
# Convert the camelCase id columns to snake_case and stamp every row with the
# ingestion time. Renames keep each column in its original position.
final_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [None]:
# Preview the first 20 rows of the transformed DataFrame without truncating cell values.
final_df.show(n=20, truncate=False)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|qualify_id|race_id|driver_id|constructor_id|number|position|q1      |q2      |q3      |ingestion_date         |
+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+
|1         |18     |1        |1             |22    |1       |1:26.572|1:25.187|1:26.714|2023-06-10 23:22:10.249|
|2         |18     |9        |2             |4     |2       |1:26.103|1:25.315|1:26.869|2023-06-10 23:22:10.249|
|3         |18     |5        |1             |23    |3       |1:25.664|1:25.452|1:27.079|2023-06-10 23:22:10.249|
|4         |18     |13       |6             |2     |4       |1:25.994|1:25.691|1:27.178|2023-06-10 23:22:10.249|
|5         |18     |2        |2             |3     |5       |1:25.960|1:25.518|1:27.236|2023-06-10 23:22:10.249|
|6         |18     |15       |7             |11    |6       |1:26.427|1:26.101|1:28.527|2023-06-

In [None]:
# Print the column names, types and nullability of the transformed DataFrame.
# NOTE(review): the recorded output lists qualify_id as nullable = true even
# though the read schema declares qualifyId with nullable=False — presumably
# Spark relaxes nullability when reading from files; confirm before relying on it.
final_df.printSchema()

root
 |-- qualify_id: integer (nullable = true)
 |-- race_id: integer (nullable = true)
 |-- driver_id: integer (nullable = true)
 |-- constructor_id: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- q1: string (nullable = true)
 |-- q2: string (nullable = true)
 |-- q3: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



#### Paso 3 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Write the DataFrame as Parquet to the "qualifying" directory of the
# "processed" container, replacing any data already stored at that path.
final_df.write.parquet("/mnt/formula1dl/processed/qualifying", mode="overwrite")

In [None]:
# List the objects found in the processed "qualifying" directory
%fs
ls /mnt/formula1dl/processed/qualifying