### **Paso 2.7 - Ingesta del directorio "lap_times"**

#### Paso 1 - Leer el directorio **lap_times**, el cual contiene múltiples archivos CSV

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [None]:
# Explicit schema for the lap_times CSV files, declared column by column so
# the reader does not have to infer the types.
# NOTE: raceId is declared non-nullable here, yet the printSchema() output
# later in this notebook reports "nullable = true" for it, so do not rely on
# this flag to reject missing values.
lap_times_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)  # lap time kept as text, e.g. "1:38.109"
    .add("milliseconds", IntegerType(), True)
)

In [None]:
# Read the lap_times directory with the explicit schema. The path is the
# directory itself (not a single file), which holds multiple CSV files.
lap_times_df = spark.read.csv(
    "/mnt/formula1dl/raw/lap_times",
    schema=lap_times_schema,
)

In [None]:
# Preview the raw rows; truncate=False prints full column values instead of
# shortening long strings.
lap_times_df.show(truncate=False)

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|time    |milliseconds|
+------+--------+---+--------+--------+------------+
|841   |20      |1  |1       |1:38.109|98109       |
|841   |20      |2  |1       |1:33.006|93006       |
|841   |20      |3  |1       |1:32.713|92713       |
|841   |20      |4  |1       |1:32.803|92803       |
|841   |20      |5  |1       |1:32.342|92342       |
|841   |20      |6  |1       |1:32.605|92605       |
|841   |20      |7  |1       |1:32.502|92502       |
|841   |20      |8  |1       |1:32.537|92537       |
|841   |20      |9  |1       |1:33.240|93240       |
|841   |20      |10 |1       |1:32.572|92572       |
|841   |20      |11 |1       |1:32.669|92669       |
|841   |20      |12 |1       |1:32.902|92902       |
|841   |20      |13 |1       |1:33.698|93698       |
|841   |20      |14 |3       |1:52.075|112075      |
|841   |20      |15 |4       |1:38.385|98385       |
|841   |20      |16 |2       |1:31.548|91548       |

In [None]:
# Print column names, types and nullability to confirm the explicit schema
# was applied to the data that was read.
lap_times_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- lap: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: integer (nullable = true)



#### Paso 2 - Renombrar columnas y añadir nuevas columnas
1. Renombrar driverId y raceId
2. Añadir ingestion_date con current timestamp

In [None]:
from pyspark.sql.functions import lit, current_timestamp

In [None]:
# Switch the camelCase id columns to snake_case and stamp every row with the
# time of ingestion. Renaming happens in place, so column order is unchanged.
final_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [None]:
# Preview the transformed rows to check the renamed columns and the new
# ingestion_date column; truncate=False keeps the full timestamp visible.
final_df.show(truncate=False)

+-------+---------+---+--------+--------+------------+-----------------------+
|race_id|driver_id|lap|position|time    |milliseconds|ingestion_date         |
+-------+---------+---+--------+--------+------------+-----------------------+
|841    |20       |1  |1       |1:38.109|98109       |2023-06-10 22:12:39.273|
|841    |20       |2  |1       |1:33.006|93006       |2023-06-10 22:12:39.273|
|841    |20       |3  |1       |1:32.713|92713       |2023-06-10 22:12:39.273|
|841    |20       |4  |1       |1:32.803|92803       |2023-06-10 22:12:39.273|
|841    |20       |5  |1       |1:32.342|92342       |2023-06-10 22:12:39.273|
|841    |20       |6  |1       |1:32.605|92605       |2023-06-10 22:12:39.273|
|841    |20       |7  |1       |1:32.502|92502       |2023-06-10 22:12:39.273|
|841    |20       |8  |1       |1:32.537|92537       |2023-06-10 22:12:39.273|
|841    |20       |9  |1       |1:33.240|93240       |2023-06-10 22:12:39.273|
|841    |20       |10 |1       |1:32.572|92572       |2023-06-10 22:12:39.273|

#### Paso 3 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Write the result as Parquet into the "lap_times" directory of the
# "processed" container. "overwrite" mode replaces any existing output, so
# the cell can be re-run safely.
final_df.write.parquet(
    "/mnt/formula1dl/processed/lap_times",
    mode="overwrite",
)

In [None]:
# List the objects found in the processed "lap_times" directory.
# NOTE(review): notebook magics are usually expected on the first line of a
# cell -- confirm this cell still runs with a comment above %fs.
%fs
ls /mnt/formula1dl/processed/lap_times