### **Paso 2.1 - Ingesta del archivo "circuits.csv"**

In [None]:
# Notebook parameter "p_data_source": a text widget defaulting to an empty
# string; its current value is kept in v_data_source.
data_source_widget = "p_data_source"
dbutils.widgets.text(data_source_widget, "")
v_data_source = dbutils.widgets.get(data_source_widget)

In [None]:
# Notebook parameter "p_file_date": a text widget defaulting to "2021-03-21";
# its current value is kept in v_file_date.
file_date_widget = "p_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(file_date_widget)

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el archivo CSV

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [None]:
# Explicit schema for circuits.csv as (column name, Spark type, nullable) triples.
# circuitId is declared non-nullable here, yet the schema printed after loading
# the CSV reports it as nullable = true.
circuit_column_specs = [
    ("circuitId", IntegerType(), False),
    ("circuitRef", StringType(), True),
    ("name", StringType(), True),
    ("location", StringType(), True),
    ("country", StringType(), True),
    ("lat", DoubleType(), True),
    ("lng", DoubleType(), True),
    ("alt", IntegerType(), True),
    ("url", StringType(), True),
]

circuits_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in circuit_column_specs
    ]
)

In [None]:
# Load circuits.csv with the explicit schema; the first line of the file is a header.
# Disabled alternative that builds the path from the configuration and the
# p_file_date widget:
#   f"{raw_folder_path}/{v_file_date}/circuits.csv"
circuits_df = spark.read.csv(
    "dbfs:/mnt/formula1dl/raw/circuits.csv",
    schema=circuits_schema,
    header=True,
)

In [None]:
# Preview the raw rows (default 20) without truncating long values such as the URL.
circuits_df.show(n=20, truncate=False)

+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|circuitId|circuitRef    |name                          |location    |country  |lat     |lng      |alt|url                                                              |
+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|1        |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit        |
|2        |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18 |http://en.wikipedia.org/wiki/Sepang_International_Circuit        |
|3        |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7  |http://en.wikipedia.org/wiki/Bahrain_Internatio

In [None]:
# Print the column names, types and nullability of the loaded DataFrame.
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



#### Paso 2 - Seleccionar sólo las columnas necesarias

In [None]:
from pyspark.sql.functions import col

In [None]:
# Keep every source column except "url".
kept_columns = [
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
]
circuits_selected_df = circuits_df.select(*[col(column_name) for column_name in kept_columns])

In [None]:
# Preview the selected columns (default 20 rows, values not truncated).
circuits_selected_df.show(n=20, truncate=False)

+---------+--------------+------------------------------+------------+---------+--------+---------+---+
|circuitId|circuitRef    |name                          |location    |country  |lat     |lng      |alt|
+---------+--------------+------------------------------+------------+---------+--------+---------+---+
|1        |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |
|2        |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18 |
|3        |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7  |
|4        |catalunya     |Circuit de Barcelona-Catalunya|Montmeló    |Spain    |41.57   |2.26111  |109|
|5        |istanbul      |Istanbul Park                 |Istanbul    |Turkey   |40.9517 |29.405   |130|
|6        |monaco        |Circuit de Monaco             |Monte-Carlo |Monaco   |43.7347 |7.42056  |7  |
|7        |villeneuve    |Circuit Gilles Villeneuve     |Montrea

#### Paso 3 - Cambiar el nombre de las columnas

In [None]:
from pyspark.sql.functions import lit

In [None]:
# Rename the camelCase / abbreviated source columns to snake_case names.
# The chain is wrapped in parentheses rather than using trailing backslashes,
# so the disabled steps below can stay as comments without a dangling
# line-continuation feeding into them.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    # Disabled: tag every row with the notebook parameters.
    # .withColumn("data_source", lit(v_data_source))
    # .withColumn("file_date", lit(v_file_date))
)

In [None]:
# Preview the renamed columns (default 20 rows, values not truncated).
circuits_renamed_df.show(n=20, truncate=False)

+----------+--------------+------------------------------+------------+---------+--------+---------+--------+
|circuit_id|circuit_ref   |name                          |location    |country  |latitude|longitude|altitude|
+----------+--------------+------------------------------+------------+---------+--------+---------+--------+
|1         |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10      |
|2         |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18      |
|3         |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7       |
|4         |catalunya     |Circuit de Barcelona-Catalunya|Montmeló    |Spain    |41.57   |2.26111  |109     |
|5         |istanbul      |Istanbul Park                 |Istanbul    |Turkey   |40.9517 |29.405   |130     |
|6         |monaco        |Circuit de Monaco             |Monte-Carlo |Monaco   |43.7347 |7.42056  |7       |
|7        

#### Paso 4 - Añadir la fecha de ingesta al DataFrame

In [None]:
from pyspark.sql.functions import current_timestamp

In [None]:
# Option 1: add the "ingestion_date" column directly from current_timestamp().
ingestion_timestamp = current_timestamp()
circuits_final_df = circuits_renamed_df.withColumn("ingestion_date", ingestion_timestamp)

In [None]:
# Preview the final DataFrame, including ingestion_date (default 20 rows, not truncated).
circuits_final_df.show(n=20, truncate=False)

+----------+--------------+------------------------------+------------+---------+--------+---------+--------+-----------------------+
|circuit_id|circuit_ref   |name                          |location    |country  |latitude|longitude|altitude|ingestion_date         |
+----------+--------------+------------------------------+------------+---------+--------+---------+--------+-----------------------+
|1         |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10      |2023-06-09 20:36:54.395|
|2         |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18      |2023-06-09 20:36:54.395|
|3         |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7       |2023-06-09 20:36:54.395|
|4         |catalunya     |Circuit de Barcelona-Catalunya|Montmeló    |Spain    |41.57   |2.26111  |109     |2023-06-09 20:36:54.395|
|5         |istanbul      |Istanbul Park                 |Ista

In [None]:
# Option 2 (disabled): build circuits_final_df through add_ingestion_date instead
# (presumably defined in ../includes/common_functions; verify before enabling).
# circuits_final_df = add_ingestion_date(circuits_renamed_df)

# With option 2 disabled, this displays the DataFrame produced by option 1.
circuits_final_df.show(n=20, truncate=False)

#### Paso 5 - Escribir datos en el contenedor **processed** del ADLS como **parquet**

In [None]:
# Write the DataFrame as Parquet into the "circuits" directory of the
# "processed" container.
# mode("overwrite") replaces any previous output; without it the writer uses
# its default "error if exists" mode and the cell fails on every re-run once
# the directory has been created.
circuits_final_df.write.mode("overwrite").parquet("/mnt/formula1dl/processed/circuits")

In [None]:
# List the objects stored in the "circuits" directory. This is the Python
# equivalent of `%fs ls`, used so that the cell does not depend on a magic
# command appearing after a comment line.
display(dbutils.fs.ls("/mnt/formula1dl/processed/circuits"))

In [None]:
# Save the same DataFrame as the Delta table "circuits" in the "f1_processed"
# database, replacing the table contents on each run.
# NOTE(review): if f1_processed is located under /mnt/formula1dl/processed, this
# table would share a directory with the Parquet files written earlier; confirm.
(
    circuits_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("f1_processed.circuits")
)

In [None]:
%sql
-- Read back every row of the f1_processed.circuits table to check the write.
SELECT *
FROM f1_processed.circuits;

In [None]:
# End the notebook run, passing "Success" as its exit value.
dbutils.notebook.exit("Success")