### **Paso 5.2.8 - Ingesta del directorio "qualifying" para el directorio "2021-03-21"**

Los *widgets* de Databricks (`dbutils.widgets`) nos permiten crear parámetros e indicar su valor en tiempo de ejecución

<center><img src="https://i.postimg.cc/4NhmHcbQ/db152.png"></center>

In [None]:
# Create the text widget "p_data_source" (empty default value) and read its
# current value, so the data source name can be supplied at run time.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [None]:
# Display the value received through the "p_data_source" widget
v_data_source

Out[3]: 'Eargast'

In [None]:
# Create the text widget "p_file_date" (default "2021-03-21") and read its
# current value; it is used below to pick the dated raw folder to ingest.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [None]:
# Display the value received through the "p_file_date" widget
v_file_date

Out[5]: '2021-03-21'

In [None]:
%run "../includes/configuration"

In [None]:
%run "../includes/common_functions"

#### Paso 1 - Leer el directorio **qualifying**, el cual contiene múltiples archivos Multi Line JSON

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [None]:
# Explicit schema applied when reading the qualifying JSON files:
# six integer columns (only qualifyId is declared non-nullable) and
# q1, q2, q3 read as strings.
qualifying_schema = StructType(
    fields=[
        StructField("qualifyId", IntegerType(), False),
        StructField("raceId", IntegerType(), True),
        StructField("driverId", IntegerType(), True),
        StructField("constructorId", IntegerType(), True),
        StructField("number", IntegerType(), True),
        StructField("position", IntegerType(), True),
        StructField("q1", StringType(), True),
        StructField("q2", StringType(), True),
        StructField("q3", StringType(), True),
    ]
)

In [None]:
# "raw_folder_path" is expected to come from the "configuration" notebook (loaded via %run).
# "v_file_date" is read from the "p_file_date" widget, so its value is supplied at run time.
# With the default file date the path resolves to: /mnt/formula1dl/raw/2021-03-21/qualifying
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)  # each JSON document spans several lines
    .json(f"{raw_folder_path}/{v_file_date}/qualifying")
)

In [None]:
# Preview the rows that were read, without truncating column values
qualifying_df.show(truncate=False)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|q1      |q2      |q3      |
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|1        |18    |1       |1            |22    |1       |1:26.572|1:25.187|1:26.714|
|2        |18    |9       |2            |4     |2       |1:26.103|1:25.315|1:26.869|
|3        |18    |5       |1            |23    |3       |1:25.664|1:25.452|1:27.079|
|4        |18    |13      |6            |2     |4       |1:25.994|1:25.691|1:27.178|
|5        |18    |2       |2            |3     |5       |1:25.960|1:25.518|1:27.236|
|6        |18    |15      |7            |11    |6       |1:26.427|1:26.101|1:28.527|
|7        |18    |3       |3            |7     |7       |1:26.295|1:26.059|1:28.687|
|8        |18    |14      |9            |9     |8       |1:26.381|1:26.063|1:29.041|
|9        |18    |10      |7            |12    |9       |1:26.919

#### Paso 2 - Renombrar columnas y añadir nuevas columnas
1. Renombrar qualifyId, driverId, constructorId y raceId
2. Añadir ingestion_date con current timestamp
3. Añadir data_source y file_date con los valores recibidos por parámetro

In [None]:
from pyspark.sql.functions import lit

In [None]:
# add_ingestion_date() is expected to come from the "common_functions" notebook (loaded via %run).
# The preview that follows shows the resulting "ingestion_date" timestamp column.
qualifying_with_ingestion_date_df = add_ingestion_date(qualifying_df)

In [None]:
# Preview the DataFrame after adding the ingestion date, without truncating values
qualifying_with_ingestion_date_df.show(truncate=False)

+---------+------+--------+-------------+------+--------+--------+--------+--------+-----------------------+
|qualifyId|raceId|driverId|constructorId|number|position|q1      |q2      |q3      |ingestion_date         |
+---------+------+--------+-------------+------+--------+--------+--------+--------+-----------------------+
|1        |18    |1       |1            |22    |1       |1:26.572|1:25.187|1:26.714|2023-06-15 17:43:10.249|
|2        |18    |9       |2            |4     |2       |1:26.103|1:25.315|1:26.869|2023-06-15 17:43:10.249|
|3        |18    |5       |1            |23    |3       |1:25.664|1:25.452|1:27.079|2023-06-15 17:43:10.249|
|4        |18    |13      |6            |2     |4       |1:25.994|1:25.691|1:27.178|2023-06-15 17:43:10.249|
|5        |18    |2       |2            |3     |5       |1:25.960|1:25.518|1:27.236|2023-06-15 17:43:10.249|
|6        |18    |15      |7            |11    |6       |1:26.427|1:26.101|1:28.527|2023-06-15 17:43:10.249|
|7        |18    |3

In [None]:
# Rename the camelCase identifier columns to snake_case and tag every row with
# the data source and file date received through the notebook widgets.
final_df = (
    qualifying_with_ingestion_date_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [None]:
# Preview the renamed and enriched DataFrame, without truncating values
final_df.show(truncate=False)

+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+-----------+----------+
|qualify_id|race_id|driver_id|constructor_id|number|position|q1      |q2      |q3      |ingestion_date         |data_source|file_date |
+----------+-------+---------+--------------+------+--------+--------+--------+--------+-----------------------+-----------+----------+
|1         |18     |1        |1             |22    |1       |1:26.572|1:25.187|1:26.714|2023-06-15 17:43:10.968|Eargast    |2021-03-21|
|2         |18     |9        |2             |4     |2       |1:26.103|1:25.315|1:26.869|2023-06-15 17:43:10.968|Eargast    |2021-03-21|
|3         |18     |5        |1             |23    |3       |1:25.664|1:25.452|1:27.079|2023-06-15 17:43:10.968|Eargast    |2021-03-21|
|4         |18     |13       |6             |2     |4       |1:25.994|1:25.691|1:27.178|2023-06-15 17:43:10.968|Eargast    |2021-03-21|
|5         |18     |2        |2             |3  

In [None]:
# Print the column names, types and nullability of the final DataFrame
final_df.printSchema()

root
 |-- qualify_id: integer (nullable = true)
 |-- race_id: integer (nullable = true)
 |-- driver_id: integer (nullable = true)
 |-- constructor_id: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- q1: string (nullable = true)
 |-- q2: string (nullable = true)
 |-- q3: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)
 |-- data_source: string (nullable = false)
 |-- file_date: string (nullable = false)



#### Paso 3 - Escribir datos en el datalake como parquet y crear la tabla **qualifying** en la base de datos **f1_processed**

In [None]:
# Bring the distinct race_id values to the driver as a list of Row objects
final_df.select('race_id').distinct().collect()

Out[18]: [Row(race_id=243),
 Row(race_id=858),
 Row(race_id=31),
 Row(race_id=85),
 Row(race_id=251),
 Row(race_id=65),
 Row(race_id=53),
 Row(race_id=255),
 Row(race_id=78),
 Row(race_id=857),
 Row(race_id=108),
 Row(race_id=34),
 Row(race_id=193),
 Row(race_id=211),
 Row(race_id=115),
 Row(race_id=101),
 Row(race_id=81),
 Row(race_id=847),
 Row(race_id=28),
 Row(race_id=210),
 Row(race_id=842),
 Row(race_id=76),
 Row(race_id=26),
 Row(race_id=27),
 Row(race_id=44),
 Row(race_id=192),
 Row(race_id=271),
 Row(race_id=844),
 Row(race_id=253),
 Row(race_id=103),
 Row(race_id=12),
 Row(race_id=350),
 Row(race_id=860),
 Row(race_id=91),
 Row(race_id=22),
 Row(race_id=209),
 Row(race_id=230),
 Row(race_id=122),
 Row(race_id=225),
 Row(race_id=93),
 Row(race_id=246),
 Row(race_id=346),
 Row(race_id=855),
 Row(race_id=861),
 Row(race_id=224),
 Row(race_id=111),
 Row(race_id=47),
 Row(race_id=140),
 Row(race_id=177),
 Row(race_id=353),
 Row(race_id=355),
 Row(race_id=259),
 Row(race_id=1),
 Ro

In [None]:
# Collect the distinct race_id values and print each one as a Row object
distinct_race_rows = final_df.select('race_id').distinct().collect()
for race_row in distinct_race_rows:
  print(race_row)

Row(race_id=243)
Row(race_id=858)
Row(race_id=31)
Row(race_id=85)
Row(race_id=251)
Row(race_id=65)
Row(race_id=53)
Row(race_id=255)
Row(race_id=78)
Row(race_id=857)
Row(race_id=108)
Row(race_id=34)
Row(race_id=193)
Row(race_id=211)
Row(race_id=115)
Row(race_id=101)
Row(race_id=81)
Row(race_id=847)
Row(race_id=28)
Row(race_id=210)
Row(race_id=842)
Row(race_id=76)
Row(race_id=26)
Row(race_id=27)
Row(race_id=44)
Row(race_id=192)
Row(race_id=271)
Row(race_id=844)
Row(race_id=253)
Row(race_id=103)
Row(race_id=12)
Row(race_id=350)
Row(race_id=860)
Row(race_id=91)
Row(race_id=22)
Row(race_id=209)
Row(race_id=230)
Row(race_id=122)
Row(race_id=225)
Row(race_id=93)
Row(race_id=246)
Row(race_id=346)
Row(race_id=855)
Row(race_id=861)
Row(race_id=224)
Row(race_id=111)
Row(race_id=47)
Row(race_id=140)
Row(race_id=177)
Row(race_id=353)
Row(race_id=355)
Row(race_id=259)
Row(race_id=1)
Row(race_id=52)
Row(race_id=212)
Row(race_id=13)
Row(race_id=348)
Row(race_id=86)
Row(race_id=6)
Row(race_id=16)
Row(r

In [None]:
# Same iteration as before, but printing only the race_id value held by each Row
for race_row in final_df.select('race_id').distinct().collect():
  print(race_row["race_id"])

243
858
31
85
251
65
53
255
78
857
108
34
193
211
115
101
81
847
28
210
842
76
26
27
44
192
271
844
253
103
12
350
860
91
22
209
230
122
225
93
246
346
855
861
224
111
47
140
177
353
355
259
1
52
212
13
348
86
6
16
852
3
20
40
340
250
94
57
339
54
120
96
846
48
266
191
268
5
258
19
92
257
64
227
117
41
347
43
15
845
262
269
112
849
843
854
207
37
61
88
263
107
9
17
72
175
850
35
229
173
114
55
59
4
241
8
100
161
23
39
49
176
7
244
84
856
247
87
343
51
69
97
264
63
77
10
102
848
50
267
45
216
38
261
82
249
80
240
25
245
73
113
24
160
70
62
125
121
156
351
95
29
226
21
214
338
352
256
98
345
32
60
90
75
203
265
341
859
200
272
56
109
254
213
228
354
105
58
33
11
83
110
68
106
71
116
14
344
349
342
123
158
199
242
270
248
42
337
119
79
201
2
118
30
99
66
46
67
215
252
851
18
74
104
841
36
208
89
897
1025
879
883
898
970
853
918
1005
1016
961
974
876
976
950
874
939
1031
1030
1034
988
914
1019
908
916
973
984
1046
926
875
995
882
906
955
912
896
1008
992
1047
887
972
998
867
881
980
1021
8

In [None]:
# Number of distinct race_id values in this file date, i.e. the number of
# partitions this load writes. count() does the counting on the cluster,
# instead of collecting every Row to the driver and incrementing a Python
# counter in a loop.
n = final_df.select('race_id').distinct().count()
print(n)

404


1. Si no hubiésemos utilizado la sentencia **IF** y la tabla **f1_processed.qualifying** no hubiese existido, la función **insertInto** nos hubiese devuelto un **ERROR**. Es por eso que utilizamos el **IF**
2. **spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")** nos permite utilizar la función **insertInto** en modo **overwrite** y así SOLO SOBRESCRIBIR las particiones que sean coincidentes. Tengo un ejemplo en mi material de **Apache Spark con Python**
3. Por lo tanto, si la tabla ya existe, solo sobrescribirá las particiones coincidentes
4. Y si no existe la tabla, la creará
5. Como estamos ingestando el directorio de archivos JSON **qualifying** del directorio **2021-03-21**, existen 404 particiones (una por cada **race_id** distinto)
6. Con este método, podemos EJECUTAR EL NOTEBOOK CUANTAS VECES QUERAMOS Y NO SE DUPLICARÁN LOS DATOS

In [None]:
# Switch partition overwrite to "dynamic" so that an overwrite-mode insertInto
# replaces only the partitions present in the incoming data.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

In [None]:
# Reorder the columns so that race_id (the partition column) comes last.
# NOTE(review): presumably required because insertInto matches columns by
# position rather than by name — confirm against the target table layout.
final_df = final_df.select(
    "qualify_id",
    "driver_id",
    "constructor_id",
    "number",
    "position",
    "q1",
    "q2",
    "q3",
    "ingestion_date",
    "data_source",
    "file_date",
    "race_id",
)

In [None]:
# Incremental load into the partitioned table f1_processed.qualifying.
# The existence check uses the public Catalog API (spark.catalog.tableExists,
# available from PySpark 3.3) instead of the private JVM handle
# spark._jsparkSession, which is not part of the supported Python API.
if spark.catalog.tableExists("f1_processed.qualifying"):
  # Table already exists: insert in overwrite mode. Which partitions are replaced
  # is governed by spark.sql.sources.partitionOverwriteMode, set to "dynamic"
  # earlier in this notebook.
  final_df.write.mode('overwrite').insertInto("f1_processed.qualifying")
else:
  # First load: create the table as parquet, partitioned by race_id.
  final_df.write.partitionBy('race_id').format('parquet').saveAsTable("f1_processed.qualifying")

In [None]:
# Read the parquet files back from the processed folder to check what was written.
# NOTE(review): the path is hard-coded; presumably it is the storage location of
# f1_processed.qualifying — confirm against the database configuration.
spark.read.parquet("/mnt/formula1dl/processed/qualifying").show(truncate=False)

+----------+---------+--------------+------+--------+--------+--------+--------+-----------------------+-----------+----------+-------+
|qualify_id|driver_id|constructor_id|number|position|q1      |q2      |q3      |ingestion_date         |data_source|file_date |race_id|
+----------+---------+--------------+------+--------+--------+--------+--------+-----------------------+-----------+----------+-------+
|3374      |17       |9             |6     |1       |1:51.886|1:48.210|1:49.327|2023-06-15 17:43:16.777|Eargast    |2021-03-21|339    |
|3375      |3        |131           |4     |2       |1:52.560|1:47.417|1:50.673|2023-06-15 17:43:16.777|Eargast    |2021-03-21|339    |
|3376      |20       |9             |5     |3       |1:47.632|1:46.828|1:50.789|2023-06-15 17:43:16.777|Eargast    |2021-03-21|339    |
|3377      |16       |10            |14    |4       |1:49.479|1:47.085|1:50.914|2023-06-15 17:43:16.777|Eargast    |2021-03-21|339    |
|3378      |807      |3             |10    |5   

In [None]:
%sql
-- Preview the rows stored in the f1_processed.qualifying table
SELECT * FROM f1_processed.qualifying;

qualify_id,driver_id,constructor_id,number,position,q1,q2,q3,ingestion_date,data_source,file_date,race_id
3374,17,9,6,1,1:51.886,1:48.210,1:49.327,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3375,3,131,4,2,1:52.560,1:47.417,1:50.673,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3376,20,9,5,3,1:47.632,1:46.828,1:50.789,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3377,16,10,14,4,1:49.479,1:47.085,1:50.914,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3378,807,3,10,5,1:49.664,1:47.346,1:51.001,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3379,9,4,11,6,1:46.283,1:46.951,1:51.051,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3380,22,3,9,7,1:50.301,1:48.371,1:51.511,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3381,30,131,3,8,1:52.239,1:48.400,1:51.717,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3382,155,15,23,9,1:48.467,1:47.792,1:51.767,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339
3383,24,10,15,10,1:49.922,1:48.238,1:52.254,2023-06-15T17:43:16.777+0000,Eargast,2021-03-21,339


In [None]:
%sql
-- Row count per race_id, highest race_id first: the data reaches up to race_id = 1047
SELECT race_id, COUNT(1)
FROM f1_processed.qualifying
GROUP BY race_id
ORDER BY race_id DESC

race_id,count(1)
1047,20
1046,20
1045,20
1044,20
1043,20
1042,20
1041,20
1040,20
1039,20
1038,20


In [None]:
%sql
-- Show the table's columns and data types together with its extended metadata
DESCRIBE FORMATTED f1_processed.qualifying;

col_name,data_type,comment
qualify_id,int,
driver_id,int,
constructor_id,int,
number,int,
position,int,
q1,string,
q2,string,
q3,string,
ingestion_date,timestamp,
data_source,string,


<center><img src="https://i.postimg.cc/mr6g0vT8/db120.png"></center>

In [None]:
# End the notebook run and hand "Success" back as its exit value
dbutils.notebook.exit("Success")

Success