In [0]:
#READ flight (parquet)
df1 = spark.read.parquet("dbfs:/FileStore/shared_uploads/tacnampt@gmail.com/raw_flight_data_snappy.parquet")
df1.printSchema()
df1.show(2)

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        30|        4|     UA|          13930|        10721|      -3|      -7|
|        30|        4|     UA|          11618|        12892|      -1|     -28|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 2 rows



In [0]:
df1.count()

Out[11]: 2719418

In [0]:
# Quitar de la tabla "flights" los registros que tengan valores nulos o faltantes
# en los campos "ArrDelay" y "DepDelay".
df1.filter(df1.DepDelay.isNull() | df1.ArrDelay.isNull()).count()

Out[12]: 29033

In [0]:
df1.filter("DepDelay IS NULL OR ArrDelay IS NULL").count()

Out[13]: 29033

In [0]:
df1 = df1.dropna(how="any", subset=["DepDelay", "ArrDelay"])
df1.count()

Out[15]: 2690385

In [0]:
#READ airports (csv)
df2 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/tacnampt@gmail.com/airports.csv")
df2.printSchema()
df2.show(2)

root
 |-- airport_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
+----------+-----------+-----+--------------------+
only showing top 2 rows



In [0]:
df1.createOrReplaceTempView("flights")
df2.createOrReplaceTempView("airports")

In [0]:
16) En la carpeta "data_cp_flights" hay un archivo Parquet, comprimido con Snappy, en adelante, la tabla "flights". ¿Qué cantidad de columnas posee? *
Nota: Para la realización de los puntos 16 al 20, se puede utilizar el entorno visto en clase ubicado en el repositorio: https://github.com/soyHenry/DS-M4-Cluster_Spark
Quitar de la tabla "flights" los registros que tengan valores nulos o faltantes en los campos "ArrDelay" y "DepDelay". Luego contestar:
*
1-5
2-7
3-12

RPTA : 2-7

In [0]:
df1.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        30|        4|     UA|          13930|        10721|      -3|      -7|
|        30|        4|     UA|          11618|        12892|      -1|     -28|
|        30|        4|     UA|          12892|        14771|      89|      84|
|        30|        4|     UA|          13930|        15304|      -2|      -2|
|        30|        4|     UA|          11618|        12266|       7|      23|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [0]:
17) ¿Cuál es la tupla de aeropuertos, con mayor cantidad de vuelos entre sí?
 *
Nota: Es posible tomar el nombre del aeropuerto desde el archivo "airports.csv", donde "airport_id" se puede relacionar con "OriginAirportID" y "DestAirportID" de la tabla "flights"
Si consideramos que cuando el campo ArrDelay es mayor a 25, el vuelo está demorado, contestar:
1-Honolulu International - Kahului Airport
2-San Francisco International - Los Angeles International
3-Los Angeles International - McCarran International

RPTA: 2-San Francisco International - Los Angeles International (9367)

In [0]:
#USANDO SPARK SQL (HIVE)

spark.sql(""" 
SELECT a1.name as origen, a2.name as destino, COUNT(*) as cantidad
FROM flights f
INNER JOIN airports a1 ON f.OriginAirportID = a1.airport_id
INNER JOIN airports a2 ON f.DestAirportID = a2.airport_id
GROUP BY a1.name, a2.name
ORDER BY 3 DESC
LIMIT 5
           """).show(truncate=False)

+---------------------------+---------------------------+--------+
|origen                     |destino                    |cantidad|
+---------------------------+---------------------------+--------+
|San Francisco International|Los Angeles International  |9367    |
|Los Angeles International  |San Francisco International|9306    |
|Kahului Airport            |Honolulu International     |6891    |
|Los Angeles International  |McCarran International     |6861    |
|Honolulu International     |Kahului Airport            |6856    |
+---------------------------+---------------------------+--------+



In [0]:
#USANDO PYSPARK
from pyspark.sql.functions import sum,avg,max,count
from pyspark.sql.functions import col

df1.alias("f") \
    .join(df2.alias("a1"), col("f.OriginAirportID") == col("a1.airport_id"), "inner") \
    .join(df2.alias("a2"), col("f.DestAirportID") == col("a2.airport_id"), "inner") \
    .select(col("a1.name").alias("origen"), col("a2.name").alias("destino")) \
    .groupBy(col("origen"), col("destino")) \
    .agg(count("*").alias("cantidad")) \
    .orderBy(col("cantidad"), ascending = False) \
    .show(5, truncate=False)

+---------------------------+---------------------------+--------+
|origen                     |destino                    |cantidad|
+---------------------------+---------------------------+--------+
|San Francisco International|Los Angeles International  |9367    |
|Los Angeles International  |San Francisco International|9306    |
|Kahului Airport            |Honolulu International     |6891    |
|Los Angeles International  |McCarran International     |6861    |
|Honolulu International     |Kahului Airport            |6856    |
+---------------------------+---------------------------+--------+
only showing top 5 rows



In [0]:
18) ¿Cuál es la cantidad de vuelos demorados?
Si consideramos que cuando el campo ArrDelay es mayor a 25, el vuelo está demorado, contestar:
 *
1-380502
2-1255037
3-2309883

RPTA : 1-380502

In [0]:
spark.sql(""" 
SELECT COUNT(*)
FROM flights f
WHERE f.ArrDelay > 25
           """).show(truncate=False)

+--------+
|count(1)|
+--------+
|380502  |
+--------+



In [0]:
19) ¿Cuál es la tupla de aeropuertos, con mayor cantidad de vuelos demorados entre sí?
*
Nota: Es posible tomar el nombre del aeropuerto desde el archivo "airports.csv", donde "airport_id" se puede relacionar con "OriginAirportID" y "DestAirportID" de la tabla "flights"
Si consideramos que cuando el campo ArrDelay es mayor a 25, el vuelo está demorado, contestar:

1-Honolulu International - Kahului Airport
2-San Francisco International - Los Angeles International
3-Chicago O'Hare International - San Francisco International

RPTA: 2-San Francisco International - Los Angeles International (1792)
Debe ser (Los Angeles International - San Francisco International - 2375)


[0;36m  File [0;32m"<command-1343562696852966>"[0;36m, line [0;32m2[0m
[0;31m    *[0m
[0m     ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:
spark.sql(""" 
SELECT a1.name as origen, a2.name as destino, COUNT(*) as cantidad
FROM flights f
INNER JOIN airports a1 ON f.OriginAirportID = a1.airport_id
INNER JOIN airports a2 ON f.DestAirportID = a2.airport_id
WHERE f.ArrDelay > 25
GROUP BY a1.name, a2.name
ORDER BY 3 DESC
LIMIT 10
           """).show(truncate=False)

+----------------------------------------+----------------------------------------+--------+
|origen                                  |destino                                 |cantidad|
+----------------------------------------+----------------------------------------+--------+
|Los Angeles International               |San Francisco International             |2375    |
|San Francisco International             |Los Angeles International               |1792    |
|McCarran International                  |San Francisco International             |1375    |
|LaGuardia                               |Hartsfield-Jackson Atlanta International|1204    |
|Chicago O'Hare International            |LaGuardia                               |1184    |
|Chicago O'Hare International            |San Francisco International             |1173    |
|Hartsfield-Jackson Atlanta International|LaGuardia                               |1146    |
|San Diego International                 |San Francisco International 

In [0]:
20) ¿Cuál es el aeropuerto con el mayor promedio de demora en arribos? (ArrDelay)
*
1-Washington Dulles International
2-Nashville International
3-Chicago Midway International

RPTA : 3-Chicago Midway International (promedio = 11.78)

[0;36m  File [0;32m"<command-2427705608286255>"[0;36m, line [0;32m1[0m
[0;31m    20) ¿Cuál es el aeropuerto con el mayor promedio de demora en arribos? (ArrDelay)[0m
[0m      ^[0m
[0;31mSyntaxError[0m[0;31m:[0m unmatched ')'


In [0]:
spark.sql(""" 
SELECT a1.name as origen, AVG(f.ArrDelay) as promedio
FROM flights f
INNER JOIN airports a1 ON f.OriginAirportID = a1.airport_id
GROUP BY a1.name
ORDER BY 2 DESC
LIMIT 10
           """).show(truncate=False)

+----------------------------------------------------+------------------+
|origen                                              |promedio          |
+----------------------------------------------------+------------------+
|Chicago Midway International                        |11.789408049560512|
|Chicago O'Hare International                        |11.747348551501288|
|Denver International                                |11.447812107096482|
|Dallas/Fort Worth International                     |10.255666241939844|
|Baltimore/Washington International Thurgood Marshall|9.627814158949123 |
|Dallas Love Field                                   |9.16053407855923  |
|William P Hobby                                     |9.00740052319978  |
|Palm Beach International                            |8.92683652225637  |
|San Francisco International                         |8.788737644597292 |
|Washington Dulles International                     |8.496601402767622 |
+-------------------------------------