In [1]:
import findspark
# NOTE(review): presumably this makes the local Spark installation importable,
# which is why it is called before the pyspark imports below — confirm against
# the findspark documentation.
findspark.init()

In [2]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, IntegerType, StringType, FloatType


In [3]:
# Create (or reuse) the Spark session first, then take the context from it.
#
# Why this order: calling a bare SparkContext() raises if a context already
# exists, so re-running this cell used to fail, and because the context was
# created before the builder ran, the appName configured below was never
# applied to it. getOrCreate() is safe to call repeatedly.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    # NOTE(review): looks like a placeholder option with no effect — confirm
    # and remove if nothing depends on it.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# The Spark context that backs the session above.
sc = spark.sparkContext

In [4]:
# Load each pre-processed dataset from parquet and print its schema
# (airports, then planes, then flights).
df_airports_proc = spark.read.format("parquet").load("transformation_proc/airports_proc.parquet")
df_airports_proc.printSchema()

df_planes_proc = spark.read.format("parquet").load("transformation_proc/planes_proc.parquet")
df_planes_proc.printSchema()

df_flights_proc = spark.read.format("parquet").load("transformation_proc/flights_proc.parquet")
df_flights_proc.printSchema()

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lon: float (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: float (nullable = true)
 |-- dst: string (nullable = true)
 |-- region: string (nullable = true)
 |-- type: string (nullable = true)
 |-- military: boolean (nullable = true)
 |-- administration: string (nullable = true)

root
 |-- tailnum: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engines: integer (nullable = true)
 |-- seats: integer (nullable = true)
 |-- speed: long (nullable = true)
 |-- engine: string (nullable = true)
 |-- tailchar: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- engine_type: string (nullable = true)

root
 |-- dep_time: string (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: string (nullable = 

## Pergunta 1

In [5]:
def _suffix_columns(df, suffix):
    """Return `df` with `suffix` appended to every column name.

    Columns that already end with `suffix` are left unchanged, so applying
    the function twice gives the same result as applying it once. This keeps
    the cell re-runnable even though two of the frames below are rebound to
    their suffixed versions.
    """
    return df.select([
        F.col(name).alias(name if name.endswith(suffix) else name + suffix)
        for name in df.columns
    ])


# Tag every column with its source table so the joined frame has no name
# clashes. The airports table is used twice: once for the origin airport and
# once for the destination airport.
df_airports_origin_proc = _suffix_columns(df_airports_proc, "_airports_origin")
df_airports_dest_proc = _suffix_columns(df_airports_proc, "_airports_dest")
df_planes_proc = _suffix_columns(df_planes_proc, "_planes")
df_flights_proc = _suffix_columns(df_flights_proc, "_flights")

# Left joins keep every flight even when the airport or plane lookup has no
# match; the looked-up columns are null in that case.
df_proc = df_flights_proc.join(
    df_airports_origin_proc,
    df_airports_origin_proc.faa_airports_origin == df_flights_proc.origin_flights,
    "left",
)

df_proc = df_proc.join(
    df_airports_dest_proc,
    df_airports_dest_proc.faa_airports_dest == df_proc.dest_flights,
    "left",
)

df_proc = df_proc.join(
    df_planes_proc,
    df_planes_proc.tailnum_planes == df_proc.tailnum_flights,
    "left",
)

df_proc.printSchema()


root
 |-- dep_time_flights: string (nullable = true)
 |-- arr_time_flights: string (nullable = true)
 |-- dep_delay_flights: integer (nullable = true)
 |-- arr_delay_flights: integer (nullable = true)
 |-- carrier_flights: string (nullable = true)
 |-- tailnum_flights: string (nullable = true)
 |-- flight_flights: string (nullable = true)
 |-- origin_flights: string (nullable = true)
 |-- dest_flights: string (nullable = true)
 |-- air_time_flights: integer (nullable = true)
 |-- distance_flights: integer (nullable = true)
 |-- dep_datetime_flights: timestamp (nullable = true)
 |-- air_time_projected_flights: integer (nullable = true)
 |-- air_time_expected_flights: integer (nullable = true)
 |-- haul_duration_flights: string (nullable = true)
 |-- dep_season_flights: string (nullable = true)
 |-- dep_delay_category_flights: string (nullable = true)
 |-- faa_airports_origin: string (nullable = true)
 |-- name_airports_origin: string (nullable = true)
 |-- lat_airports_origin: float (nu

Checando o número de linhas e de colunas

In [161]:
# Row count on the first line, column count on the second.
print(df_proc.count(), len(df_proc.columns), sep="\n")


10000
54


## Pergunta 2 

In [133]:
# (airport code, region) pairs seen as a destination and as an origin.
df_airports_region_dest_proc = df_proc.select(
    F.col('faa_airports_dest'), F.col('region_airports_dest'))
df_airports_region_origin_proc = df_proc.select(
    F.col('faa_airports_origin'), F.col('region_airports_origin'))

# Stack both sets (the union keeps the destination column names) and keep
# each pair once.
df_count_airports_region_proc = (
    df_airports_region_dest_proc
    .union(df_airports_region_origin_proc)
    .distinct()
)

# Number of distinct airports per region.
(df_count_airports_region_proc
 .groupBy('region_airports_dest')
 .count()
 .show())

+--------------------+-----+
|region_airports_dest|count|
+--------------------+-----+
|              ALASKA|    9|
|       MAINLAND-EAST|   24|
|       MAINLAND-WEST|   36|
+--------------------+-----+



## Pergunta 3

In [134]:
# Flight with the largest absolute altitude difference between its origin and
# destination airports.
#
# The helper column lives on a separate frame so the shared df_proc is never
# rebound here; the earlier version added and then dropped the column on
# df_proc itself, which left df_proc altered if the cell stopped halfway.
df_alt_diff = df_proc.withColumn(
    'alt_diff',
    F.abs(F.col('alt_airports_origin') - F.col('alt_airports_dest')),
)

(df_alt_diff
 .select('tailnum_flights', 'alt_diff', 'alt_airports_origin', 'alt_airports_dest')
 .orderBy(F.col('alt_diff').desc())
 .show(1))

+---------------+--------+-------------------+-----------------+
|tailnum_flights|alt_diff|alt_airports_origin|alt_airports_dest|
+---------------+--------+-------------------+-----------------+
|         N224AG|    6169|                433|             6602|
+---------------+--------+-------------------+-----------------+
only showing top 1 row



## Pergunta 4


Atraso médio de chegada e atraso médio de partida

In [139]:
# Mean departure delay over flights that actually left late (delay > 0),
# rounded to the nearest whole number.
df_avg_delay_dep = (
    df_proc
    .where(F.col('dep_delay_flights') > 0)
    .agg(F.round(F.avg('dep_delay_flights')).alias('avg(dep_delay_flights)'))
)

# Same for arrival delays.
df_avg_delay_arr = (
    df_proc
    .where(F.col('arr_delay_flights') > 0)
    .agg(F.round(F.avg('arr_delay_flights')).alias('avg(arr_delay_flights)'))
)

# Arrival figure first, then departure.
df_avg_delay_arr.withColumnRenamed('avg(arr_delay_flights)', 'arr_delay_avg').show()
df_avg_delay_dep.withColumnRenamed('avg(dep_delay_flights)', 'dep_delay_avg').show()

+-------------+
|arr_delay_avg|
+-------------+
|         25.0|
+-------------+

+-------------+
|dep_delay_avg|
+-------------+
|         26.0|
+-------------+



Atraso médio de voo

In [100]:
# Add three helper columns to df_proc (later cells read sum_delay):
#   dep_delay / arr_delay : the raw delay with negative values replaced by 0
#   sum_delay             : dep_delay + arr_delay
df_proc = (
    df_proc
    .withColumn(
        'dep_delay',
        F.when(F.col('dep_delay_flights') < 0, 0).otherwise(F.col('dep_delay_flights')))
    .withColumn(
        'arr_delay',
        F.when(F.col('arr_delay_flights') < 0, 0).otherwise(F.col('arr_delay_flights')))
    .withColumn('sum_delay', F.col('dep_delay') + F.col('arr_delay'))
)

# Mean combined delay over flights with a positive combined delay, rounded.
df_avg_delay = (
    df_proc
    .where(F.col('sum_delay') > 0)
    .agg(F.round(F.avg('sum_delay')).alias('avg(sum_delay)'))
)

df_avg_delay.withColumnRenamed('avg(sum_delay)', 'flight_delay_avg').show()

+----------------+
|flight_delay_avg|
+----------------+
|            38.0|
+----------------+



## Pergunta 5


Média de atraso de chegada para cada região destino e média de atraso de partida para cada região origem

In [103]:
# Rounded mean departure delay per origin region, over late departures only.
df_per_region_origin = (
    df_proc
    .filter(df_proc.dep_delay_flights > 0)
    .groupBy('region_airports_origin')
    .agg(F.round(F.avg('dep_delay_flights')).alias('avg(dep_delay_flights)'))
)

# Rounded mean arrival delay per destination region, over late arrivals only.
df_per_region_dest = (
    df_proc
    .filter(df_proc.arr_delay_flights > 0)
    .groupBy('region_airports_dest')
    .agg(F.round(F.avg('arr_delay_flights')).alias('avg(arr_delay_flights)'))
)

df_per_region_dest.select(
    F.col('region_airports_dest').alias('region_dest'),
    F.col('avg(arr_delay_flights)').alias('avg_arr_delay_flights')).show()
df_per_region_origin.select(
    F.col('region_airports_origin').alias('region_origin'),
    F.col('avg(dep_delay_flights)').alias('avg_dep_delay_flights')).show()

# Side-by-side view keyed on the destination regions. This is a left join, so
# a region that only ever appears as an origin would not be listed.
df_per_region = df_per_region_dest.join(
    df_per_region_origin,
    df_per_region_origin.region_airports_origin ==
    df_per_region_dest.region_airports_dest,
    "left",
)

# The origin region is labelled 'region_origin'. It used to be aliased
# 'region_dest' as well, which gave two identically named columns in the output.
df_per_region.select(
    F.col('region_airports_dest').alias('region_dest'),
    F.col('avg(arr_delay_flights)').alias('avg_arr_delay_flights'),
    F.col('region_airports_origin').alias('region_origin'),
    F.col('avg(dep_delay_flights)').alias('avg_dep_delay_flights')).show()

+-------------+---------------------+
|  region_dest|avg_arr_delay_flights|
+-------------+---------------------+
|       ALASKA|                 22.0|
|MAINLAND-EAST|                 29.0|
|MAINLAND-WEST|                 24.0|
+-------------+---------------------+

+-------------+---------------------+
|region_origin|avg_dep_delay_flights|
+-------------+---------------------+
|MAINLAND-WEST|                 26.0|
+-------------+---------------------+

+-------------+---------------------+-------------+---------------------+
|  region_dest|avg_arr_delay_flights|  region_dest|avg_dep_delay_flights|
+-------------+---------------------+-------------+---------------------+
|       ALASKA|                 22.0|         null|                 null|
|MAINLAND-EAST|                 29.0|         null|                 null|
|MAINLAND-WEST|                 24.0|MAINLAND-WEST|                 26.0|
+-------------+---------------------+-------------+---------------------+



Média de atraso geral de voo para cada região de destino

In [104]:
# Rounded mean combined delay per destination region, over flights whose
# combined delay is positive.
df_per_region = (
    df_proc
    .where(F.col('sum_delay') > 0)
    .groupBy('region_airports_dest')
    .agg(F.round(F.avg('sum_delay')).alias('avg(sum_delay)'))
)

(df_per_region
 .withColumnRenamed('region_airports_dest', 'region_dest')
 .withColumnRenamed('avg(sum_delay)', 'avg_sum_delay')
 .show())

+-------------+-------------+
|  region_dest|avg_sum_delay|
+-------------+-------------+
|       ALASKA|         31.0|
|MAINLAND-EAST|         40.0|
|MAINLAND-WEST|         38.0|
+-------------+-------------+



## Pergunta 6

In [140]:
# Total departure delay per departure year, over late departures only.
# (The variable names are kept from the per-region cells above; the grouping
# key here is the year.)
df_per_region_origin = (
    df_proc
    .where(F.col('dep_delay_flights') > 0)
    .groupBy(F.year('dep_datetime_flights'))
    .sum('dep_delay_flights')
)

# Total arrival delay per departure year, over late arrivals only.
df_per_region_dest = (
    df_proc
    .where(F.col('arr_delay_flights') > 0)
    .groupBy(F.year('dep_datetime_flights'))
    .sum('arr_delay_flights')
)

(df_per_region_dest
 .withColumnRenamed('year(dep_datetime_flights)', 'year')
 .withColumnRenamed('sum(arr_delay_flights)', 'sum_arr_delay_flights')
 .show())

(df_per_region_origin
 .withColumnRenamed('year(dep_datetime_flights)', 'year')
 .withColumnRenamed('sum(dep_delay_flights)', 'sum_dep_delay_flights')
 .show())

+----+---------------------+
|year|sum_arr_delay_flights|
+----+---------------------+
|2014|                91820|
+----+---------------------+

+----+---------------------+
|year|sum_dep_delay_flights|
+----+---------------------+
|2014|                88314|
+----+---------------------+



In [108]:
# Total combined delay per departure year, over flights with a positive
# combined delay.
(df_proc
 .where(F.col('sum_delay') > 0)
 .groupBy(F.year('dep_datetime_flights').alias('year'))
 .agg(F.sum('sum_delay').alias('sum_acc_delay'))
 .show())


+----+-------------+
|year|sum_acc_delay|
+----+-------------+
|2014|       180134|
+----+-------------+



## Pergunta 7

In [113]:
# Total departure delay per (origin region, departure year), late departures only.
df_delay_acc_origin = (
    df_proc
    .filter(df_proc.dep_delay_flights > 0)
    .groupBy(F.col('region_airports_origin'), F.year(F.col('dep_datetime_flights')))
    .sum('dep_delay_flights')
)

# Total arrival delay per (destination region, departure year), late arrivals only.
df_delay_acc_dest = (
    df_proc
    .filter(df_proc.arr_delay_flights > 0)
    .groupBy(F.col('region_airports_dest'), F.year(F.col('dep_datetime_flights')))
    .sum('arr_delay_flights')
)

df_delay_acc_origin.select(
    F.col('region_airports_origin').alias('region_origin'),
    F.col('year(dep_datetime_flights)').alias('year'),
    F.col('sum(dep_delay_flights)').alias('sum_dep_delay_flights')).show()

df_delay_acc_dest.select(
    F.col('region_airports_dest').alias('region_dest'),
    F.col('year(dep_datetime_flights)').alias('year'),
    F.col('sum(arr_delay_flights)').alias('sum_arr_delay_flights')).show()

df_delay_acc_dest = (
    df_delay_acc_dest
    .withColumnRenamed("region_airports_dest", "region")
    .withColumnRenamed("year(dep_datetime_flights)", 'year')
)

# Combine both totals per region AND year. Matching on region alone (as this
# cell did before) pairs every destination-year row with every origin-year row
# of the same region as soon as the data spans more than one year.
df_delay_acc = df_delay_acc_dest.join(
    df_delay_acc_origin,
    (df_delay_acc_dest['region'] == df_delay_acc_origin['region_airports_origin'])
    & (df_delay_acc_dest['year'] == df_delay_acc_origin['year(dep_datetime_flights)']),
    "left",
)

df_delay_acc.select(
    'region',
    'year',
    F.col('sum(arr_delay_flights)').alias('sum_arr_delay_flights'),
    F.col('sum(dep_delay_flights)').alias('sum_dep_delay_flights')).show()

+-------------+----+---------------------+
|region_origin|year|sum_dep_delay_flights|
+-------------+----+---------------------+
|MAINLAND-WEST|2014|                88314|
+-------------+----+---------------------+

+-------------+----+---------------------+
|  region_dest|year|sum_arr_delay_flights|
+-------------+----+---------------------+
|MAINLAND-WEST|2014|                60242|
|       ALASKA|2014|                 8640|
|MAINLAND-EAST|2014|                22938|
+-------------+----+---------------------+

+-------------+----+---------------------+---------------------+
|       region|year|sum_arr_delay_flights|sum_dep_delay_flights|
+-------------+----+---------------------+---------------------+
|       ALASKA|2014|                 8640|                 null|
|MAINLAND-EAST|2014|                22938|                 null|
|MAINLAND-WEST|2014|                60242|                88314|
+-------------+----+---------------------+---------------------+



In [114]:
# Total combined delay per (destination region, departure year), over flights
# with a positive combined delay.
(df_proc
 .where(F.col('sum_delay') > 0)
 .groupBy(F.col('region_airports_dest').alias('region_dest'),
          F.year('dep_datetime_flights').alias('year'))
 .agg(F.sum('sum_delay').alias('sum_acc_delay'))
 .show())

+-------------+----+-------------+
|  region_dest|year|sum_acc_delay|
+-------------+----+-------------+
|MAINLAND-WEST|2014|       119503|
|       ALASKA|2014|        16155|
|MAINLAND-EAST|2014|        44476|
+-------------+----+-------------+



## Pergunta 8


In [115]:
# Rounded mean air time over all flights.
df_avg_air_time = df_proc.agg(
    F.round(F.avg('air_time_flights')).alias('avg(air_time_flights)')
)

(df_avg_air_time
 .withColumnRenamed('avg(air_time_flights)', 'avg_air_time_flights')
 .show())

+--------------------+
|avg_air_time_flights|
+--------------------+
|               153.0|
+--------------------+



## Pergunta 9

In [116]:
# Rounded mean air time per destination region.
df_avg_air_time = (
    df_proc
    .groupBy('region_airports_dest')
    .agg(F.round(F.avg('air_time_flights')).alias('avg(air_time_flights)'))
)

(df_avg_air_time
 .withColumnRenamed('avg(air_time_flights)', 'avg_air_time_flights')
 .show())

+--------------------+--------------------+
|region_airports_dest|avg_air_time_flights|
+--------------------+--------------------+
|              ALASKA|               228.0|
|       MAINLAND-EAST|               237.0|
|       MAINLAND-WEST|               115.0|
+--------------------+--------------------+



## Pergunta 10

In [117]:
# Rounded mean air time per (origin airport, destination airport) route.
df_avg_air_time = (
    df_proc
    .groupBy('faa_airports_origin', 'faa_airports_dest')
    .agg(F.round(F.avg('air_time_flights')).alias('avg(air_time_flights)'))
)

(df_avg_air_time
 .withColumnRenamed('avg(air_time_flights)', 'avg_air_time_flights')
 .show())

+-------------------+-----------------+--------------------+
|faa_airports_origin|faa_airports_dest|avg_air_time_flights|
+-------------------+-----------------+--------------------+
|                SEA|              RNO|                74.0|
|                SEA|              DTW|               220.0|
|                SEA|              CLE|               234.0|
|                SEA|              LAX|               127.0|
|                PDX|              SEA|                35.0|
|                SEA|              BLI|                23.0|
|                PDX|              IAH|               214.0|
|                PDX|              PHX|               130.0|
|                SEA|              SLC|                89.0|
|                SEA|              SBA|               118.0|
|                SEA|              BWI|               270.0|
|                PDX|              IAD|               268.0|
|                PDX|              SFO|                85.0|
|                SEA|   

## Pergunta 11

In [118]:
# Total air time per departure year.
(df_proc
 .groupBy(F.year('dep_datetime_flights').alias('year'))
 .agg(F.sum('air_time_flights').alias('sum_air_time_flights'))
 .show())

+----+--------------------+
|year|sum_air_time_flights|
+----+--------------------+
|2014|             1528625|
+----+--------------------+



## Pergunta 12

In [119]:
# Total air time per destination region.
(df_proc
 .groupBy('region_airports_dest')
 .agg(F.sum('air_time_flights').alias('sum_air_time_flights'))
 .show())

+--------------------+--------------------+
|region_airports_dest|sum_air_time_flights|
+--------------------+--------------------+
|              ALASKA|              230602|
|       MAINLAND-EAST|              508344|
|       MAINLAND-WEST|              789679|
+--------------------+--------------------+



## Pergunta 13

In [120]:
# Mean flight distance over all flights (not rounded).
df_proc.agg(F.avg('distance_flights').alias('avg_distance_flights')).show()

+--------------------+
|avg_distance_flights|
+--------------------+
|           1208.1516|
+--------------------+



## Pergunta 14

In [122]:
# Mean flight distance per destination region (not rounded).
(df_proc
 .groupBy('region_airports_dest')
 .agg(F.avg('distance_flights').alias('avg_distance_flights'))
 .show())

+--------------------+--------------------+
|region_airports_dest|avg_distance_flights|
+--------------------+--------------------+
|              ALASKA|   1741.653162055336|
|       MAINLAND-EAST|  2042.3983208955224|
|       MAINLAND-WEST|   867.9224137931035|
+--------------------+--------------------+



## Pergunta 15

In [123]:
# Mean flight distance per (origin airport, destination airport) route.
(df_proc
 .groupBy('faa_airports_origin', 'faa_airports_dest')
 .agg(F.avg('distance_flights').alias('avg_distance_flights'))
 .show())

+-------------------+-----------------+--------------------+
|faa_airports_origin|faa_airports_dest|avg_distance_flights|
+-------------------+-----------------+--------------------+
|                SEA|              RNO|               564.0|
|                SEA|              DTW|              1927.0|
|                SEA|              CLE|              2021.0|
|                SEA|              LAX|               954.0|
|                PDX|              SEA|               129.0|
|                SEA|              BLI|                93.0|
|                PDX|              IAH|              1825.0|
|                PDX|              PHX|              1009.0|
|                SEA|              SLC|               689.0|
|                SEA|              SBA|               908.0|
|                SEA|              BWI|              2335.0|
|                PDX|              IAD|              2327.0|
|                PDX|              SFO|               550.0|
|                SEA|   

## Pergunta 16



In [124]:
# Total distance flown per departure year.
(df_proc
 .groupBy(F.year('dep_datetime_flights').alias('year'))
 .agg(F.sum('distance_flights').alias('sum_distance_flights'))
 .show())

+----+--------------------+
|year|sum_distance_flights|
+----+--------------------+
|2014|            12081516|
+----+--------------------+



## Pergunta 17

In [125]:
# Total distance flown per destination region.
(df_proc
 .groupBy('region_airports_dest')
 .agg(F.sum('distance_flights').alias('sum_distance_flights'))
 .show())


+--------------------+--------------------+
|region_airports_dest|sum_distance_flights|
+--------------------+--------------------+
|              ALASKA|             1762553|
|       MAINLAND-EAST|             4378902|
|       MAINLAND-WEST|             5940061|
+--------------------+--------------------+



## Pergunta 18

In [126]:
# Mean number of seats per route, rounded up to a whole seat.
df_seats_avg = (
    df_proc
    .groupBy('faa_airports_origin', 'faa_airports_dest')
    .agg(F.ceil(F.avg('seats_planes')).alias('avg(seats_planes)'))
)

df_seats_avg.withColumnRenamed('avg(seats_planes)', 'avg_seats_planes').show()

+-------------------+-----------------+----------------+
|faa_airports_origin|faa_airports_dest|avg_seats_planes|
+-------------------+-----------------+----------------+
|                SEA|              RNO|             142|
|                SEA|              DTW|             213|
|                SEA|              CLE|             182|
|                SEA|              LAX|             155|
|                PDX|              SEA|              65|
|                SEA|              BLI|             164|
|                PDX|              IAH|             183|
|                PDX|              PHX|             196|
|                SEA|              SLC|             166|
|                SEA|              SBA|              80|
|                SEA|              BWI|             152|
|                PDX|              IAD|             188|
|                PDX|              SFO|             139|
|                SEA|              KOA|             171|
|                PDX|          

## Pergunta 19

In [127]:
# Total seats offered per departure year.
(df_proc
 .groupBy(F.year('dep_datetime_flights').alias('year'))
 .agg(F.sum('seats_planes').alias('sum_seats_planes'))
 .show())

+----+----------------+
|year|sum_seats_planes|
+----+----------------+
|2014|         1509544|
+----+----------------+



## Pergunta 20

In [141]:
# Destination airport with the most flights.
(df_proc
 .groupBy('faa_airports_dest')
 .count()
 .sort(F.desc('count'))
 .show(1))

+-----------------+-----+
|faa_airports_dest|count|
+-----------------+-----+
|              SFO|  787|
+-----------------+-----+
only showing top 1 row



## Pergunta 21

In [128]:
# Total seats per destination airport, largest first.
(df_proc
 .groupBy('faa_airports_dest')
 .agg(F.sum('seats_planes').alias('sum_seats_planes'))
 .sort(F.desc('sum_seats_planes'))
 .show())



+-----------------+----------------+
|faa_airports_dest|sum_seats_planes|
+-----------------+----------------+
|              SFO|          119635|
|              PHX|           96317|
|              LAX|           91406|
|              DEN|           88218|
|              LAS|           76354|
|              ANC|           74715|
|              SLC|           64920|
|              ATL|           58940|
|              SJC|           52133|
|              OAK|           48773|
|              ORD|           48466|
|              MSP|           48461|
|              SAN|           42350|
|              SMF|           41681|
|              IAH|           41228|
|              HNL|           37689|
|              SNA|           29448|
|              JFK|           28723|
|              EWR|           26771|
|              DTW|           25884|
+-----------------+----------------+
only showing top 20 rows



## Pergunta 22

In [27]:
# Flights that start or end at PDX, longest distance first.
(df_proc
 .where((F.col('faa_airports_origin') == 'PDX') | (F.col('faa_airports_dest') == 'PDX'))
 .select('faa_airports_origin', 'faa_airports_dest', 'distance_flights')
 .sort(F.desc('distance_flights'))
 .show())

+-------------------+-----------------+----------------+
|faa_airports_origin|faa_airports_dest|distance_flights|
+-------------------+-----------------+----------------+
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              LIH|            2631|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|              KOA|            2607|
|                PDX|          

## Pergunta 23

In [48]:
# The single (month, destination) pair with the most flights.
(df_proc
 .groupBy(F.month('dep_datetime_flights').alias('month'), 'faa_airports_dest')
 .count()
 .sort(F.desc('count'))
 .show(1))

+-----+-----------------+-----+
|month|faa_airports_dest|count|
+-----+-----------------+-----+
|    5|              LAX|   77|
+-----+-----------------+-----+
only showing top 1 row



In [130]:
# Number of flights per (departure month, destination airport), largest first.
df_month_dest = (df_proc
                 .groupBy(F.month(F.col('dep_datetime_flights')).alias('month'), F.col('faa_airports_dest'))
                 .count()
                 .orderBy(F.col('count').desc()))

# Largest per-destination flight count within each month, ordered by month.
df_month = (df_month_dest
 .groupBy('month')
 .max('count')
 .orderBy(F.col('month')))

# Join the monthly maxima back to the per-destination counts to recover which
# destination reached the maximum in each month. Destinations tied for a
# month's maximum each produce their own row.
# NOTE(review): df_month is derived from df_month_dest, so this is a self-join
# and the two `.month` references come from the same lineage; it relies on
# Spark resolving them to the left and right side respectively — confirm
# before restructuring.
df_month.alias('a').join(df_month_dest, 
                        (
                            df_month.month == df_month_dest.month) &
                            (F.col('max(count)') == F.col('count')
                        ),
                       "left").select("a.month",'faa_airports_dest','count').show()

+-----+-----------------+-----+
|month|faa_airports_dest|count|
+-----+-----------------+-----+
|    1|              PHX|   52|
|    2|              LAX|   52|
|    3|              SFO|   62|
|    4|              SFO|   70|
|    5|              LAX|   77|
|    6|              ANC|   61|
|    7|              SFO|   71|
|    8|              SFO|   75|
|    9|              SFO|   69|
|   10|              SFO|   66|
|   11|              SFO|   68|
|   12|              SFO|   76|
+-----+-----------------+-----+



In [131]:
 df_month_dest = (df_month_dest
                  .groupBy('faa_airports_dest')
                  .avg('count')
                  .orderBy(F.col('avg(count)').desc()))

df_month_dest = df_month_dest.withColumn('avg(count)', F.ceil(F.col('avg(count)')))
df_month_dest.select('faa_airports_dest',F.col('avg(count)').alias('avg_count')).show(1)

+-----------------+---------+
|faa_airports_dest|avg_count|
+-----------------+---------+
|              SFO|       66|
+-----------------+---------+
only showing top 1 row



## Pergunta 24

In [29]:
# Number of flights per plane model, most frequent first. Flights with no
# model value are counted as their own (null) group.
(df_proc
 .groupBy('model_planes')
 .count()
 .sort(F.desc('count'))
 .show())

+------------+-----+
|model_planes|count|
+------------+-----+
|     737-890| 1463|
|     737-7H4|  851|
|   737-990ER|  664|
|    A320-232|  612|
|     737-790|  581|
|        null|  552|
| CL-600-2C10|  551|
|     737-490|  472|
|     737-990|  335|
|     737-3H4|  279|
|     737-832|  273|
|   737-924ER|  265|
|     757-232|  259|
|    A320-214|  258|
|   EMB-120ER|  250|
| CL-600-2D24|  221|
|     737-4Q8|  183|
|     737-824|  180|
|   737-932ER|  158|
|     737-8H4|  142|
+------------+-----+
only showing top 20 rows



## Pergunta 25

In [30]:
# Number of flights per (plane model, destination airport), most frequent
# first; flights with no model value are excluded.
known_model = F.col('model_planes').isNotNull()

(df_proc
 .where(known_model)
 .groupBy('model_planes', 'faa_airports_dest')
 .count()
 .sort(F.desc('count'))
 .show())

+------------+-----------------+-----+
|model_planes|faa_airports_dest|count|
+------------+-----------------+-----+
|     737-7H4|              OAK|  141|
|     737-890|              ANC|  138|
|     737-790|              SNA|  122|
|     737-7H4|              SMF|  114|
|     737-890|              LAX|  110|
|     737-890|              LAS|  108|
|     757-232|              SLC|  104|
|     737-890|              SAN|  102|
|    A320-232|              LGB|  102|
|    A320-214|              DEN|  102|
|     737-7H4|              LAS|   97|
|     737-7H4|              DEN|   93|
|    A320-232|              PHX|   91|
|     737-7H4|              SJC|   89|
|     737-490|              SFO|   87|
|   EMB-120ER|              PDX|   87|
|    A319-131|              SFO|   86|
|    A320-232|              SFO|   85|
|     737-7H4|              PHX|   80|
|   737-932ER|              ATL|   80|
+------------+-----------------+-----+
only showing top 20 rows



## Pergunta 26

In [132]:
# Mean number of engines per haul-duration category, rounded up.
df_haul_duration = (
    df_proc
    .groupBy('haul_duration_flights')
    .agg(F.ceil(F.avg('engines_planes')).alias('avg(engines_planes)'))
)

df_haul_duration.withColumnRenamed('avg(engines_planes)', 'avg_engines_planes').show()

+---------------------+------------------+
|haul_duration_flights|avg_engines_planes|
+---------------------+------------------+
|            LONG-HAUL|                 2|
|          MEDIUM-HAUL|                 2|
|           SHORT-HAUL|                 2|
+---------------------+------------------+



## Pergunta 27

In [32]:
# Number of flights per departure season, busiest first.
(df_proc
 .groupBy('dep_season_flights')
 .count()
 .sort(F.desc('count'))
 .show())

+------------------+-----+
|dep_season_flights|count|
+------------------+-----+
|            SUMMER| 2918|
|            SPRING| 2560|
|              FALL| 2373|
|            WINTER| 2149|
+------------------+-----+



## Pergunta 28

In [158]:
# Number of flights per (departure season, destination airport), largest first.
df_season_dest = (df_proc
 .groupBy('dep_season_flights','faa_airports_dest')
 .count()
 .orderBy(F.col('count').desc()))

# Largest per-season flight count for each destination airport.
df_dest = (df_season_dest
          .groupBy('faa_airports_dest').max('count'))

# Join the per-destination maxima back to the per-season counts to recover
# which season reached the maximum for each destination, listed by airport
# code. Seasons tied for a destination's maximum each produce their own row.
# NOTE(review): df_dest is derived from df_season_dest, so this is a self-join
# and both `faa_airports_dest` references come from the same lineage; it
# relies on Spark resolving them to the left and right side respectively —
# confirm before restructuring.
(df_dest.alias('d').join(df_season_dest,
                       (F.col('count') == F.col('max(count)')) &
                       (df_dest.faa_airports_dest == df_season_dest.faa_airports_dest),
                       "left")
 .select('dep_season_flights','d.faa_airports_dest','max(count)')
 .orderBy('d.faa_airports_dest')
 .show())


+------------------+-----------------+----------+
|dep_season_flights|faa_airports_dest|max(count)|
+------------------+-----------------+----------+
|              FALL|              ABQ|        19|
|            SUMMER|              ANC|       145|
|            SUMMER|              ATL|        86|
|            SPRING|              AUS|         9|
|            SUMMER|              BLI|         4|
|            SUMMER|              BNA|         5|
|            SPRING|              BOI|         7|
|            SUMMER|              BOS|        50|
|            WINTER|              BUR|        44|
|            SUMMER|              BWI|        14|
|            SUMMER|              CLE|         2|
|            SPRING|              CLT|        28|
|              FALL|              COS|         8|
|            SPRING|              CVG|         3|
|            SUMMER|              DCA|        20|
|            SUMMER|              DEN|       172|
|            SUMMER|              DFW|       122|


## Pergunta 29

In [34]:
# Number of flights per departure-delay category, most frequent first.
(df_proc
 .groupBy('dep_delay_category_flights')
 .count()
 .sort(F.desc('count'))
 .show())

+--------------------------+-----+
|dep_delay_category_flights|count|
+--------------------------+-----+
|               ANTECIPATED| 5894|
|                     MINOR| 3065|
|                    INTIME|  646|
|                     MAJOR|  395|
+--------------------------+-----+



## Pergunta 30

In [159]:
# Number of flights per (origin, destination, departure-delay category),
# most frequent first.
route_and_category = ['faa_airports_origin', 'faa_airports_dest', 'dep_delay_category_flights']

(df_proc
 .groupBy(*route_and_category)
 .count()
 .sort(F.desc('count'))
 .show())



+-------------------+-----------------+--------------------------+-----+
|faa_airports_origin|faa_airports_dest|dep_delay_category_flights|count|
+-------------------+-----------------+--------------------------+-----+
|                SEA|              LAX|               ANTECIPATED|  293|
|                SEA|              SFO|               ANTECIPATED|  245|
|                SEA|              LAS|               ANTECIPATED|  228|
|                SEA|              PHX|               ANTECIPATED|  195|
|                SEA|              ANC|               ANTECIPATED|  192|
|                PDX|              SFO|               ANTECIPATED|  174|
|                SEA|              DEN|               ANTECIPATED|  169|
|                SEA|              ORD|               ANTECIPATED|  167|
|                SEA|              SFO|                     MINOR|  159|
|                SEA|              ANC|                     MINOR|  145|
|                SEA|              DEN|            