In [1]:
import pyspark
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf,lit
from collections import namedtuple
sns.set(style="white", color_codes=True)
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   


# import matplotlib and allow it to plot inline
import matplotlib.pyplot as plt
%matplotlib inline

# seaborn can generate several warnings, we ignore them
import warnings 
warnings.filterwarnings("ignore")
sc = pyspark.SparkContext.getOrCreate()

# Queries:
- the percentage of canceled flights per day, throughout the entire data set
- weekly percentages of delays that are due to weather, throughout the entire data set 
- the percentage of flights belonging to a given "distance group" that were able to halve their departure delays by the time they arrived at their destinations. Distance groups assort flights by their total distance in miles. Flights with distances that are less than 200 miles belong in group 1, flights with distances that are between 200 and 399 miles belong in group 2, flights with distances that are between 400 and 599 miles belong in group 3, and so on. The last group contains flights whose distances are between 2400 and 2599 miles.
- a weekly "penalty" score for each airport that depends on both the its incoming and outgoing flights. The score adds 0.5 for each incoming flight that is more than 15 minutes late, and 1 for each outgoing flight that is more than 15 minutes late.


# "The percentage of canceled flights per day, throughout the entire data set"


In [None]:
ds = sc.textFile('./BDdata/1994.csv')

In [None]:
dsplitted = ds.map(lambda line : line.split(","))

head = dsplitted.take(1)[0]
dsfiltered = dsplitted.filter(lambda x : x != head)

dmapped = dsfiltered.map(lambda x : (tuple([int(el) for el in x[0:3]]),int(x[21])))

date_cancelled = dmapped.reduceByKey(lambda a, b : a+b)
total_per_date = dmapped.map(lambda x: (x[0],1)).reduceByKey(lambda a, b : a+b)

results=date_cancelled.join(total_per_date).map(lambda x : (x[0], x[1][0]/x[1][1]))
results.take(20)

In [None]:
def getPercentage(year):
    ds = sc.textFile('./BDdata/'+year+'.csv')
    dsplitted = ds.map(lambda line : line.split(","))

    head = dsplitted.take(1)[0]
    dsfiltered = dsplitted.filter(lambda x : x != head)

    dmapped = dsfiltered.map(lambda x : (tuple([int(el) for el in x[0:3]]),int(x[21])))

    date_cancelled = dmapped.reduceByKey(lambda a, b : a+b)
    total_per_date = dmapped.map(lambda x: (x[0],1)).reduceByKey(lambda a, b : a+b)

    results=date_cancelled.join(total_per_date).map(lambda x : (x[0], x[1][0]/x[1][1]))
    return results


In [None]:
res2 = [getPercentage(str(i)).collect() for i in range(1994,2009)]

# "Weekly percentages of delays that are due to weather, throughout the entire data set"

In [2]:
ds = sc.textFile('./BDdata/2004.csv')
dsplitted = ds.map(lambda line : line.split(","))
head = dsplitted.take(1)[0]
dsfiltered = dsplitted.filter(lambda x : x != head)
# "Year" , "Month", "DayofMonth" -> [0:3]
# "ArrDelay" -> 14
# "WeatherDelay" -> 25
d1 = dsfiltered\
    .filter(lambda x : x[14] != 'NA' and int(x[14])>=0 and x[25]!='NA')\
    .map(lambda x : ("/".join(x[0:3]), tuple(int(el) for el in [*x[0:3],x[14],x[25]])))\
    .reduceByKey(lambda x,y: (*x[0:3], x[3]+y[3],x[4]+y[4]))\
    .map(lambda x : (datetime(*x[1][0:3]).isocalendar()[1], x[1]))

d2 = d1.filter(lambda x : x[0] is 1 and x[1][1] is 12)

d1 = d1.filter(lambda x : not (x[0] is 1 and x[1][1] is 12))\
    .reduceByKey(lambda x,y : (*x[0:3], x[3]+y[3], x[4]+y[4]))
d3b = d1.map(lambda x: (x[1][0],x[0],x[1][-1]/x[1][-2]))
res2 = sorted(d3b.collect())

In [101]:
res2

[(2004, 1, 0.07434515676522563),
 (2004, 2, 0.06307909121084979),
 (2004, 3, 0.06906457349675801),
 (2004, 4, 0.06112653802823575),
 (2004, 5, 0.09106793076338827),
 (2004, 6, 0.07341102967679923),
 (2004, 7, 0.05757752477390909),
 (2004, 8, 0.03147182026203678),
 (2004, 9, 0.06339412919870108),
 (2004, 10, 0.052794145119839236),
 (2004, 11, 0.025798587549322675),
 (2004, 12, 0.04267350976706616),
 (2004, 13, 0.016706971577044198),
 (2004, 14, 0.029161947023824342),
 (2004, 15, 0.02935286463035731),
 (2004, 16, 0.05104257324425239),
 (2004, 17, 0.04180623220286374),
 (2004, 18, 0.04801365267544427),
 (2004, 19, 0.04985891227714397),
 (2004, 20, 0.07074660352697981),
 (2004, 21, 0.06161961351063612),
 (2004, 22, 0.07625980785434189),
 (2004, 23, 0.06942345560287586),
 (2004, 24, 0.08337911786036085),
 (2004, 25, 0.06026607236693493),
 (2004, 26, 0.08600485630390489),
 (2004, 27, 0.07213184119358915),
 (2004, 28, 0.05893081374518968),
 (2004, 29, 0.05280998573020174),
 (2004, 30, 0.04929

## "The percentage of flights belonging to a given "distance group" that were able to halve their departure delays by the time they arrived at their destinations. 
Distance groups assort flights by their total distance in miles. Flights with distances that are less than 200 miles belong in group 1, flights with distances that are between 200 and 399 miles belong in group 2, flights with distances that are between 400 and 599 miles belong in group 3, and so on. The last group contains flights whose distances are between 2400 and 2599 miles."

In [87]:
ds = sc.textFile('./BDdata/2005.csv')
dsplitted = ds.map(lambda line : line.split(","))
head = dsplitted.take(1)[0]
# head.index("Cancelled") # -> 21
# head.index("DepDelay") # -> 15
# head.index("ArrDelay") # -> 14
# head.index("Distance") # -> 18
dsfiltered = dsplitted.filter(lambda x : x != head)

d1 = dsfiltered\
    .filter(lambda x : x[21] == '0' and x[14] != 'NA' and x[15] != 'NA')\
    .map(lambda x : (x[15], x[14], x[18]))\

d2 = d1.map(lambda x : (int(x[2])//200+1,(int(x[0]),int(x[1]))))

d3 = d2.filter(lambda x : x[1][0]>=2*x[1][1])\
    .map(lambda x: (x[0],1))\
    .reduceByKey(lambda x, y : x+y)
    
d4 = d2.map(lambda x : (x[0],1))\
    .reduceByKey(lambda x, y : x+y)


d5 = d4.join(d3).map(lambda x : (x[0], x[1][1]/x[1][0]))

In [89]:
sorted(d5.collect())

[(1, 0.5876224923306915),
 (2, 0.5757068598210482),
 (3, 0.561471897762626),
 (4, 0.5688015900802174),
 (5, 0.5605372543402206),
 (6, 0.5436613265874061),
 (7, 0.5574551778009225),
 (8, 0.5512521127105566),
 (9, 0.5605423769579989),
 (10, 0.544119338250998),
 (11, 0.5433981336521986),
 (12, 0.5830452613710987),
 (13, 0.5312673762031185),
 (14, 0.5913933570085649),
 (15, 0.5382224984462399),
 (16, 0.3),
 (17, 0.6046052631578948),
 (18, 0.4482758620689655),
 (19, 0.5956112852664577),
 (20, 0.5176282051282052),
 (21, 0.5977961432506887),
 (22, 0.641),
 (23, 0.5426745778183478),
 (25, 0.5306406685236769)]

## A weekly "penalty" score for each airport that depends on both the its incoming and outgoing flights.
The score adds 0.5 for each incoming flight that is more than 15 minutes late, and 1 for each outgoing flight that is more than 15 minutes late.

In [90]:
ds = sc.textFile('./BDdata/2003.csv')
dsplitted = ds.map(lambda line : line.split(","))
head = dsplitted.take(1)[0]

dsfiltered = dsplitted.filter(lambda x : x != head)

In [101]:
d1 = dsfiltered.filter(lambda x : x[14] != "NA" and int(x[14])>15)
d2 = d1.flatMap(lambda x : [(x[16],1),(x[17],0.5)])
d3 = d2.reduceByKey(lambda x, y : x+y)
sorted(d3.collect())

[('ABE', 936.0),
 ('ABI', 399.5),
 ('ABQ', 5776.5),
 ('ABY', 397.0),
 ('ACK', 73.0),
 ('ACT', 297.0),
 ('ACV', 811.5),
 ('ACY', 23.0),
 ('ADK', 30.5),
 ('ADQ', 206.0),
 ('AEX', 662.5),
 ('AGS', 1454.5),
 ('AKN', 90.0),
 ('ALB', 4726.0),
 ('AMA', 1536.5),
 ('ANC', 5117.5),
 ('ATL', 112164.0),
 ('ATW', 228.0),
 ('AUS', 6888.5),
 ('AVL', 1116.0),
 ('AVP', 317.0),
 ('AZO', 752.5),
 ('BDL', 7565.0),
 ('BET', 330.5),
 ('BFF', 1),
 ('BFL', 262.0),
 ('BGM', 258.5),
 ('BGR', 939.5),
 ('BHM', 3787.0),
 ('BIL', 614.5),
 ('BIS', 215.5),
 ('BMI', 737.5),
 ('BNA', 10483.0),
 ('BOI', 2234.5),
 ('BOS', 28666.5),
 ('BPT', 199.0),
 ('BQK', 436.0),
 ('BQN', 89.0),
 ('BRO', 225.5),
 ('BRW', 187.5),
 ('BTM', 209.0),
 ('BTR', 1947.5),
 ('BTV', 2115.5),
 ('BUF', 5467.0),
 ('BUR', 4800.0),
 ('BWI', 22893.0),
 ('BZN', 755.5),
 ('CAE', 2962.0),
 ('CAK', 1529.5),
 ('CDC', 103.0),
 ('CDV', 214.0),
 ('CEC', 186.5),
 ('CHA', 1160.0),
 ('CHO', 608.5),
 ('CHS', 2657.5),
 ('CIC', 135.0),
 ('CID', 1612.0),
 ('CLD', 194

In [98]:
#head.index("Origin")
head.index("Dest")
#head.index("ArrDelay")


17