In [82]:
import pyspark
import pandas as pd
import numpy as np
import seaborn as sns

# Global seaborn theme: "white" axes style; color_codes=True remaps matplotlib's
# shorthand color codes to the seaborn palette.
sns.set(style="white", color_codes=True)
# Override the plotting context so base text, titles, and axis labels all use size 24.
sns.set_context(rc={"font.family":'sans',"font.size":24,"axes.titlesize":24,"axes.labelsize":24})   


# import matplotlib and allow it to plot inline
import matplotlib.pyplot as plt
%matplotlib inline

# seaborn can generate several warnings, we ignore them
import warnings 
warnings.filterwarnings("ignore")


# Queries:
- the percentage of canceled flights per day, throughout the entire data set
- weekly percentages of delays that are due to weather, throughout the entire data set 
- the percentage of flights belonging to a given "distance group" that were able to halve their departure delays by the time they arrived at their destinations. Distance groups assort flights by their total distance in miles. Flights with distances that are less than 200 miles belong in group 1, flights with distances that are between 200 and 399 miles belong in group 2, flights with distances that are between 400 and 599 miles belong in group 3, and so on. The last group contains flights whose distances are between 2400 and 2599 miles.
- a weekly "penalty" score for each airport that depends on both its incoming and outgoing flights. The score adds 0.5 for each incoming flight that is more than 15 minutes late, and 1 for each outgoing flight that is more than 15 minutes late.

# "The percentage of canceled flights per day, throughout the entire data set"
## SQL version

In [85]:
from pyspark.sql import SQLContext

# Reuse the running SparkContext if one exists, otherwise start one,
# and wrap it in a SQLContext so DataFrames can be read.
sc = pyspark.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Read the 1994 flights CSV: the first row holds the column names and
# column types are inferred from the data.
d = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('BDdata/1994.csv')
)

In [86]:
# Number of rows in the 1994 DataFrame (this is an action, so it runs a Spark job).
d.count()

5180048

In [87]:
# Keep only the date columns and the cancellation flag.
d_ymdc = d.select("Year", "Month", "DayofMonth", "Cancelled")

# Count the rows flagged Cancelled == 1 for each calendar day.
d_cancelled = (
    d_ymdc
    .filter(d_ymdc["Cancelled"] == 1)
    .groupBy("Year", "Month", "DayofMonth")
    .count()
    .withColumnRenamed("count", "cancelled")
)
d_cancelled.show()

+----+-----+----------+---------+
|Year|Month|DayofMonth|cancelled|
+----+-----+----------+---------+
|1994|   10|        28|       37|
|1994|   12|        26|      159|
|1994|    2|        11|     3649|
|1994|   12|        10|       80|
|1994|    4|        13|      368|
|1994|    6|        23|       88|
|1994|    9|        26|       59|
|1994|   12|        11|       76|
|1994|   11|        24|       14|
|1994|   12|         4|       80|
|1994|    1|        13|      315|
|1994|    4|        28|      124|
|1994|    4|        24|       22|
|1994|    5|        12|       72|
|1994|    8|         8|       43|
|1994|    4|        29|      107|
|1994|    4|        16|       65|
|1994|   10|        20|      212|
|1994|    1|         8|     1206|
|1994|   12|        29|       54|
+----+-----+----------+---------+
only showing top 20 rows



In [88]:
# Count every flight record (cancelled or not) for each calendar day.
d_total = (
    d_ymdc
    .groupBy("Year", "Month", "DayofMonth")
    .count()
    .withColumnRenamed("count", "total")
)
d_total.show()

+----+-----+----------+-----+
|Year|Month|DayofMonth|total|
+----+-----+----------+-----+
|1994|   10|        28|14847|
|1994|   12|        26|14751|
|1994|    2|        11|14242|
|1994|   12|        10|13409|
|1994|    4|        13|14450|
|1994|    6|        23|14527|
|1994|    9|        26|14680|
|1994|   12|        11|14158|
|1994|   11|        24|11524|
|1994|   12|         4|14088|
|1994|    1|        13|14062|
|1994|    4|        24|13661|
|1994|    4|        28|14364|
|1994|    5|        12|14462|
|1994|    8|         8|14819|
|1994|    4|        29|14226|
|1994|    4|        16|12845|
|1994|   10|        20|14800|
|1994|    1|         8|12605|
|1994|   12|        29|14922|
+----+-----+----------+-----+
only showing top 20 rows



In [89]:
# Combine the per-day cancelled and total counts into one frame.
#
# * Start from d_total and use a LEFT join: a day on which no flight was
#   cancelled has no row in d_cancelled, and an inner join would silently
#   drop that day instead of reporting it with 0 cancellations.
# * Join on column *names*: both frames derive from the same parent frame,
#   so equality conditions such as d_cancelled.Year == d_total.Year compare
#   columns of a self-join, which is fragile. Name-based keys avoid that and
#   also yield a single copy of each key column.
res = (
    d_total
    .join(d_cancelled, on=["Year", "Month", "DayofMonth"], how="left")
    # Days missing from d_cancelled come back as null -> treat as 0 cancelled.
    .na.fill(0, subset=["cancelled"])
    # Restore the original column order.
    .select("Year", "Month", "DayofMonth", "cancelled", "total")
)
res.show()

+----+-----+----------+---------+-----+
|Year|Month|DayofMonth|cancelled|total|
+----+-----+----------+---------+-----+
|1994|   10|        28|       37|14847|
|1994|   12|        26|      159|14751|
|1994|    2|        11|     3649|14242|
|1994|   12|        10|       80|13409|
|1994|    4|        13|      368|14450|
|1994|    6|        23|       88|14527|
|1994|    9|        26|       59|14680|
|1994|   12|        11|       76|14158|
|1994|   11|        24|       14|11524|
|1994|   12|         4|       80|14088|
|1994|    1|        13|      315|14062|
|1994|    4|        24|       22|13661|
|1994|    4|        28|      124|14364|
|1994|    5|        12|       72|14462|
|1994|    8|         8|       43|14819|
|1994|    4|        29|      107|14226|
|1994|    4|        16|       65|12845|
|1994|   10|        20|      212|14800|
|1994|    1|         8|     1206|12605|
|1994|   12|        29|       54|14922|
+----+-----+----------+---------+-----+
only showing top 20 rows



In [90]:
# Daily share of cancelled flights, in chronological order.
# Note: the value is the fraction cancelled / total (it is not multiplied by 100).
(
    res
    .withColumn("percentageCancelled", res["cancelled"] / res["total"])
    .drop("cancelled", "total")
    .orderBy("Year", "Month", "DayofMonth")
    .show()
)

+----+-----+----------+--------------------+
|Year|Month|DayofMonth| percentageCancelled|
+----+-----+----------+--------------------+
|1994|    1|         1|0.005264023688106...|
|1994|    1|         2|0.004492230650268797|
|1994|    1|         3| 0.01541819205857505|
|1994|    1|         4| 0.15560882746950574|
|1994|    1|         5|0.047656139357031585|
|1994|    1|         6| 0.05416755640970888|
|1994|    1|         7| 0.08754974417282547|
|1994|    1|         8| 0.09567631892106307|
|1994|    1|         9|0.011112791049289385|
|1994|    1|        10|0.018481801444554597|
|1994|    1|        11|0.013738403795765172|
|1994|    1|        12| 0.07870958633347451|
|1994|    1|        13|0.022400796472763475|
|1994|    1|        14| 0.01504349066020248|
|1994|    1|        15|  0.0120414673046252|
|1994|    1|        16|0.047875201721355565|
|1994|    1|        17| 0.17112681081852565|
|1994|    1|        18| 0.11705949985628054|
|1994|    1|        19| 0.07063008130081301|
|1994|    

In [102]:
def getCancelledPercentage(file):
    """Compute the per-day share of cancelled flights for one CSV file.

    The file is read through the module-level ``sqlContext`` with a header
    row and inferred column types. It must contain the columns ``Year``,
    ``Month``, ``DayofMonth`` and ``Cancelled``.

    Both counts are computed in a single aggregation per day. Days on which
    no flight was cancelled are therefore reported with a value of 0.0; the
    previous implementation inner-joined a "cancelled only" frame with the
    totals and silently dropped such days.

    Parameters
    ----------
    file : str
        Path of the CSV file to load.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``Year``, ``Month``, ``DayofMonth``, ``percentageCancelled``,
        ordered chronologically. ``percentageCancelled`` is the fraction
        cancelled / total (it is not multiplied by 100).
    """
    # Local import so the function does not depend on another cell's imports.
    from pyspark.sql import functions as F

    d = sqlContext.read.format('com.databricks.spark.csv') \
        .options(header='true', inferschema='true') \
        .load(file)

    # 1 for rows flagged as cancelled, 0 for everything else (including nulls),
    # matching the original `Cancelled == 1` filter.
    is_cancelled = F.when(F.col("Cancelled") == 1, 1).otherwise(0)

    return d.groupBy("Year", "Month", "DayofMonth") \
        .agg((F.sum(is_cancelled) / F.count(F.lit(1))).alias("percentageCancelled")) \
        .orderBy("Year", "Month", "DayofMonth")
# One result DataFrame per yearly file, BDdata/1994.csv through BDdata/2008.csv.
results = list(map(
    getCancelledPercentage,
    ("BDdata/" + str(year) + ".csv" for year in range(1994, 2009)),
))


In [104]:
# Spot-check the second entry of the list, i.e. the 1995 file (the range starts at 1994).
results[1].show()

+----+-----+----------+--------------------+
|Year|Month|DayofMonth| percentageCancelled|
+----+-----+----------+--------------------+
|1995|    1|         1|0.010582010582010581|
|1995|    1|         2|0.010852713178294573|
|1995|    1|         3|0.017752621084453593|
|1995|    1|         4| 0.02272874023374443|
|1995|    1|         5| 0.02155618850336613|
|1995|    1|         6| 0.06094487171201448|
|1995|    1|         7|  0.0257183908045977|
|1995|    1|         8| 0.01808698008399946|
|1995|    1|         9|  0.0530588388102351|
|1995|    1|        10| 0.04393730736441734|
|1995|    1|        11| 0.12992796332678455|
|1995|    1|        12| 0.09618030531350324|
|1995|    1|        13|  0.0287468966418398|
|1995|    1|        14|0.027137140778464897|
|1995|    1|        15|0.019543303846945075|
|1995|    1|        16|0.020290613954706112|
|1995|    1|        17|0.016901408450704224|
|1995|    1|        18|0.031135291033036184|
|1995|    1|        19| 0.04605004585353072|
|1995|    

## MapReduce version

In [92]:
# Load the 1994 file as an RDD of raw text lines for the MapReduce-style version.
ds = sc.textFile('./BDdata/1994.csv')


In [93]:
# Peek at the first line of the file, which is the CSV header row.
ds.take(1)

['Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay']

In [94]:
# Break every text line into its comma-separated fields.
dsplitted = ds.map(lambda row: row.split(","))

# The first record is the header row; remember it and drop it from the data.
header = dsplitted.take(1)[0]
dsfiltered = dsplitted.filter(lambda fields: fields != header)

header

['Year',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'UniqueCarrier',
 'FlightNum',
 'TailNum',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiIn',
 'TaxiOut',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

In [95]:
# Key each record by its (Year, Month, DayofMonth) date as ints; the value is
# field 21 as an int, which is the 'Cancelled' column according to `header`.
dmapped = dsfiltered.map(
    lambda fields: (tuple(int(part) for part in fields[:3]), int(fields[21]))
)
dmapped.take(1)

[((1994, 1, 7), 0)]

In [96]:
# Add up the Cancelled flags per date -> number of cancelled flights per day.
date_cancelled = dmapped.reduceByKey(lambda left, right: left + right)

# Replace every flag by 1 and add up per date -> number of flights per day.
total_per_date = (
    dmapped
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)

date_cancelled.take(5)

[((1994, 1, 5), 673),
 ((1994, 2, 1), 93),
 ((1994, 2, 2), 83),
 ((1994, 2, 3), 103),
 ((1994, 5, 18), 52)]

In [97]:
# Peek at five (date, total flights) pairs.
total_per_date.take(5)

[((1994, 1, 5), 14122),
 ((1994, 2, 1), 14085),
 ((1994, 2, 2), 14167),
 ((1994, 2, 3), 14211),
 ((1994, 5, 18), 14516)]

In [98]:
# Pair each date's cancelled count with its total and divide, giving
# (date, cancelled / total) records.
# NOTE: this rebinds `results`, which the SQL section uses for a list of
# per-year DataFrames.
results = (
    date_cancelled
    .join(total_per_date)
    .map(lambda entry: (entry[0], entry[1][0] / entry[1][1]))
)

In [99]:
# Bring the per-day fractions to the driver and order them by their
# (year, month, day) key, i.e. chronologically.
r = sorted(results.collect())
r

[((1994, 1, 1), 0.0052640236881065965),
 ((1994, 1, 2), 0.004492230650268797),
 ((1994, 1, 3), 0.01541819205857505),
 ((1994, 1, 4), 0.15560882746950574),
 ((1994, 1, 5), 0.047656139357031585),
 ((1994, 1, 6), 0.05416755640970888),
 ((1994, 1, 7), 0.08754974417282547),
 ((1994, 1, 8), 0.09567631892106307),
 ((1994, 1, 9), 0.011112791049289385),
 ((1994, 1, 10), 0.018481801444554597),
 ((1994, 1, 11), 0.013738403795765172),
 ((1994, 1, 12), 0.07870958633347451),
 ((1994, 1, 13), 0.022400796472763475),
 ((1994, 1, 14), 0.01504349066020248),
 ((1994, 1, 15), 0.0120414673046252),
 ((1994, 1, 16), 0.047875201721355565),
 ((1994, 1, 17), 0.17112681081852565),
 ((1994, 1, 18), 0.11705949985628054),
 ((1994, 1, 19), 0.07063008130081301),
 ((1994, 1, 20), 0.031078742428612634),
 ((1994, 1, 21), 0.013328556073092081),
 ((1994, 1, 22), 0.009481668773704172),
 ((1994, 1, 23), 0.011328681821592017),
 ((1994, 1, 24), 0.018600407103249807),
 ((1994, 1, 25), 0.04026369887289998),
 ((1994, 1, 26), 0.05