In [1]:
import pyspark
from datetime import datetime
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.functions import udf,lit

from pyspark.sql import SQLContext

# Reuse the SparkContext that is already running, or start one if none exists.
sc = pyspark.SparkContext.getOrCreate()
# SQLContext bound to that SparkContext; the cells below use it to read the CSV files as DataFrames.
sqlContext = SQLContext(sc)

# Extra query: Total taxi-in and taxi-out time per year and airport
## SQL version

In [2]:
def _read_year_csv(year):
    """Read ../BDdata/<year>.csv as a DataFrame.

    The first row of the file is used as the header and column types are
    inferred by the reader.
    """
    reader = sqlContext.read.format('com.databricks.spark.csv')
    reader = reader.options(header='true', inferschema='true')
    return reader.load('../BDdata/' + str(year) + '.csv')


# Stack the yearly files 1994..2008 into one DataFrame, starting from 1994
# and appending each following year in order.
d = _read_year_csv(1994)
for year in range(1995, 2009):
    d = d.union(_read_year_csv(year))

In [4]:
from pyspark.sql.functions import col

# Keep only rows where neither taxi column holds the literal string "NA",
# then total both columns per (Year, Origin, Dest).
has_taxi_in = d["TaxiIn"] != "NA"
has_taxi_out = d["TaxiOut"] != "NA"
d2 = (
    d.filter(has_taxi_in)
    .filter(has_taxi_out)
    .groupBy("Year", "Origin", "Dest")
    .agg({"TaxiIn": "sum", "TaxiOut": "sum"})
    .orderBy("Year", "Origin", "Dest")
)

# Roll the per-route totals up to one row per (Year, Origin) for taxi-out ...
dd2 = (
    d2.groupBy("Year", "Origin")
    .agg({"sum(TaxiOut)": "sum"})
    .select("Year", "Origin", "sum(sum(TaxiOut))")
)
# ... and one row per (Year, Dest) for taxi-in.
dd3 = (
    d2.groupBy("Year", "Dest")
    .agg({"sum(TaxiIn)": "sum"})
    .select("Year", "Dest", "sum(sum(TaxiIn))")
)

# Match the two roll-ups on airport code and year. No `how` is passed, so the
# join type is Spark's default. Both sides keep their own Year column, which
# is why the aliases "a" and "b" are needed to tell them apart.
same_airport = col("a.Origin") == col("b.Dest")
same_year = col("a.Year") == col("b.Year")
dd4 = dd2.alias("a").join(dd3.alias("b"), on=[same_airport, same_year])
dd4.show()

+----+------+-----------------+----+----+----------------+
|Year|Origin|sum(sum(TaxiOut))|Year|Dest|sum(sum(TaxiIn))|
+----+------+-----------------+----+----+----------------+
|1995|   FCA|           9369.0|1995| FCA|          4209.0|
|1998|   EWR|        3299816.0|1998| EWR|        995037.0|
|1998|   JNU|          50459.0|1998| JNU|         13645.0|
|1999|   FAT|           3586.0|1999| FAT|          1700.0|
|2000|   LIT|         129359.0|2000| LIT|         54305.0|
|2000|   MSP|        2779914.0|2000| MSP|        950838.0|
|2001|   MLU|          16237.0|2001| MLU|          7348.0|
|2002|   MBS|          20306.0|2002| MBS|          8328.0|
|2004|   ADQ|           3744.0|2004| ADQ|          1878.0|
|2004|   PHL|        2779238.0|2004| PHL|        919327.0|
|2005|   BZN|          63313.0|2005| BZN|         23264.0|
|2005|   CEC|           7894.0|2005| CEC|          2924.0|
|2005|   FSM|          19010.0|2005| FSM|         10219.0|
|2005|   PSE|           2238.0|2005| PSE|           781.

In [5]:
# Keep a single Year column (the one from side "a" of the join), the airport
# code and both totals; give the totals short names and sort the rows.
taxi_per_airport = (
    dd4.select("a.Year", "Origin", "sum(sum(TaxiOut))", "sum(sum(TaxiIn))")
    .withColumnRenamed("sum(sum(TaxiIn))", "TaxiIn")
    .withColumnRenamed("sum(sum(TaxiOut))", "TaxiOut")
    .orderBy("Year", "Origin")
)

# coalesce(1) collapses the result to one partition before writing.
# mode="overwrite" replaces output left by a previous run; with the default
# save mode this cell fails as soon as ./extraQuery/taxi already exists, so
# the notebook could not be re-run from a fresh kernel.
taxi_per_airport.coalesce(1).write.csv("./extraQuery/taxi", mode="overwrite", header=True)

# Extra query: Number of flights per year between any two connected airports 
## SQL version

In [6]:
# Count the rows (one per flight record) for every (Year, Origin, Dest)
# combination and sort the result by those same three columns.
d3 = (
    d.groupBy("Year", "Origin", "Dest")
    .agg({"*": "count"})
    .orderBy("Year", "Origin", "Dest")
)

# coalesce(1) collapses the result to one partition before writing.
# mode="overwrite" replaces output left by a previous run; with the default
# save mode this cell fails as soon as ./extraQuery/from_to already exists,
# so the notebook could not be re-run from a fresh kernel.
d3.coalesce(1).write.csv("./extraQuery/from_to", mode="overwrite", header=True)

## Spark RDD version

In [7]:
start = 1994
end = 2008


def _year_lines(year):
    """Return the text lines of ../BDdata/<year>.csv as an RDD."""
    return sc.textFile('../BDdata/' + str(year) + '.csv')


first_year_lines = _year_lines(start)
# The first line of the first file is taken as the CSV header. The same
# string is used to drop the header line from every yearly file.
head = first_year_lines.take(1)[0]


def _is_data_line(line):
    """Return True for any line that is not identical to the header line."""
    return line != head


# Header-free lines of all years from `start` to `end` (inclusive), in order.
ds = first_year_lines.filter(_is_data_line)
for year in range(start + 1, end + 1):
    ds = ds.union(_year_lines(year).filter(_is_data_line))

In [8]:
from operator import add

# Column names, in file order, taken from the header line.
h = head.split(",")
index = lambda x : h.index(x)

# Look the three column positions up once on the driver. Calling index()
# inside the map would repeat a linear search of the header list three times
# for every record.
year_idx, origin_idx, dest_idx = index("Year"), index("Origin"), index("Dest")


def _route_key(line):
    """Turn one CSV line into ((year, origin, dest), 1) for counting."""
    fields = line.split(",")
    return ((fields[year_idx], fields[origin_idx], fields[dest_idx]), 1)


# Sum the 1s per (year, origin, dest) key, bring the totals to the driver and
# sort them there by key.
res = sorted(ds.map(_route_key).reduceByKey(add).collect())

In [9]:
# Show only the first entries: `res` holds one ((year, origin, dest), count)
# tuple per route and year, so displaying the whole list floods the output.
res[:30]

[(('1994', 'ABE', 'ATL'), 709),
 (('1994', 'ABE', 'BWI'), 484),
 (('1994', 'ABE', 'CLT'), 468),
 (('1994', 'ABE', 'DCA'), 224),
 (('1994', 'ABE', 'DTW'), 948),
 (('1994', 'ABE', 'LGA'), 58),
 (('1994', 'ABE', 'MDT'), 1494),
 (('1994', 'ABE', 'ORD'), 1278),
 (('1994', 'ABE', 'PIT'), 1433),
 (('1994', 'ABQ', 'AMA'), 693),
 (('1994', 'ABQ', 'ATL'), 360),
 (('1994', 'ABQ', 'CVG'), 353),
 (('1994', 'ABQ', 'DAL'), 2057),
 (('1994', 'ABQ', 'DEN'), 3838),
 (('1994', 'ABQ', 'DFW'), 4127),
 (('1994', 'ABQ', 'ELP'), 1809),
 (('1994', 'ABQ', 'IAH'), 1563),
 (('1994', 'ABQ', 'LAS'), 2077),
 (('1994', 'ABQ', 'LAX'), 2406),
 (('1994', 'ABQ', 'LBB'), 679),
 (('1994', 'ABQ', 'MAF'), 358),
 (('1994', 'ABQ', 'MCI'), 1009),
 (('1994', 'ABQ', 'MSP'), 740),
 (('1994', 'ABQ', 'ORD'), 1161),
 (('1994', 'ABQ', 'PHX'), 7197),
 (('1994', 'ABQ', 'PIT'), 594),
 (('1994', 'ABQ', 'SAN'), 1054),
 (('1994', 'ABQ', 'SFO'), 705),
 (('1994', 'ABQ', 'SLC'), 1066),
 (('1994', 'ABQ', 'STL'), 2295),
 (('1994', 'ABQ', 'TUS'),