# Middleware Project: Big Data

## Initialization

In [1]:
DATASET_DIR = './dataset/'

In [2]:
# imports
import operator
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from time import time

In [3]:
# initialize spark
#sc = pyspark.SparkContext("local[4]", "MiddlewareProject")  # 4 threads
spark = SparkSession.builder.appName("MiddlewareProject").getOrCreate()

In [4]:
# https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [5]:
# open files

# years = range(1994, 2009)
years = range(2004, 2005)
path = DATASET_DIR + '{' + ','.join(str(y) for y in years) + '}.csv'
print('Loading from:', path)

# load entire file
df = spark.read.csv(path, header=True)
print('Number of entries:', df.count())

Loading from: ./dataset/{2004}.csv
Number of entries: 7129270


In [6]:
# replace 'NA' with null values
df = df.replace('NA', None)

In [7]:
# cast columns to correct type
intcolumns = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'DepTime', 'CRSDEPTime', 'ArrTime', 'CRSArrTime',
              'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'CarrierDelay',
              'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled', 'Diverted']

for c in intcolumns:
    df = df.withColumn(c, df[c].cast('int'))
df.first()

Row(Year=2004, Month=1, DayofMonth=12, DayofWeek=1, DepTime=623, CRSDEPTime=630, ArrTime=901, CRSArrTime=915, UniqueCarrier='UA', FlightNum='462', TailNum='N805UA', ActualElapsedTime=98, CRSElapsedTime=105, AirTime=80, ArrDelay=-14, DepDelay=-7, Origin='ORD', Dest='CLT', Distance=599, TaxiIn='7', TaxiOut='11', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0)

### Percentage of cancelled flights per day

Average of percentage of cancelled flights of each day

In [8]:
# get RDD
raw_data = df.rdd

raw_data.first()

Row(Year=2004, Month=1, DayofMonth=12, DayofWeek=1, DepTime=623, CRSDEPTime=630, ArrTime=901, CRSArrTime=915, UniqueCarrier='UA', FlightNum='462', TailNum='N805UA', ActualElapsedTime=98, CRSElapsedTime=105, AirTime=80, ArrDelay=-14, DepDelay=-7, Origin='ORD', Dest='CLT', Distance=599, TaxiIn='7', TaxiOut='11', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0)

In [9]:
#total flight per day
indexed_by_day = raw_data.map(lambda m: ((m['Year'], m['Month'], m['DayofMonth']),(m['Cancelled'],1)))
indexed_by_day.take(10)


[((2004, 1, 12), (0, 1)),
 ((2004, 1, 13), (0, 1)),
 ((2004, 1, 14), (0, 1)),
 ((2004, 1, 15), (0, 1)),
 ((2004, 1, 16), (0, 1)),
 ((2004, 1, 17), (0, 1)),
 ((2004, 1, 19), (0, 1)),
 ((2004, 1, 20), (0, 1)),
 ((2004, 1, 21), (0, 1)),
 ((2004, 1, 22), (0, 1))]

In [10]:
num_flights_day = indexed_by_day.reduceByKey(lambda a,b: (a[0] + b[0], a[1] + b[1]))
num_flights_day.take(20)

[((2004, 1, 19), (192, 19539)),
 ((2004, 1, 20), (169, 19246)),
 ((2004, 1, 21), (209, 19255)),
 ((2004, 1, 22), (153, 19548)),
 ((2004, 1, 7), (810, 19256)),
 ((2004, 1, 8), (627, 19579)),
 ((2004, 1, 9), (520, 19616)),
 ((2004, 1, 10), (215, 16761)),
 ((2004, 2, 14), (383, 17008)),
 ((2004, 2, 20), (202, 19859)),
 ((2004, 2, 21), (162, 17174)),
 ((2004, 2, 23), (327, 19803)),
 ((2004, 2, 26), (777, 19800)),
 ((2004, 2, 8), (192, 18446)),
 ((2004, 2, 9), (298, 19549)),
 ((2004, 2, 11), (218, 19405)),
 ((2004, 3, 18), (304, 19925)),
 ((2004, 3, 19), (266, 19955)),
 ((2004, 3, 29), (91, 19909)),
 ((2004, 3, 9), (201, 19607))]

In [11]:
#compute percentage per day
#daily_percentage = num_flights_day.map(lambda m: (m[0], m[1][0]*100.0/m[1][1]))
daily_percentage = num_flights_day.map(lambda m: m[1][0]*100.0/m[1][1])

In [12]:
daily_percentage.take(10)

[0.9826500844464916,
 0.8781045412033669,
 1.0854323552324072,
 0.7826887661141805,
 4.206481096800997,
 3.2024107462076716,
 2.6508972267536706,
 1.2827396933357198,
 2.2518814675446848,
 1.017171055944408]

In [13]:
#compute average
#print("Percentage of cancelled flights per day: ", 
daily_percentage.reduce(lambda a,b: a+b)/daily_percentage.count()

1.795360804356613

[((2004, 1, 12), 1),
 ((2004, 1, 13), 1),
 ((2004, 1, 14), 1),
 ((2004, 1, 15), 1),
 ((2004, 1, 16), 1)]

[((2004, 1, 19), 19539),
 ((2004, 1, 20), 19246),
 ((2004, 1, 21), 19255),
 ((2004, 1, 22), 19548),
 ((2004, 1, 7), 19256)]

[((2004, 1, 19), 19539),
 ((2004, 1, 20), 19246),
 ((2004, 1, 21), 19255),
 ((2004, 1, 22), 19548),
 ((2004, 1, 7), 19256),
 ((2004, 1, 8), 19579),
 ((2004, 1, 9), 19616),
 ((2004, 1, 10), 16761),
 ((2004, 2, 14), 17008),
 ((2004, 2, 20), 19859),
 ((2004, 2, 21), 17174),
 ((2004, 2, 23), 19803),
 ((2004, 2, 26), 19800),
 ((2004, 2, 8), 18446),
 ((2004, 2, 9), 19549),
 ((2004, 2, 11), 19405),
 ((2004, 3, 18), 19925),
 ((2004, 3, 19), 19955),
 ((2004, 3, 29), 19909),
 ((2004, 3, 9), 19607),
 ((2004, 3, 12), 19949),
 ((2004, 3, 14), 18896),
 ((2004, 3, 15), 19907),
 ((2004, 3, 16), 19765),
 ((2004, 4, 5), 19980),
 ((2004, 4, 6), 19695),
 ((2004, 4, 7), 19947),
 ((2004, 4, 8), 19997),
 ((2004, 4, 25), 18900),
 ((2004, 4, 26), 19972),
 ((2004, 4, 27), 19688),
 ((2004, 4, 28), 19862),
 ((2004, 5, 24), 19957),
 ((2004, 5, 25), 19778),
 ((2004, 5, 26), 19880),
 ((2004, 5, 31), 18747),
 ((2004, 5, 4), 19560),
 ((2004, 5, 5), 19756),
 ((2004, 5, 6), 19932),
 ((2004, 5, 11), 19605),
 ((2004, 6, 1

KeyboardInterrupt: 

### Penalty scores for each airport

In [19]:
# a weekly "penalty" score for each airport that depends on both 
# its incoming and outgoing flights. 

# The score adds 0.5 for each incoming flight that is more than 15 minutes
# late, and 1 for each outgoing flight that is more than 15 minutes late

### Our group's data analysis

In [20]:
# an additional data analysis defined by your group

# IDEE
# qualcosa con gli altri delays? security, nas, carrier
# qualche confronto tra carrier
# 

In [21]:
# prima fare con un solo file come debug, poi mettere tutti