# Middleware Project: Big Data

## Initialization

In [1]:
DATASET_DIR = './dataset/'

In [2]:
# imports
import operator
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from time import time

In [3]:
# initialize spark
sc = pyspark.SparkContext("local[4]", "MiddlewareProject")  # 4 threads
spark = SparkSession.builder.appName("MiddlewareProject").getOrCreate()

## test vari

In [4]:
# come aprire un file
df = spark.read.csv(DATASET_DIR + 'short.csv', header=True)

In [5]:
# roba con dataframe
df.first()

Row(Year='1994', Month='1', DayofMonth='7', DayOfWeek='5', DepTime='858', CRSDepTime='900', ArrTime='954', CRSArrTime='1003', UniqueCarrier='US', FlightNum='227', TailNum='NA', ActualElapsedTime='56', CRSElapsedTime='63', AirTime='NA', ArrDelay='-9', DepDelay='-2', Origin='CLT', Dest='ORF', Distance='290', TaxiIn='NA', TaxiOut='NA', Cancelled='0', CancellationCode='NA', Diverted='0', CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA')

In [6]:
df.count()

100000

In [7]:
# replace 'NA' with null values
df = df.replace('NA', None)

In [8]:
# find columns with null values
for col in df.columns:
    num = df.filter(df[col].isNull()).count()
    if num > 0:
        print("{:17} -> {}".format(col, num))

DepTime           -> 5754
ArrTime           -> 6135
TailNum           -> 100000
ActualElapsedTime -> 6135
AirTime           -> 100000
ArrDelay          -> 6135
DepDelay          -> 5754
TaxiIn            -> 100000
TaxiOut           -> 100000
CancellationCode  -> 100000
CarrierDelay      -> 100000
WeatherDelay      -> 100000
NASDelay          -> 100000
SecurityDelay     -> 100000
LateAircraftDelay -> 100000


In [9]:
# cast columns to correct type
intcolumns = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'DepTime', 'CRSDEPTime', 'ArrTime', 'CRSArrTime',
              'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'CarrierDelay',
              'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled', 'Diverted']

for c in intcolumns:
    df = df.withColumn(c, df[c].cast('int'))

In [10]:
df.first()

Row(Year=1994, Month=1, DayofMonth=7, DayofWeek=5, DepTime=858, CRSDEPTime=900, ArrTime=954, CRSArrTime=1003, UniqueCarrier='US', FlightNum='227', TailNum=None, ActualElapsedTime=56, CRSElapsedTime=63, AirTime=None, ArrDelay=-9, DepDelay=-2, Origin='CLT', Dest='ORF', Distance=290, TaxiIn=None, TaxiOut=None, Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=None, WeatherDelay=None, NASDelay=None, SecurityDelay=None, LateAircraftDelay=None)

In [11]:
## to pandas dataframe
# dfpd = df.toPandas()

In [12]:
# dfpd.head(5)

In [13]:
# dfpd.describe()

## Actual assignment

### Initialization

In [14]:
# years = range(1994, 2009)
years = range(2004, 2005)
path = DATASET_DIR + '{' + ','.join(str(y) for y in years) + '}.csv'
print('Loading from:', path)

# load entire file
df = spark.read.csv(path, header=True)
print('Number of entries:', df.count())

Loading from: ./dataset/{2004}.csv
Number of entries: 7129270


In [15]:
# replace 'NA' with null values
df = df.replace('NA', None)

In [16]:
# cast columns to correct type
intcolumns = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'DepTime', 'CRSDEPTime', 'ArrTime', 'CRSArrTime',
              'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'CarrierDelay',
              'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled', 'Diverted']

for c in intcolumns:
    df = df.withColumn(c, df[c].cast('int'))
df.first()

Row(Year=2004, Month=1, DayofMonth=12, DayofWeek=1, DepTime=623, CRSDEPTime=630, ArrTime=901, CRSArrTime=915, UniqueCarrier='UA', FlightNum='462', TailNum='N805UA', ActualElapsedTime=98, CRSElapsedTime=105, AirTime=80, ArrDelay=-14, DepDelay=-7, Origin='ORD', Dest='CLT', Distance=599, TaxiIn='7', TaxiOut='11', Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0)

### Percentage of canceled flights per day

In [17]:
# the percentage of canceled flights per day

### Weekly percentages of delays due to weather

In [18]:
# https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [19]:
# TODO meaningful variable names?

In [20]:
from datetime import datetime

# get RDD
rdd = df.rdd

# add the "week" as key
def findweek(row):
    '''Returns the week as a tuple (year, week in the year)'''
    d = datetime(year=row['Year'], month=row['Month'], day=row['DayofMonth'])
    ic = d.isocalendar()
    return (ic[0], ic[1])
rdd = rdd.map(lambda r: (findweek(r), r))

# keep only boolean values for a generic delay and weather delays
rdd = rdd.map(lambda r: (r[0], (1 if r[1]['ArrDelay'] else 0, 1 if r[1]['WeatherDelay'] else 0)))

# count days with delay and days with weather delay
rdd = rdd.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

# compute percentage of weather delays over all delayed flights
rdd = rdd.map(lambda r: (r[0], 100 * r[1][1] / r[1][0]))

In [29]:
starttime = time()
res = rdd.collect()
endtime = time() - starttime
print('Time taken: {:.5} seconds'.format(endtime))

res = sorted(res, key=lambda x: x[0][0]*53 + x[0][1])
res

Time taken: 0.11777 seconds


[((2004, 1), 3.1154632455334657),
 ((2004, 2), 2.242076326002587),
 ((2004, 3), 2.1238652410732297),
 ((2004, 4), 1.4795516315273154),
 ((2004, 5), 3.4213123046178593),
 ((2004, 6), 2.804987725140371),
 ((2004, 7), 1.5639162306995087),
 ((2004, 8), 0.7523261173566245),
 ((2004, 9), 2.0800377759414475),
 ((2004, 10), 1.4949616009335625),
 ((2004, 11), 0.7065517321788065),
 ((2004, 12), 1.6471343463522707),
 ((2004, 13), 0.5170181925292033),
 ((2004, 14), 0.7455868689996903),
 ((2004, 15), 0.54366820705992),
 ((2004, 16), 1.211035082002824),
 ((2004, 17), 1.1338803126336665),
 ((2004, 18), 1.299238081869156),
 ((2004, 19), 0.9394368068527303),
 ((2004, 20), 2.3946133055147234),
 ((2004, 21), 2.0099709705919473),
 ((2004, 22), 1.8637598705965785),
 ((2004, 23), 1.5102243289422919),
 ((2004, 24), 2.869196998474328),
 ((2004, 25), 2.844110337588565),
 ((2004, 26), 3.656078776964635),
 ((2004, 27), 2.669925657524833),
 ((2004, 28), 1.7841308668399327),
 ((2004, 29), 2.0059060238383033),
 ((2

In [22]:
# TODO visualizzazione
# plot per settimana

In [23]:
## "a week" = tutti i voli con orario di partenza in quella settimana
## "delay" interpreto come arrival delay. 
## Testato, non ci sono weather delay che vengono "recuperati" all'arrivo

## versione 1: (accettata dal prof) <---
# numero di voli con any weather delay / numero di voli con ritardo

## versione 2: (versione "pesata")
# per ogni volo, se ha ritardo, fai weather delay / ritardo volo
# fai media

## versione 3: (aggregata)
# totale weather delay / totale ritardo

### Delay reduced per distance group

In [24]:
# the percentage of flights belonging to a given "distance group"
# that were able to halve their departure delays by the time they 
# arrived at their destinations. 

# Distance groups assort flights by their total distance in miles.
# Flights with distances that are less than 200 miles belong in group 1,
# flights with distances that are between 200 and 399 miles belong 
# in group 2, flights with distances that are between 400 and 599 miles
# belong in group 3, and so on. The last group contains flights whose
# distances are between 2400 and 2599 miles.

### Penalty scores for each airport

In [25]:
# a weekly "penalty" score for each airport that depends on both 
# the its incoming and outgoing flights. 

# The score adds 0.5 for each incoming flight that is more than 15 minutes
# late, and 1 for each outgoing flight that is more than 15 minutes late

### Our group's data analysis

In [26]:
# an additional data analysis defined by your group

# IDEE
# qualcosa con gli altri delays? security, nas, carrier
# qualche confronto tra carrier
# 

In [27]:
# prima fare con un solo file come debug, poi mettere tutti

In [28]:
# come caricare più file allo stesso tempo: