# Middleware Project: Big Data

## Initialization

In [1]:
DATASET_DIR = './dataset/'

In [2]:
# imports
import operator
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
# initialize spark
sc = pyspark.SparkContext("local[4]", "MiddlewareProject")
spark = SparkSession.builder.appName("MiddlewareProject").getOrCreate()

## test vari

In [4]:
# come aprire un file
rdd = spark.read.csv(DATASET_DIR + 'short.csv', header=True)

In [5]:
# roba con dataframe
rdd.first()

Row(Year='1994', Month='1', DayofMonth='7', DayOfWeek='5', DepTime='858', CRSDepTime='900', ArrTime='954', CRSArrTime='1003', UniqueCarrier='US', FlightNum='227', TailNum='NA', ActualElapsedTime='56', CRSElapsedTime='63', AirTime='NA', ArrDelay='-9', DepDelay='-2', Origin='CLT', Dest='ORF', Distance='290', TaxiIn='NA', TaxiOut='NA', Cancelled='0', CancellationCode='NA', Diverted='0', CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA')

In [6]:
rdd.count()

100000

In [7]:
# replace 'NA' with null values
rdd = rdd.replace('NA', None)

In [8]:
# find columns with null values
for col in rdd.columns:
    num = rdd.filter(rdd[col].isNull()).count()
    if num > 0:
        print("{:17} -> {}".format(col, num))

DepTime           -> 5754
ArrTime           -> 6135
TailNum           -> 100000
ActualElapsedTime -> 6135
AirTime           -> 100000
ArrDelay          -> 6135
DepDelay          -> 5754
TaxiIn            -> 100000
TaxiOut           -> 100000
CancellationCode  -> 100000
CarrierDelay      -> 100000
WeatherDelay      -> 100000
NASDelay          -> 100000
SecurityDelay     -> 100000
LateAircraftDelay -> 100000


In [9]:
# cast columns to correct type
intcolumns = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'DepTime', 'CRSDEPTime', 'ArrTime', 'CRSArrTime',
              'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance', 'CarrierDelay',
              'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Cancelled', 'Diverted']

for c in intcolumns:
    rdd = rdd.withColumn(c, rdd[c].cast('int'))

In [10]:
rdd.first()

Row(Year=1994, Month=1, DayofMonth=7, DayofWeek=5, DepTime=858, CRSDEPTime=900, ArrTime=954, CRSArrTime=1003, UniqueCarrier='US', FlightNum='227', TailNum=None, ActualElapsedTime=56, CRSElapsedTime=63, AirTime=None, ArrDelay=-9, DepDelay=-2, Origin='CLT', Dest='ORF', Distance=290, TaxiIn=None, TaxiOut=None, Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=None, WeatherDelay=None, NASDelay=None, SecurityDelay=None, LateAircraftDelay=None)

In [11]:
# to pandas dataframe
df = rdd.toPandas()

In [21]:
df.head(5)

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,DepTime,CRSDEPTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1994,1,7,5,858.0,900,954.0,1003,US,227,...,,,0,,0,,,,,
1,1994,1,8,6,859.0,900,952.0,1003,US,227,...,,,0,,0,,,,,
2,1994,1,10,1,935.0,900,1023.0,1003,US,227,...,,,0,,0,,,,,
3,1994,1,11,2,903.0,900,1131.0,1003,US,227,...,,,0,,0,,,,,
4,1994,1,12,3,933.0,900,1024.0,1003,US,227,...,,,0,,0,,,,,


In [12]:
df.describe()

Unnamed: 0,Year,Month,DayofMonth,DayofWeek,DepTime,CRSDEPTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,ArrDelay,DepDelay,Distance,Cancelled,Diverted
count,100000.0,100000.0,100000.0,100000.0,94246.0,100000.0,93865.0,100000.0,93865.0,100000.0,93865.0,94246.0,100000.0,100000.0,100000.0
mean,1994.0,1.0,16.04311,3.96101,1375.857267,1358.4035,1499.445235,1486.77944,85.846876,87.33814,9.671305,11.378679,452.01509,0.05754,0.00381
std,0.0,0.0,8.889891,2.066788,451.669424,444.294523,455.402647,445.426795,47.624833,46.30459,26.917011,23.883334,354.019212,0.232873,0.061608
min,1994.0,1.0,1.0,1.0,1.0,530.0,1.0,5.0,15.0,19.0,-41.0,-13.0,32.0,0.0,0.0
25%,1994.0,1.0,8.0,2.0,1000.0,950.0,1127.0,1120.0,56.0,60.0,-5.0,0.0,248.0,0.0,0.0
50%,1994.0,1.0,16.0,4.0,1356.0,1345.0,1518.0,1513.0,71.0,73.0,2.0,2.0,335.0,0.0,0.0
75%,1994.0,1.0,24.0,6.0,1744.0,1730.0,1902.0,1853.0,99.0,100.0,14.0,13.0,526.0,0.0,0.0
max,1994.0,1.0,31.0,7.0,2400.0,2359.0,2400.0,2400.0,560.0,370.0,423.0,434.0,2521.0,1.0,1.0


## Actual assignment

In [14]:
# the percentage of canceled flights per day

In [15]:
# weekly percentages of delays that are due to weather

In [16]:
# the percentage of flights belonging to a given "distance group"
# that were able to halve their departure delays by the time they 
# arrived at their destinations. 

# Distance groups assort flights by their total distance in miles.
# Flights with distances that are less than 200 miles belong in group 1,
# flights with distances that are between 200 and 399 miles belong 
# in group 2, flights with distances that are between 400 and 599 miles
# belong in group 3, and so on. The last group contains flights whose
# distances are between 2400 and 2599 miles.

In [17]:
# a weekly "penalty" score for each airport that depends on both 
# the its incoming and outgoing flights. 

# The score adds 0.5 for each incoming flight that is more than 15 minutes
# late, and 1 for each outgoing flight that is more than 15 minutes late

In [18]:
# an additional data analysis defined by your group

In [19]:
# prima fare con un solo file come debug, poi mettere tutti

In [20]:
# come caricare più file allo stesso tempo: