# Import libraries

In [68]:
from tabulate import tabulate
import sys, os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import col, isnan, when, count, concat, lit, substring, udf, desc
import pyspark.sql.functions as F

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator



**Arguments from command line**

In [2]:
print("\nArguments passed:", end = " ")
for i in range(1, len(sys.argv)):
    print(sys.argv[i], end = " ")


Arguments passed: -f C:\Users\Ofeucor\AppData\Roaming\jupyter\runtime\kernel-8e032ffb-8ad4-499e-bdf4-e2fe8fc27f96.json 

*python app.py --path ./Data*

In [78]:
if '--path' in sys.argv:
    dir_path = sys.argv[sys.argv.index('--path')+1]
else:
    print('Add the argument "--path" followed by the address where the data is located.')
    exit(-1)

Add the argument "--path" followed by the address where the data is located.


**We create our app**

In [3]:
spark = SparkSession.builder.appName("BigDataApp").getOrCreate()
spark.sparkContext

# LOADING THE DATA

**Create a schema for the DataFrame**

In [4]:
schema = StructType([
    StructField("Year",IntegerType(),nullable=True),
    StructField("Month",IntegerType(),nullable=True),
    StructField("DayofMonth",IntegerType(),nullable=True),
    StructField("DayOfWeek",IntegerType(),nullable=True),
    StructField("DepTime",IntegerType(),nullable=True),
    StructField("CRSDepTime",IntegerType(),nullable=True),
    StructField("ArrTime",IntegerType(),nullable=True),
    StructField("CRSArrTime",IntegerType(),nullable=True),
    StructField("UniqueCarrier",StringType(),nullable=True),
    StructField("FlightNum",IntegerType(),nullable=True),
    StructField("TailNum",StringType(),nullable=True),
    StructField("ActualElapsedTime",IntegerType(),nullable=True),
    StructField("CRSElapsedTime",IntegerType(),nullable=True),
    StructField("AirTime",IntegerType(),nullable=True),
    StructField("ArrDelay",IntegerType(),nullable=True),
    StructField("DepDelay",IntegerType(),nullable=True),
    StructField("Origin",StringType(),nullable=True),
    StructField("Dest",StringType(),nullable=True),
    StructField("Distance",IntegerType(),nullable=True),
    StructField("TaxiIn",IntegerType(),nullable=True),
    StructField("TaxiOut",IntegerType(),nullable=True),
    StructField("Cancelled",IntegerType(),nullable=True),
    StructField("CancellationCode",StringType(),nullable=True),
    StructField("Diverted",IntegerType(),nullable=True),
    StructField("CarrierDelay",IntegerType(),nullable=True),
    StructField("WeatherDelay",IntegerType(),nullable=True),
    StructField("NASDelay",IntegerType(),nullable=True),
    StructField("SecurityDelay",IntegerType(),nullable=True),
    StructField("LateAircraftDelay",IntegerType(),nullable=True)
])

**Load data into DataFrame**

In [83]:
arg = './resources/data/2008.csv.bz2'
dir_path = './resources/data'

if dir_path[-1] != '/':
    dir_path+='/'

In [84]:
files_path = []
for path in os.listdir(dir_path):
    # check if current path is a file
    if os.path.isfile(os.path.join(dir_path, path)) and path.endswith(".csv.bz2"):
        files_path.append(dir_path+path)
        
files_path#df1.union(df2)

['./resources/data/2007.csv.bz2', './resources/data/2008.csv.bz2']

In [85]:
df = spark.read.csv(path=files_path[0], schema=schema, header=True)

for f in files_path[1:]:
    df = df.union(spark.read.csv(path=f, schema=schema, header=True))
display(df.take(5))

[Row(Year=2007, Month=1, DayofMonth=1, DayOfWeek=1, DepTime=1232, CRSDepTime=1225, ArrTime=1341, CRSArrTime=1340, UniqueCarrier='WN', FlightNum=2891, TailNum='N351', ActualElapsedTime=69, CRSElapsedTime=75, AirTime=54, ArrDelay=1, DepDelay=7, Origin='SMF', Dest='ONT', Distance=389, TaxiIn=4, TaxiOut=11, Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0),
 Row(Year=2007, Month=1, DayofMonth=1, DayOfWeek=1, DepTime=1918, CRSDepTime=1905, ArrTime=2043, CRSArrTime=2035, UniqueCarrier='WN', FlightNum=462, TailNum='N370', ActualElapsedTime=85, CRSElapsedTime=90, AirTime=74, ArrDelay=8, DepDelay=13, Origin='SMF', Dest='PDX', Distance=479, TaxiIn=5, TaxiOut=6, Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0, WeatherDelay=0, NASDelay=0, SecurityDelay=0, LateAircraftDelay=0),
 Row(Year=2007, Month=1, DayofMonth=1, DayOfWeek=1, DepTime=2206, CRSDepTime=2130, ArrTime=2334, CRSArrTime=2300, UniqueCarrier='

**Remove forbidden variables**

In [7]:
df = df.drop("ArrTime").drop("ActualElapsedTime"
        ).drop("AirTime").drop("TaxiIn").drop("Diverted"
        ).drop("CarrierDelay").drop("WeatherDelay").drop("NASDelay"
        ).drop("SecurityDelay").drop("LateAircraftDelay")

In [8]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: integer (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)



In [9]:
df.show(5, False)

+----+-----+----------+---------+-------+----------+----------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+---------+----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Cancelled|CancellationCode|
+----+-----+----------+---------+-------+----------+----------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+---------+----------------+
|2008|1    |3         |4        |1343   |1325      |1435      |WN           |588      |N240WN |70            |16      |18      |HOU   |LIT |393     |9      |0        |null            |
|2008|1    |3         |4        |1125   |1120      |1245      |WN           |1343     |N523SW |85            |2       |5       |HOU   |MAF |441     |8      |0        |null            |
|2008|1    |3         |4        |2009   |2015      |2140      |WN          

# PROCESSING THE DATA

In [10]:
(df.count(), len(df.columns))

(2389217, 19)

## Unique values of the variables

In [65]:
for c in df.columns:
    print(df.groupBy(c).count().show())

+----+-------+
|Year|  count|
+----+-------+
|2008|2319115|
+----+-------+

None
+-----+------+
|Month| count|
+-----+------+
|    1|587130|
|    3|598341|
|    4|586723|
|    2|546921|
+-----+------+

None
+----------+-----+
|DayofMonth|count|
+----------+-----+
|        31|38659|
|        28|80203|
|        27|77669|
|        26|72013|
|        12|71039|
|        22|74352|
|         1|74111|
|        13|77147|
|        16|75442|
|         6|75778|
|         3|78826|
|        20|78537|
|         5|72840|
|        19|72256|
|        15|76676|
|        17|78076|
|         9|73413|
|         4|77589|
|         8|74080|
|        23|75512|
+----------+-----+
only showing top 20 rows

None
+---------+------+
|DayOfWeek| count|
+---------+------+
|        1|337657|
|        6|281129|
|        3|354455|
|        5|338154|
|        4|340309|
|        7|321369|
|        2|346042|
+---------+------+

None
+-------------+------+
|UniqueCarrier| count|
+-------------+------+
|           UA|149156|

## Removing noise

**Remove duplicated rows**

In [11]:
df = df.distinct()

In [12]:
df.groupBy(df.columns).count().filter("count > 1").show()

+----+-----+----------+---------+-------+----------+----------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+---------+----------------+-----+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Cancelled|CancellationCode|count|
+----+-----+----------+---------+-------+----------+----------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+---------+----------------+-----+
+----+-----+----------+---------+-------+----------+----------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+---------+----------------+-----+



**Remove instances of cancelled flights**

In [13]:
df.groupBy('Cancelled').count().show()

+---------+-------+
|Cancelled|  count|
+---------+-------+
|        1|  64442|
|        0|2324771|
+---------+-------+



In [14]:
df.groupBy('CancellationCode').count().show()

+----------------+-------+
|CancellationCode|  count|
+----------------+-------+
|            null|2324771|
|               B|  25744|
|               C|  12617|
|               A|  26075|
|               D|      6|
+----------------+-------+



In [15]:
df = df.filter(df.Cancelled == 0)

In [16]:
df = df.filter(df.CancellationCode.isNull())

**CancellationCode has more than 97% missing so it is removed**

In [17]:
df = df.drop('CancellationCode', 'Cancelled')

**Analyze missing values**

In [20]:
print(tabulate([[c, df.filter(col(c).isNull()).count()/df.count()] for c in df.columns], headers=['Name', 'Count %']))

Name                Count %
--------------  -----------
Year            0
Month           0
DayofMonth      0
DayOfWeek       0
DepTime         0
CRSDepTime      0
CRSArrTime      0
UniqueCarrier   0
FlightNum       0
TailNum         1.29045e-06
CRSElapsedTime  0.000123883
ArrDelay        0.00243207
DepDelay        0
Origin          0
Dest            0
Distance        0
TaxiOut         0


**TailNum and CRSElapsedTime can not be imputed from any other column, and ArrDelay is the target variable, but their number of missing values is not significant taking into account the total number, so the rows containing those missing values are removed**

In [18]:
df = df.na.drop()

In [19]:
(df.count(), len(df.columns))

(2319115, 17)

## !!!!!Date preprocess

In [23]:
df = df.withColumn(
    "Date", F.date_format(F.expr("make_date(Year, Month, DayofMonth)"), "MM/dd/yyyy")
)

df.groupBy('Date').count().show()

+----------+-----+
|      Date|count|
+----------+-----+
|04/05/2008|16939|
|03/28/2008|20534|
|04/24/2008|20524|
|01/15/2008|19167|
|01/08/2008|19104|
|03/29/2008|17168|
|03/02/2008|19095|
|03/14/2008|20591|
|04/13/2008|19442|
|01/13/2008|18549|
|03/27/2008|19997|
|02/10/2008|18438|
|04/23/2008|20320|
|03/11/2008|19882|
|02/05/2008|18322|
|01/24/2008|19899|
|04/30/2008|20311|
|03/08/2008|15670|
|03/23/2008|19309|
|03/25/2008|19971|
+----------+-----+
only showing top 20 rows



In [20]:
df = df.withColumn(
    "Season", when((df.Month>2) & (df.Month<6), 1).when((df.Month>5) & (df.Month<9), 2
        ).when((df.Month>8) & (df.Month<12), 3).otherwise(4)
)
df.groupBy('Season').count().show()

+------+-------+
|Season|  count|
+------+-------+
|     1|1185064|
|     4|1134051|
+------+-------+



**Check that DepDelay = DepTime - CRSDepTime**

In [21]:
df = df.withColumn("DepTimeNew", when(F.length(df.DepTime) == 3, concat(lit("0"),df.DepTime)) \
        .when(F.length(df.DepTime) == 2, concat(lit("00"),df.DepTime)) \
        .otherwise(df.DepTime))

In [22]:
df = df.withColumn("CRSDepTimeNew", when(F.length(df.CRSDepTime) == 3, concat(lit("0"),df.CRSDepTime)) \
        .when(F.length(df.CRSDepTime) == 2, concat(lit("00"),df.CRSDepTime)) \
        .otherwise(df.CRSDepTime))

In [23]:
df = df.withColumn("CRSArrTimeNew", when(F.length(df.CRSArrTime) == 3, concat(lit("0"),df.CRSArrTime)) \
        .when(F.length(df.CRSArrTime) == 2, concat(lit("00"),df.CRSArrTime)) \
        .otherwise(df.CRSArrTime))

In [24]:
df.select("DepTime","DepTimeNew","CRSDepTime","CRSDepTimeNew","CRSArrTime","CRSArrTimeNew").show()

+-------+----------+----------+-------------+----------+-------------+
|DepTime|DepTimeNew|CRSDepTime|CRSDepTimeNew|CRSArrTime|CRSArrTimeNew|
+-------+----------+----------+-------------+----------+-------------+
|   1215|      1215|      1220|         1220|      1324|         1324|
|   1540|      1540|      1409|         1409|      1711|         1711|
|   1835|      1835|      1755|         1755|      1845|         1845|
|    709|      0709|       655|         0655|       830|         0830|
|    700|      0700|       705|         0705|       825|         0825|
|    724|      0724|       737|         0737|       859|         0859|
|    738|      0738|       740|         0740|       835|         0835|
|    742|      0742|       745|         0745|       845|         0845|
|    852|      0852|       850|         0850|      1105|         1105|
|    905|      0905|       908|         0908|      1015|         1015|
|    930|      0930|       935|         0935|      1030|         1030|
|    9

In [25]:
df.select("DepTimeNew","CRSDepTimeNew","DepDelay").show()

+----------+-------------+--------+
|DepTimeNew|CRSDepTimeNew|DepDelay|
+----------+-------------+--------+
|      1215|         1220|      -5|
|      1540|         1409|      91|
|      1835|         1755|      40|
|      0709|         0655|      14|
|      0700|         0705|      -5|
|      0724|         0737|     -13|
|      0738|         0740|      -2|
|      0742|         0745|      -3|
|      0852|         0850|       2|
|      0905|         0908|      -3|
|      0930|         0935|      -5|
|      0940|         0940|       0|
|      1029|         1010|      19|
|      1037|         1040|      -3|
|      1051|         1040|      11|
|      1239|         1136|      63|
|      1259|         1300|      -1|
|      1325|         1315|      10|
|      1437|         1425|      12|
|      1435|         1436|      -1|
+----------+-------------+--------+
only showing top 20 rows



In [26]:
df = df.withColumn("CRSDepTimeNewHour", substring(df.CRSDepTimeNew, 1,2)) \
    .withColumn("CRSDepTimeNewMinute", substring(df.CRSDepTimeNew, 3,2)) \
    .withColumn("DepTimeNewHour", substring(df.DepTimeNew, 1,2)) \
    .withColumn("DepTimeNewMinute", substring(df.DepTimeNew, 3,2))

df = df.withColumn("CRSArrTimeNewHour", substring(df.CRSArrTimeNew, 1,2)) \
    .withColumn("CRSArrTimeNewMinute", substring(df.CRSArrTimeNew, 3,2))

In [27]:
df.select("CRSDepTimeNew","CRSDepTimeNewHour","CRSDepTimeNewMinute","DepTimeNew","DepTimeNewHour","DepTimeNewMinute").show()

+-------------+-----------------+-------------------+----------+--------------+----------------+
|CRSDepTimeNew|CRSDepTimeNewHour|CRSDepTimeNewMinute|DepTimeNew|DepTimeNewHour|DepTimeNewMinute|
+-------------+-----------------+-------------------+----------+--------------+----------------+
|         1220|               12|                 20|      1215|            12|              15|
|         1409|               14|                 09|      1540|            15|              40|
|         1755|               17|                 55|      1835|            18|              35|
|         0655|               06|                 55|      0709|            07|              09|
|         0705|               07|                 05|      0700|            07|              00|
|         0737|               07|                 37|      0724|            07|              24|
|         0740|               07|                 40|      0738|            07|              38|
|         0745|               

In [32]:
df = df.withColumn(
    "datetime",
    F.date_format(
        F.expr("make_timestamp(Year, Month, DayofMonth, CRSDepTimeNewHour, CRSDepTimeNewMinute, 0)"),
        "dd/MM/yyyy HH:mm"
    )
)

df.select("datetime").show()

+----------------+
|        datetime|
+----------------+
|28/01/2008 12:20|
|27/01/2008 14:09|
|04/01/2008 17:55|
|21/01/2008 06:55|
|21/01/2008 07:05|
|16/01/2008 07:37|
|02/01/2008 07:40|
|29/01/2008 07:45|
|07/01/2008 08:50|
|30/01/2008 09:08|
|20/01/2008 09:35|
|20/01/2008 09:40|
|28/01/2008 10:10|
|11/01/2008 10:40|
|25/01/2008 10:40|
|27/01/2008 11:36|
|30/01/2008 13:00|
|23/01/2008 13:15|
|05/01/2008 14:25|
|15/01/2008 14:36|
+----------------+
only showing top 20 rows



#   !!!!!!

In [38]:
#df = df.withColumn("DepDelayNew", when(abs((col("DepTimeNewHour").cast('long')*60 + col("DepTimeNewMinute")) - (col("CRSDepTimeNewHour").cast('long')*60 + col("CRSDepTimeNewMinute"))) < abs((col("CRSDepTimeNewHour").cast('long')*60 + col("CRSDepTimeNewMinute")) - (col("DepTimeNewHour").cast('long')*60 + col("DepTimeNewMinute"))), 
#    (col("DepTimeNewHour").cast('long')*60 + col("DepTimeNewMinute")) - (col("CRSDepTimeNewHour").cast('long')*60 + col("CRSDepTimeNewMinute"))) \
#    .otherwise((col("CRSDepTimeNewHour").cast('long')*60 + col("CRSDepTimeNewMinute")) - (col("DepTimeNewHour").cast('long')*60 + col("DepTimeNewMinute"))))
                   
df = df.withColumn("Duration", 
    (col("CRSArrTimeNewHour").cast('long')*60 + col("CRSArrTimeNewMinute").cast('long')) - (col("CRSDepTimeNewHour").cast('long')*60 + col("CRSDepTimeNewMinute").cast('long')))

AnalysisException: Column 'CRSArrTimeNewHour' does not exist. Did you mean one of the following? [ArrDelay, CRSElapsedTime, Duration, Season, TailNum, DayOfWeek, FlightNum, Origin, TaxiOut, UniqueCarrier, Year, DepDelay, Dest, Distance, DayofMonth, Month];
'Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, (((cast('CRSArrTimeNewHour as bigint) * 60) + cast('CRSArrTimeNewMinute as bigint)) - ((cast('CRSDepTimeNewHour as bigint) * 60) + cast('CRSDepTimeNewMinute as bigint))) AS Duration#2219]
+- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, Duration#1218]
   +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, DepTimeNewHour#1033, ... 4 more fields]
      +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, DepTimeNewHour#1033, ... 4 more fields]
         +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, DepTimeNewHour#1033, ... 3 more fields]
            +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, DepTimeNewHour#1033, ... 2 more fields]
               +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, DepTimeNewHour#1033, substring(DepTimeNew#838, 3, 2) AS DepTimeNewMinute#1058]
                  +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, CRSDepTimeNewMinute#1009, substring(DepTimeNew#838, 1, 2) AS DepTimeNewHour#1033]
                     +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, CRSDepTimeNewHour#986, substring(CRSDepTimeNew#858, 3, 2) AS CRSDepTimeNewMinute#1009]
                        +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CRSArrTimeNew#879, substring(CRSDepTimeNew#858, 1, 2) AS CRSDepTimeNewHour#986]
                           +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CRSDepTimeNew#858, CASE WHEN (length(cast(CRSArrTime#7 as string)) = 3) THEN concat(0, cast(CRSArrTime#7 as string)) WHEN (length(cast(CRSArrTime#7 as string)) = 2) THEN concat(00, cast(CRSArrTime#7 as string)) ELSE cast(CRSArrTime#7 as string) END AS CRSArrTimeNew#879]
                              +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, DepTimeNew#838, CASE WHEN (length(cast(CRSDepTime#5 as string)) = 3) THEN concat(0, cast(CRSDepTime#5 as string)) WHEN (length(cast(CRSDepTime#5 as string)) = 2) THEN concat(00, cast(CRSDepTime#5 as string)) ELSE cast(CRSDepTime#5 as string) END AS CRSDepTimeNew#858]
                                 +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Season#767, CASE WHEN (length(cast(DepTime#4 as string)) = 3) THEN concat(0, cast(DepTime#4 as string)) WHEN (length(cast(DepTime#4 as string)) = 2) THEN concat(00, cast(DepTime#4 as string)) ELSE cast(DepTime#4 as string) END AS DepTimeNew#838]
                                    +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, CASE WHEN ((Month#1 > 2) AND (Month#1 < 6)) THEN 1 WHEN ((Month#1 > 5) AND (Month#1 < 9)) THEN 2 WHEN ((Month#1 > 8) AND (Month#1 < 12)) THEN 3 ELSE 4 END AS Season#767]
                                       +- Filter atleastnnonnulls(17, Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20)
                                          +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20]
                                             +- Filter isnull(CancellationCode#22)
                                                +- Filter (Cancelled#21 = 0)
                                                   +- Deduplicate [TaxiOut#20, CRSDepTime#5, Month#1, Dest#17, Origin#16, DepTime#4, DayOfWeek#3, Year#0, UniqueCarrier#8, ArrDelay#14, Distance#18, DepDelay#15, FlightNum#9, CRSElapsedTime#12, CancellationCode#22, DayofMonth#2, CRSArrTime#7, TailNum#10, Cancelled#21]
                                                      +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22]
                                                         +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, LateAircraftDelay#28]
                                                            +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, SecurityDelay#27, LateAircraftDelay#28]
                                                               +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, NASDelay#26, SecurityDelay#27, LateAircraftDelay#28]
                                                                  +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, WeatherDelay#25, NASDelay#26, SecurityDelay#27, LateAircraftDelay#28]
                                                                     +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, CarrierDelay#24, WeatherDelay#25, NASDelay#26, SecurityDelay#27, LateAircraftDelay#28]
                                                                        +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiOut#20, Cancelled#21, CancellationCode#22, Diverted#23, CarrierDelay#24, WeatherDelay#25, NASDelay#26, SecurityDelay#27, LateAircraftDelay#28]
                                                                           +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiIn#19, TaxiOut#20, Cancelled#21, CancellationCode#22, Diverted#23, CarrierDelay#24, WeatherDelay#25, NASDelay#26, ... 2 more fields]
                                                                              +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, CRSElapsedTime#12, AirTime#13, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiIn#19, TaxiOut#20, Cancelled#21, CancellationCode#22, Diverted#23, CarrierDelay#24, WeatherDelay#25, ... 3 more fields]
                                                                                 +- Project [Year#0, Month#1, DayofMonth#2, DayOfWeek#3, DepTime#4, CRSDepTime#5, CRSArrTime#7, UniqueCarrier#8, FlightNum#9, TailNum#10, ActualElapsedTime#11, CRSElapsedTime#12, AirTime#13, ArrDelay#14, DepDelay#15, Origin#16, Dest#17, Distance#18, TaxiIn#19, TaxiOut#20, Cancelled#21, CancellationCode#22, Diverted#23, CarrierDelay#24, ... 4 more fields]
                                                                                    +- Relation [Year#0,Month#1,DayofMonth#2,DayOfWeek#3,DepTime#4,CRSDepTime#5,ArrTime#6,CRSArrTime#7,UniqueCarrier#8,FlightNum#9,TailNum#10,ActualElapsedTime#11,CRSElapsedTime#12,AirTime#13,ArrDelay#14,DepDelay#15,Origin#16,Dest#17,Distance#18,TaxiIn#19,TaxiOut#20,Cancelled#21,CancellationCode#22,Diverted#23,... 5 more fields] csv


In [29]:
#df = df.withColumn("DepDelayNew",col("DepDelayNew").cast(IntegerType()))
df = df.withColumn("Duration",col("Duration").cast(IntegerType()))

In [30]:
#df.select("DepTimeNew","CRSDepTimeNew","DepDelay","DepDelayNew").show()

In [31]:
df.select("CRSArrTimeNew","CRSDepTimeNew","Duration").show()

+-------------+-------------+--------+
|CRSArrTimeNew|CRSDepTimeNew|Duration|
+-------------+-------------+--------+
|         1324|         1220|      64|
|         1711|         1409|     182|
|         1845|         1755|      50|
|         0830|         0655|      95|
|         0825|         0705|      80|
|         0859|         0737|      82|
|         0835|         0740|      55|
|         0845|         0745|      60|
|         1105|         0850|     135|
|         1015|         0908|      67|
|         1030|         0935|      55|
|         1105|         0940|      85|
|         1115|         1010|      65|
|         1140|         1040|      60|
|         1140|         1040|      60|
|         1245|         1136|      69|
|         1415|         1300|      75|
|         1512|         1315|     117|
|         1650|         1425|     145|
|         1516|         1436|      40|
+-------------+-------------+--------+
only showing top 20 rows



In [33]:
df = df.drop("CRSDepTime", "CRSArrTime", "DepTime", 'DepTimeNew', 'CRSDepTimeNew', 'CRSArrTimeNew', 'CRSDepTimeNewHour',
 'CRSDepTimeNewMinute', 'DepTimeNewHour',  'DepTimeNewMinute', 'DepDelayNew', 'CRSArrTimeNewHour', 'CRSArrTimeNewMinute')

## Concordancy between related variables

**No flights with same Origin and Destination**

In [34]:
# quitarlos directamente para automatizar el proceso?
df.filter(df.Origin == df.Dest).show()

+----+-----+----------+---------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+------+--------+
|Year|Month|DayofMonth|DayOfWeek|UniqueCarrier|FlightNum|TailNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Season|Duration|
+----+-----+----------+---------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+------+--------+
+----+-----+----------+---------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+------+--------+



In [35]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Season: integer (nullable = false)
 |-- Duration: integer (nullable = true)



## Input

In [36]:
print(tabulate([[c, df.filter(col(c).isNull()).count()] for c in df.columns], headers=['Name', 'Count %']))

Name              Count %
--------------  ---------
Year                    0
Month                   0
DayofMonth              0
DayOfWeek               0
UniqueCarrier           0
FlightNum               0
TailNum                 0
CRSElapsedTime          0
ArrDelay                0
DepDelay                0
Origin                  0
Dest                    0
Distance                0
TaxiOut                 0
Season                  0
Duration             3879


In [37]:
df.filter(df.Duration.isNull()).show(5)

+----+-----+----------+---------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+------+--------+
|Year|Month|DayofMonth|DayOfWeek|UniqueCarrier|FlightNum|TailNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Season|Duration|
+----+-----+----------+---------+-------------+---------+-------+--------------+--------+--------+------+----+--------+-------+------+--------+
|2008|    1|        25|        5|           DL|      690| N619DL|           151|     -24|      -3|   ATL| BOS|     946|     11|     4|    null|
|2008|    1|         5|        6|           WN|     2025| N776WN|           200|      49|      57|   LAS| MDW|    1521|     10|     4|    null|
|2008|    1|        18|        5|           WN|     1747| N451WN|           190|      11|      28|   PHX| BNA|    1448|      7|     4|    null|
|2008|    1|        25|        5|           OH|     4948| N597SW|           139|      11|      26|   LGA| TYS|     647|     15|     4|  

In [54]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['Duration'], 
    outputCols=['Duration']
)

df = imputer.fit(df).transform(df)

## Fix format of variables

In [55]:
df = df.drop( 'Date',  'datetime', 'FlightNum', 'TailNum')

In [56]:
df.columns

['Year',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'UniqueCarrier',
 'CRSElapsedTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiOut',
 'Season',
 'Duration']

In [57]:
df.show(5)

+----+-----+----------+---------+-------------+--------------+--------+--------+------+----+--------+-------+------+--------+
|Year|Month|DayofMonth|DayOfWeek|UniqueCarrier|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Season|Duration|
+----+-----+----------+---------+-------------+--------------+--------+--------+------+----+--------+-------+------+--------+
|2008|    1|        28|        1|           EV|            64|     -19|      -5|   AGS| ATL|     143|      3|     4|      64|
|2008|    1|        27|        7|           YV|           122|      69|      91|   ORD| SAV|     773|      3|     4|     182|
|2008|    1|         4|        5|           OO|            50|      42|      40|   PDX| SEA|     129|      4|     4|      50|
|2008|    1|        21|        1|           WN|            95|      -4|      14|   DAL| BHM|     587|      5|     4|      95|
|2008|    1|        21|        1|           WN|            80|     -11|      -5|   MSY| BNA|     471|      5|     4|  

In [58]:
from pyspark.ml.feature import VectorAssembler
training = df.drop( 'Date',  'datetime', 'FlightNum', 'TailNum', 'UniqueCarrier', 'Origin', 'Dest')
training = training.select('Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSElapsedTime',
 'DepDelay', 'Distance', 'TaxiOut', 'Season', 'Duration', 'ArrDelay')

vectorAssembler = VectorAssembler(inputCols = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSElapsedTime',
 'DepDelay', 'Distance', 'TaxiOut', 'Season', 'Duration'], outputCol = 'features')
vdf = vectorAssembler.transform(training)
vdf = vdf.select(['features', 'ArrDelay'])
vdf.show(3)

+--------------------+--------+
|            features|ArrDelay|
+--------------------+--------+
|[2008.0,1.0,28.0,...|     -19|
|[2008.0,1.0,27.0,...|      69|
|[2008.0,1.0,4.0,5...|      42|
+--------------------+--------+
only showing top 3 rows



In [59]:
training.show(5)

+----+-----+----------+---------+--------------+--------+--------+-------+------+--------+--------+
|Year|Month|DayofMonth|DayOfWeek|CRSElapsedTime|DepDelay|Distance|TaxiOut|Season|Duration|ArrDelay|
+----+-----+----------+---------+--------------+--------+--------+-------+------+--------+--------+
|2008|    1|        28|        1|            64|      -5|     143|      3|     4|      64|     -19|
|2008|    1|        27|        7|           122|      91|     773|      3|     4|     182|      69|
|2008|    1|         4|        5|            50|      40|     129|      4|     4|      50|      42|
|2008|    1|        21|        1|            95|      14|     587|      5|     4|      95|      -4|
|2008|    1|        21|        1|            80|      -5|     471|      5|     4|      80|     -11|
+----+-----+----------+---------+--------------+--------+--------+-------+------+--------+--------+
only showing top 5 rows



In [60]:
splits = vdf.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [61]:
lr = LinearRegression(featuresCol = 'features', labelCol='ArrDelay', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [62]:
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.0,0.0,0.0,-0.027314036042924667,0.986464511194288,0.0,0.7983756045738953,0.0,0.0]
Intercept: -10.73518439003692


In [63]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 10.973455
r2: 0.923366


In [51]:
from pyspark.ml.classification import LogisticRegression
training = df.drop( 'Date',  'datetime', 'FlightNum', 'TailNum', 'UniqueCarrier', 'Origin', 'Dest')
training = training.select('Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSElapsedTime',
 'DepDelay', 'Distance', 'TaxiOut', 'Season', 'Duration', 'ArrDelay')
trainingData=training.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])


lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(trainingData)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(training)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 62.0 failed 1 times, most recent failure: Lost task 0.0 in stage 62.0 (TID 180) (DESKTOP-6TQUAIA executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.immutable.List.foreach(List.scala:333)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
	... 14 more


# Split

In [44]:
# split into training(60%), validation(20%) and test(20%) datasets
trainingDf, validationDf, testDf = df.randomSplit([7, 1, 2])

#print(trainingDf.take(1))

#lets cache these datasets
trainingDf.cache()
validationDf.cache()
testDf.cache()

print("Num of training observations : %s" % trainingDf.count())
print("Num of validation observations : %s" % validationRdd.count())
print("Num of test observations : %s" % testDf.count())

Py4JJavaError: An error occurred while calling o274.randomSplit.
: java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.Double
	at scala.runtime.BoxesRunTime.unboxToDouble(BoxesRunTime.java:112)
	at scala.collection.mutable.ArrayBuilder$ofDouble.addOne(ArrayBuilder.scala:402)
	at scala.collection.mutable.Growable.addAll(Growable.scala:62)
	at scala.collection.mutable.Growable.addAll$(Growable.scala:57)
	at scala.collection.mutable.ArrayBuilder.addAll(ArrayBuilder.scala:66)
	at scala.collection.IterableOnceOps.toArray(IterableOnce.scala:1282)
	at scala.collection.IterableOnceOps.toArray$(IterableOnce.scala:1276)
	at scala.collection.AbstractIterable.toArray(Iterable.scala:926)
	at org.apache.spark.sql.Dataset.randomSplit(Dataset.scala:2378)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


## Categoricals

In [None]:
#convert the categorical attributes to binary features
categoricalAttributes = ['Origin', 'Dest', 'UniqueCarrier']

#Build a list of pipelist stages for the machine learning pipeline. 
#start by the feature transformer of one hot encoder for building the categorical features
pipelineStages = []
for columnName in categoricalAttributes:
    stringIndexer = StringIndexer(inputCol=columnName, outputCol=columnName+ "Index")
    pipelineStages.append(stringIndexer)
    oneHotEncoder = OneHotEncoder(inputCol=columnName+ "Index", outputCol=columnName + "Vec")
    pipelineStages.append(oneHotEncoder)
    
    
print("%s string indexer and one hot encoders transformers" %  len(pipelineStages) )

In [None]:
# Combine all the feature columns into a single column in the dataframe
numericColumns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek',
 'UniqueCarrier', 
 'CRSElapsedTime', 'ArrDelay', 'DepDelay',
 'Origin', 'Dest', 'Distance', 'TaxiOut', 'Season', 'Duration']
categoricalCols = [s + "Vec" for s in categoricalAttributes]
allFeatureCols =  numericColumns + categoricalCols
vectorAssembler = VectorAssembler(
    inputCols=allFeatureCols,
    outputCol="features")
pipelineStages.append(vectorAssembler)
print("%s feature columns: %s" % (len(allFeatureCols),allFeatureCols))


#Build pipeline for feature extraction
featurePipeline = Pipeline(stages=pipelineStages)
featureOnlyModel = featurePipeline.fit(df)

In [None]:
#create list of Dataframes with features
trainingFeaturesDf = featureOnlyModel.transform(df)
validationFeaturesDf = featureOnlyModel.transform(validationDf)
testFeaturesDf = featureOnlyModel.transform(testDf)

#peek
trainingFeaturesDf.select("features", "label").rdd.take(2)

# Building A Machine Learning Model With Spark ML

In [92]:
from pyspark.ml.classification import LogisticRegression

# Configure an machine learning pipeline, which consists of the 
# an estimator (classification) (Logistic regression)
lr = LogisticRegression(maxIter=10, regParam=0.01)
lrPipeline = Pipeline(stages=[lr])

# Fit the pipeline to create a model from the training data
lrPipelineModel = lrPipeline.fit(trainingFeaturesDf)

def getAccuracyForPipelineModel(featuresDf, model):
    #perform prediction using the featuresdf and pipelineModel
    #compute the accuracy in percentage float
    results = model.transform(featuresDf)
    labelsAndPreds = results.map(lambda p: (p.label, p.prediction))
    return (calculateAccuracy(labelsAndPreds), results) 

# Evaluating the model on training data
lrTrainAccuracy, lrTrainResultDf = getAccuracyForPipelineModel(trainingFeaturesDf, lrPipelineModel)

# Repeat on test data
lrTestAccuracy, lrTestResultDf = getAccuracyForPipelineModel(testFeaturesDf, lrPipelineModel)

# Repeat on validation data
lrValidationAccuracy, lrValidationResultDf = getAccuracyForPipelineModel(validationFeaturesDf, lrPipelineModel)

print("==========================================")
print("LogisticRegression Model training accuracy (%) = " + str(lrTrainAccuracy))
print("LogisticRegression Model test accuracy (%) = " + str(lrTestAccuracy))
print("LogisticRegression Model validation accuracy (%) = " + str(lrValidationAccuracy))
print("==========================================")


NameError: name 'trainingFeaturesDf' is not defined

**Hyperparameter Tuning with Grid search**

In [88]:
maxIterRange = [5, 10, 30, 50, 100]
regParamRange = [1e-10, 1e-5, 1e-1]
#baseline values from previous section
bestIter = 10
bestRegParam = 0.01
bestModel = lr
bestAccuracy = lrValidationAccuracy


NameError: name 'lrValidationAccuracy' is not defined

In [76]:

#for plotting purpose
iterations = []
regParams = []
accuracies = []
for maxIter in maxIterRange:
    for rp in regParamRange:
        currentLr = LogisticRegression(maxIter=maxIter, regParam=rp)
        pipeline = Pipeline(stages=[currentLr])
        model = pipeline.fit(trainingFeaturesDf)
        
        #use validation dataset test for accuracy
        accuracy, resultDf = getAccuracyForPipelineModel(validationFeaturesDf, model)
        print "maxIter: %s, regParam: %s, accuracy: %s " % (maxIter, rp, accuracy)
        accuracies.append(accuracy)
        regParams.append(log(rp))
        iterations.append(maxIter)
        
        if accuracy > lrValidationAccuracy:
            bestIter = maxIter
            bestRegParam = rp
            bestModel = model
            bestAccuracy = accuracy


print "Best parameters: maxIter %s, regParam %s, accuracy : %s" % (bestIter, bestRegParam, bestAccuracy)

# Repeat on test data
gridTestAccuracy, gridTestResultDf = getAccuracyForPipelineModel(testFeaturesDf, bestModel)

print("==========================================")
print("Grid search Model test accuracy (%) = " + str(gridTestAccuracy))
print("==========================================")


In [91]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# We use a ParamGridBuilder to construct a grid of parameters to search over.
grid = (ParamGridBuilder()
        .addGrid(lr.maxIter, maxIterRange) 
        .addGrid(lr.regParam,regParamRange )
        .build())


evaluator = BinaryClassificationEvaluator()

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
crossValidator = CrossValidator(estimator=lrPipeline, 
                                estimatorParamMaps=grid, 
                                numFolds=5,
                                evaluator=evaluator)


# Run cross-validation, and choose the best model
bestCvModel = crossValidator.fit(trainingFeaturesDf)

# verify results on training dataset
cvTrainAccuracy, cvTrainResultDf = getAccuracyForPipelineModel(trainingFeaturesDf, bestCvModel)

# Repeat on test data
cvTestAccuracy, cvTestResultDf = getAccuracyForPipelineModel(testFeaturesDf, bestCvModel)

print("==========================================")
print("CV Model training accuracy (%) = " + str(cvTrainAccuracy))
print("CV Model test accuracy (%) = " + str(cvTestAccuracy))
print("==========================================")

NameError: name 'lrPipeline' is not defined

# Evaluating the Model

In [None]:
# Coefficients for the model
linearModel.coefficients

In [None]:
# Intercept for the model
linearModel.intercept

In [None]:
coeff_df = pd.DataFrame({"Feature": ["Intercept"] + featureCols, "Co-efficients": np.insert(linearModel.coefficients.toArray(), 0, linearModel.intercept)})
coeff_df = coeff_df[["Feature", "Co-efficients"]]
coeff_df