### Test Model

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
sc = SparkContext(master="local[1]")
sc

In [5]:
import pandas as pd
fl_temp = pd.read_csv('2007.csv')

In [17]:
del fl_temp

In [13]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('test').getOrCreate()

In [14]:
df = spark.read.csv('2007.csv', inferSchema=True, header=True)

In [15]:
df1 = df.limit(1000)

In [16]:
df1.show(1)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|2007|    1|         1|        1|   1232|      1225|   1341|      1340|           WN|     2891

In [21]:
df1.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- C

In [22]:
df1 = df1.drop('UniqueCarrier', 'FlightNum', 'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 
               'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 
               'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay')

In [23]:
df1.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- ArrDelay: string (nullable = true)



In [33]:
df_new = df1.select(df1.Month.cast("string"), df1.DayofMonth, df1.DayOfWeek, df1.DepTime.cast("float"), df1.CRSDepTime, 
           df1.ArrTime.cast("float"), df1.CRSArrTime, df1.ArrDelay.cast("float"))

In [34]:
df_new.printSchema()

root
 |-- Month: string (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: float (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: float (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- ArrDelay: float (nullable = true)



In [89]:
df_new.show(2)

+----------+---------+-------+----------+-------+----------+--------+----------+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|ArrDelay|  business|
+----------+---------+-------+----------+-------+----------+--------+----------+
|         1|        1| 1232.0|      1225| 1341.0|      1340|     1.0|Low Season|
|         1|        1| 1918.0|      1905| 2043.0|      2035|     8.0|Low Season|
+----------+---------+-------+----------+-------+----------+--------+----------+
only showing top 2 rows



In [76]:
from pyspark.sql import functions as F
df_new = df_new.withColumn('business', F.array(F.lit("High Season"), F.lit("Low Season"), F.lit("Intermediate"), )\
                 .getItem((F.rand()*3).cast("int")))

In [77]:
df_new = df_new.drop('Month')

In [112]:
df_new = df_new.na.drop()

In [113]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

In [114]:
cols = df_new.columns

In [115]:
[i for i in cols]

['DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'ArrDelay',
 'business']

In [116]:
cols_string = ['business']
cols_num = ['DayofMonth','DayOfWeek','DepTime','CRSDepTime','ArrTime','CRSArrTime','ArrDelay']

In [117]:
stages= []
for col in cols_string:
    stringIndexer = StringIndexer(inputCol = col , outputCol = col+'_StringIndex')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[col + '_ClassVect'])
stages += [stringIndexer, encoder]

In [118]:
assemblerInputs = [c + "_ClassVect" for c in cols_string] + cols_num

In [119]:
assemblerInputs

['business_ClassVect',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'ArrDelay']

In [120]:
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [121]:
stages += [assembler]

In [122]:
pipeline = Pipeline(stages=stages)

In [123]:
pipelineModel = pipeline.fit(df_new)

In [124]:
out_df = pipelineModel.transform(df_new)

In [125]:
out_df.columns

['DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'ArrDelay',
 'business',
 'business_StringIndex',
 'business_ClassVect',
 'features']

In [126]:
out_df.show(1)

+----------+---------+-------+----------+-------+----------+--------+----------+--------------------+------------------+--------------------+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|ArrDelay|  business|business_StringIndex|business_ClassVect|            features|
+----------+---------+-------+----------+-------+----------+--------+----------+--------------------+------------------+--------------------+
|         1|        1| 1232.0|      1225| 1341.0|      1340|     1.0|Low Season|                 0.0|     (2,[0],[1.0])|[1.0,0.0,1.0,1.0,...|
+----------+---------+-------+----------+-------+----------+--------+----------+--------------------+------------------+--------------------+
only showing top 1 row



In [127]:
train, test = out_df.randomSplit([0.80, 0.20], seed=1)

In [128]:
lr = LinearRegression(maxIter=10, featuresCol="features", labelCol="ArrDelay")

In [129]:
lrmodel = lr.fit(train)

In [130]:
trainingSummary = lrmodel.summary

In [133]:
trainingSummary.totalIterations

11

In [134]:
trainingSummary.objectiveHistory

[0.5,
 0.20138194142528876,
 0.0579458764671586,
 0.00013909001712753,
 3.0862810309284505e-05,
 2.1769533005477903e-05,
 2.053059468232199e-05,
 1.2255359182256065e-05,
 7.493118569734669e-06,
 6.974512186785908e-06,
 2.53228374702541e-06]

In [135]:
trainingSummary.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-0.01728633432284...|
|-0.00695610006483...|
|-0.03592328109586...|
|0.056600330021144465|
| 0.02016084153481401|
|3.004352001738652...|
|0.019821441940166704|
| 0.01649448418504562|
|0.009582999306423368|
|0.020624569201370235|
|0.003045160937494...|
|-0.00957254113564...|
|0.023119321430641993|
|-0.02229109878016...|
|-0.00614336746321...|
|-0.00299653921125...|
|5.225180427714804E-5|
|-0.01095814757118...|
|0.012912306232155224|
|0.013578802806035739|
+--------------------+
only showing top 20 rows



In [136]:
trainingSummary.rootMeanSquaredError

0.04844938912109281

In [137]:
trainingSummary.r2

0.999994935432506

In [138]:
from pyspark.ml.evaluation import RegressionEvaluator

In [139]:
predictions = lrmodel.transform(test)

In [144]:
predictions.show(2)

+----------+---------+-------+----------+-------+----------+--------+-----------+--------------------+------------------+--------------------+------------------+
|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|ArrDelay|   business|business_StringIndex|business_ClassVect|            features|        prediction|
+----------+---------+-------+----------+-------+----------+--------+-----------+--------------------+------------------+--------------------+------------------+
|         1|        1|  715.0|       715|  722.0|       730|    -8.0| Low Season|                 0.0|     (2,[0],[1.0])|[1.0,0.0,1.0,1.0,...|-7.984643598803764|
|         1|        1|  739.0|       740|  832.0|       840|    -8.0|High Season|                 1.0|     (2,[1],[1.0])|[0.0,1.0,1.0,1.0,...|-8.016700743670981|
+----------+---------+-------+----------+-------+----------+--------+-----------+--------------------+------------------+--------------------+------------------+
only showing top 2 rows



In [147]:
evaluator = RegressionEvaluator(labelCol = "ArrDelay", predictionCol="prediction", metricName="rmse")

In [148]:
print("Test set AOC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})))

Test set AOC: 0.047992191205631154
