In [63]:
import pyspark

In [64]:
from pyspark import SparkContext as sc

In [65]:
from pyspark.sql import SparkSession

In [66]:
import pandas as pd
import numpy as np

In [67]:
from pyspark.ml.feature import StandardScaler,StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder

In [68]:
from pyspark.ml import pipeline

In [69]:
import sklearn
from sklearn.model_selection import train_test_split

In [113]:
spark = (SparkSession.builder
                  .getOrCreate())

In [141]:
flights = spark.read.format("csv") \
       .option("header", "true") \
       .load('../Datasets/flights.csv') 

In [142]:
flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
+----+-----+---+--------+---------+-----

In [143]:
flights.printSchema()

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: string (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)



In [144]:
flights= flights.withColumn('arr_time', flights.arr_time.cast('int'))

In [145]:
flights= flights.withColumn('dep_time', flights.dep_time.cast('int'))

In [146]:
flights= flights.withColumn('arr_delay', flights.arr_delay.cast('int'))

In [147]:
flights= flights.withColumn('air_time', flights.air_time.cast('int'))

In [148]:
flights= flights.withColumn('distance', flights.distance.cast('int'))

In [149]:
planes= spark.read.format("csv") \
       .option("header", "true") \
       .load('../Datasets/planes.csv')

In [150]:
planes.show(2)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 2 rows



In [151]:
planes= planes.withColumnRenamed('year', 'plane_year')

In [152]:
planes.show(2)

+-------+----------+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|plane_year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----------+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|      1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N103US|      1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
+-------+----------+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 2 rows



In [153]:
flights.show(2)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
only showing top 2 rows



In [154]:
model_data = flights.join(planes, on ='tailnum', how='leftouter')

In [155]:
model_data.show(2)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|      2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL|     360|    2677|  10|    40|      2006|Fixed wing multi ...|      BOEING| 737-890|      2|  1

In [156]:
model_data = model_data.withColumn('month', model_data.month.cast('int'))

In [157]:
model_data = model_data.withColumn('plane_year', model_data.plane_year.cast('int'))

In [158]:
model_data.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: integer (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- plane_year: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engines: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- speed: string (nullable = true)
 |-- engine: string (nullable = true)



In [159]:
model_data= model_data.withColumn('plane_age', model_data.year - model_data.plane_year)

In [160]:
model_data  = model_data.withColumn('is_late', model_data.arr_delay >0)

In [161]:
model_data = model_data.withColumn('label', model_data.is_late.cast('int'))

In [162]:
model_data.show(2)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+------------+--------+-------+-----+-----+---------+---------+-------+-----+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|      2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|      3.0|  false|    0|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL

In [163]:
model_data = model_data.filter('arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL')

In [164]:
model_data.show()

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+---------+-------+-----+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|  manufacturer|      model|engines|seats|speed|   engine|plane_age|is_late|label|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+--------------+-----------+-------+-----+-----+---------+---------+-------+-----+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|      2011|Fixed wing multi ...|        AIRBUS|   A320-214|      2|  182|   NA|Turbo-fan|      3.0|  false|    0|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     A

In [165]:
carr_indexer = StringIndexer(inputCol='carrier', outputCol='carrier_index')

In [167]:
carr_encoder= OneHotEncoder(inputCol='carrier_index', outputCol= 'carrier_fact')

In [168]:
dest_indexer= StringIndexer(inputCol='dest', outputCol='dest_index')

In [170]:
dest_encoder = OneHotEncoder(inputCol='dest_index', outputCol='dest_fact')

In [171]:
from pyspark.ml.feature import VectorAssembler

In [172]:
vec_assembler = VectorAssembler(inputCols= ['month', 'air_time', 'carrier_fact', 'dest_fact', 'plane_age'], outputCol='features')

In [173]:
from pyspark.ml import Pipeline

In [174]:
flights_pipe = Pipeline(stages= [dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])

In [175]:
piped_data= flights_pipe.fit(model_data).transform(model_data)

In [176]:
training, test= piped_data.randomSplit([.6,.4])

In [196]:
training.show(3)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+----------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|plane_year|                type|    manufacturer|   model|engines|seats|speed|   engine|plane_age|is_late|label|dest_index|      dest_fact|carrier_index|  carrier_fact|            features|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----------+--------------------+----------------+--------+-------+-----+-----+---------+---------+-------+-----+----------+---------------+-------------+--------------+--------------------+
| N105UW|2014|    3| 13|    1325|        5|    2123|       13|     US|  1

In [177]:
from pyspark.ml.classification import LogisticRegression

In [178]:
lr = LogisticRegression()

In [179]:
import pyspark.ml.evaluation as evals

In [180]:
evaluator = evals.BinaryClassificationEvaluator(metricName='areaUnderROC')

In [181]:
import pyspark.ml.tuning as tune

In [182]:
grid= tune.ParamGridBuilder()

In [183]:
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))

In [184]:
grid = grid.addGrid(lr.elasticNetParam, [0,1])

In [185]:
grid = grid.build()

In [186]:
cv = tune.CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

In [187]:
models = cv.fit(training)

In [188]:
best_lr = models.bestModel

In [189]:
print(best_lr)

LogisticRegressionModel: uid=LogisticRegression_85f1a077b4c7, numClasses=2, numFeatures=81


In [191]:
best_lr2 = lr.fit(training)

In [192]:
best_lr2

LogisticRegressionModel: uid=LogisticRegression_85f1a077b4c7, numClasses=2, numFeatures=81

In [193]:
test_results= best_lr2.transform(test)

In [194]:
evaluator.evaluate(test_results)

0.6778400243924788