In [21]:
import pyspark

In [22]:
import pandas as pd

In [23]:
import numpy as np

In [24]:
from pyspark.sql import SparkSession

In [25]:
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [26]:
from pyspark.ml.classification import DecisionTreeClassifier

In [27]:
from pyspark.ml.classification import LogisticRegression

In [28]:
flights= spark.read.csv('../Datasets/flights.csv', header=True, inferSchema=True, nullValue='NA')

In [29]:
flights.show(2)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
only showing top 2 rows



In [30]:
flights= flights.dropna()

In [31]:
flights.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dep_time: integer (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)



In [32]:
flights= flights.withColumn('label', (flights.arr_delay >15).cast('int'))

In [33]:
flights.show(2)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|label|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|    0|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|    0|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----+
only showing top 2 rows



In [34]:
flights_train, flights_test= flights.randomSplit([0.8,0.2])

In [15]:
from pyspark.ml import Pipeline

In [16]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [17]:
indexer = StringIndexer(inputCols=['carrier'], outputCols=['carrier_idx'])

In [18]:
ohe= OneHotEncoder(inputCols=['carrier_idx'], outputCols=['carrier_dum'])

In [19]:
vec = VectorAssembler(inputCols=['carrier_dum', 'distance'], outputCol='features')

In [20]:
tree= DecisionTreeClassifier()

In [82]:
logreg= LogisticRegression()

In [83]:
pipeline = Pipeline(stages=[indexer, ohe, vec, logreg])

In [84]:
pipeline_fit= pipeline.fit(flights_train)

In [85]:
pipe_pred_train = pipeline_fit.transform(flights_train)

In [86]:
pipe_pred_test = pipeline_fit.transform(flights_test)

In [87]:
pipe_pred_train.show(2)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----+-----------+--------------+--------------------+--------------------+--------------------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|label|carrier_idx|   carrier_dum|            features|       rawPrediction|         probability|prediction|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----+-----------+--------------+--------------------+--------------------+--------------------+----------+
|2014|    1|  1|     550|        0|     837|      -12|     DL| N660DL|  1634|   SEA| SLC|      82|     689|   5|    50|    0|        3.0|(10,[3],[1.0])|(11,[3,10],[1.0,6...|[2.17098367705574...|[0.89761340536418...|       0.0|
|2014|    1|  1|     600|      -10|     842|       -8|     AS| N786AS|   426|   SEA| LAX|   

In [88]:
pipe_pred_train.groupby('prediction', 'label').count().show()

+----------+-----+-----+
|prediction|label|count|
+----------+-----+-----+
|       0.0|    0| 6716|
|       0.0|    1| 1204|
+----------+-----+-----+

