### Create a model to predict the flight delay over 15 minutes (ARR_DEL15) 


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark ML Class Assisgnment") \
    .getOrCreate()

In [0]:
# read the file
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType
df = (spark.read.format("csv").
  option("header", "true").
  option("nullValue", "NA").
  option("inferSchema", True).
  load("/FileStore/tables/flight_weather_small.csv"))

In this dataset,

ARR_DEL15 : 1 when the flight is delayed over 15 minutes, 0 otherwise.
XXXOrigin : Weather conditions in departure airport.
XXXDest : Weather conditions in destination airport.

In [0]:
# check sample data from table

df.printSchema()

root
 |-- X.1: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- FL_DATE: date (nullable = true)
 |-- UNIQUE_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_STATE_ABR: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DELAY: integer (nullable = true)
 |-- DEP_DELAY_NEW: integer (nullable = true)
 |-- DEP_DEL15: integer (nullable = true)
 |-- DEP_DELAY_GROUP: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- WHEELS_ON: int

In [0]:
# mark as "delayed over 15 minutes" if it's canceled
from pyspark.sql.functions import when
df = df.withColumn("ARR_DEL15", when(df["CANCELLED"] == 1, 1).otherwise(df["ARR_DEL15"]))

In [0]:
# remove flights if it's diverted
df = df.filter(df["DIVERTED"] == 0)

#### Narrow to required columns.

"ARR_DEL15",
  "MONTH",
  "DAY_OF_WEEK",
  "UNIQUE_CARRIER",
  "ORIGIN",
  "DEST",
  "CRS_DEP_TIME",
  "CRS_ARR_TIME",
  "RelativeHumidityOrigin",
  "AltimeterOrigin",
  "DryBulbCelsiusOrigin",
  "WindSpeedOrigin",
  "VisibilityOrigin",
  "DewPointCelsiusOrigin",
  "RelativeHumidityDest",
  "AltimeterDest",
  "DryBulbCelsiusDest",
  "WindSpeedDest",
  "VisibilityDest",
  "DewPointCelsiusDest"

In [0]:
# select required columns
required_columns = [
    "ARR_DEL15",
    "MONTH",
    "DAY_OF_WEEK",
    "UNIQUE_CARRIER",
    "ORIGIN",
    "DEST",
    "CRS_DEP_TIME",
    "CRS_ARR_TIME",
    "RelativeHumidityOrigin",
    "AltimeterOrigin",
    "DryBulbCelsiusOrigin",
    "WindSpeedOrigin",
    "VisibilityOrigin",
    "DewPointCelsiusOrigin",
    "RelativeHumidityDest",
    "AltimeterDest",
    "DryBulbCelsiusDest",
    "WindSpeedDest",
    "VisibilityDest",
    "DewPointCelsiusDest"
]

df = df.select(required_columns)

In [0]:
# drop rows with null value
df = df.dropna()

In [0]:
# split data into training data and evaluation data (ratio 80% : 20%)
train_data, eval_data = df.randomSplit([0.8, 0.2], seed=42)

#### Convert categorical values to index values (0, 1, ...) for the following columns.

Carrier code (UNIQUE_CARRIER)
Airport code in departure (ORIGIN)
Airport code in destination (DEST)
Flag (0 or 1) for delay over 15 minutes (ARR_DEL15)

hint: pyspark.ml.feature check StringIndexer transformer

In [0]:
from pyspark.ml.feature import StringIndexer

# convert categorical values to index values
uniqueCarrierIndexer = StringIndexer(inputCol="UNIQUE_CARRIER", outputCol="Indexed_UNIQUE_CARRIER").fit(df)
originIndexer = StringIndexer(inputCol="ORIGIN", outputCol="Indexed_ORIGIN").fit(df)
destIndexer = StringIndexer(inputCol="DEST", outputCol="Indexed_DEST").fit(df)
arrDel15Indexer = StringIndexer(inputCol="ARR_DEL15", outputCol="Indexed_ARR_DEL15").fit(df)

# apply the indexers to the df
df = uniqueCarrierIndexer.transform(df)
df = originIndexer.transform(df)
df = destIndexer.transform(df)
df = arrDel15Indexer.transform(df)

# drop the original categorical columns as they are now indexed
df = df.drop("UNIQUE_CARRIER", "ORIGIN", "DEST", "ARR_DEL15")

# rename the indexed ARR_DEL15 column back to ARR_DEL15
df = df.withColumnRenamed("Indexed_ARR_DEL15", "ARR_DEL15")

# show the transformed df
df.show(5)

+-----+-----------+------------+------------+----------------------+---------------+--------------------+---------------+----------------+---------------------+--------------------+-------------+------------------+-------------+--------------+-------------------+----------------------+--------------+------------+---------+
|MONTH|DAY_OF_WEEK|CRS_DEP_TIME|CRS_ARR_TIME|RelativeHumidityOrigin|AltimeterOrigin|DryBulbCelsiusOrigin|WindSpeedOrigin|VisibilityOrigin|DewPointCelsiusOrigin|RelativeHumidityDest|AltimeterDest|DryBulbCelsiusDest|WindSpeedDest|VisibilityDest|DewPointCelsiusDest|Indexed_UNIQUE_CARRIER|Indexed_ORIGIN|Indexed_DEST|ARR_DEL15|
+-----+-----------+------------+------------+----------------------+---------------+--------------------+---------------+----------------+---------------------+--------------------+-------------+------------------+-------------+--------------+-------------------+----------------------+--------------+------------+---------+
|    1|          3|      

#### In Spark machine learning, the feature columns must be wrapped as a single vector value.

So create new vector column named "features".

Hint: pyspark.ml.feature check VectorAssembler

In [0]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
  inputCols = [
    "MONTH",
    "DAY_OF_WEEK",
    "Indexed_UNIQUE_CARRIER",
    "Indexed_ORIGIN",
    "Indexed_DEST",
    "CRS_DEP_TIME",
    "CRS_ARR_TIME",
    "RelativeHumidityOrigin",
    "AltimeterOrigin",
    "DryBulbCelsiusOrigin",
    "WindSpeedOrigin",
    "VisibilityOrigin",
    "DewPointCelsiusOrigin",
    "RelativeHumidityDest",
    "AltimeterDest",
    "DryBulbCelsiusDest",
    "WindSpeedDest",
    "VisibilityDest",
    "DewPointCelsiusDest"],
  outputCol = "features")

# transform the df to include the features column
df = assembler.transform(df)

#### Generate classifier. Here we use Decision Tree classifier.

Hint: From pyspark.ml.classification check DecisionTreeClassifier

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# create the Decision Tree classifier
classifier = DecisionTreeClassifier(labelCol="ARR_DEL15", featuresCol="features", maxBins=300)

#### Generate SparkML pipeline and run training.
Trained model (with coefficients) and pipeline are stored in the variable "model".

In [0]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[uniqueCarrierIndexer, originIndexer, destIndexer, arrDel15Indexer, assembler, classifier])
model = pipeline.fit(train_data)

#### Predict with eveluation data.

In [0]:
# predict with evaluation data
pred = model.transform(eval_data)

#### Show eveluation result.

Hint: pyspark.ml.evaluation check MulticlassClassificationEvaluator

In [0]:
# evaluate results
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="Indexed_ARR_DEL15")
accuracy = evaluator.evaluate(pred)
print(f"Evaluation set accuracy = {accuracy:.2f}")

Evaluation set accuracy = 0.44


In [0]:
# save pipeline
#model.save("...")