In [1]:
import findspark
# Must run before `import pyspark` below; presumably this is what makes the
# local Spark installation importable -- confirm against the findspark docs.
findspark.init()

# NOTE(review): the return value is discarded and, because this is not the last
# expression of the cell, nothing is displayed -- the call looks like a leftover.
findspark.find()
import pyspark

In [42]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Print the session object as a quick check that it exists.
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000029F6F8E0648>


In [7]:
# Read both CSV files, using their first row as column names.
# No schema is supplied, so column types are whatever the reader assigns.
flights, planes = (
    spark.read.csv(csv_path, header=True)
    for csv_path in ('flights_small.csv', 'planes.csv')
)

# Join the DataFrames

In [10]:
# Rename the planes table's `year` to `plane_year` so it stays distinct from
# the `year` column that is also present after the join.
planes = planes.withColumnRenamed('year', 'plane_year')

# Left outer join on the tail number.
model_data = flights.join(planes, 'tailnum', 'leftouter')

# String to integer

In [14]:
# Inspect the (column, type) pairs of the joined frame; the recorded output
# shows every column as 'string', which is why the casts further down are needed.
model_data.dtypes

[('tailnum', 'string'),
 ('year', 'string'),
 ('month', 'string'),
 ('day', 'string'),
 ('dep_time', 'string'),
 ('dep_delay', 'string'),
 ('arr_time', 'string'),
 ('arr_delay', 'string'),
 ('carrier', 'string'),
 ('flight', 'string'),
 ('origin', 'string'),
 ('dest', 'string'),
 ('air_time', 'string'),
 ('distance', 'string'),
 ('hour', 'string'),
 ('minute', 'string'),
 ('plane_year', 'string'),
 ('type', 'string'),
 ('manufacturer', 'string'),
 ('model', 'string'),
 ('engines', 'string'),
 ('seats', 'string'),
 ('speed', 'string'),
 ('engine', 'string')]

In [15]:
# Cast these string columns to integer, one at a time and in this order.
for col_name in ('arr_delay', 'air_time', 'month', 'plane_year'):
    model_data = model_data.withColumn(col_name, model_data[col_name].cast('integer'))

# Create a new column

In [16]:
# plane_age = flight `year` minus `plane_year`.
# NOTE(review): `year` was not among the columns cast to integer, so it is still
# a string here and the subtraction relies on Spark coercing it -- confirm the
# resulting column type is what is intended.
model_data = model_data.withColumn(
    'plane_age',
    model_data['year'] - model_data['plane_year'],
)

# Making a Boolean

In [18]:
# Boolean flag: did the flight arrive with a positive delay?
model_data = model_data.withColumn('is_late', model_data['arr_delay'] > 0)

# Integer copy of the flag, stored as `label`.
model_data = model_data.withColumn('label', model_data['is_late'].cast('integer'))

# Keep only rows where none of the four listed columns is NULL.
model_data = model_data.filter(
    'arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL'
)

In [19]:
# Number of rows left after the NULL filter.
model_data.count()

6216

# Encode the carrier column

In [21]:
# Two-step encoding of `carrier`: the indexer writes `carrier_index`,
# and the encoder turns that index into `carrier_fact`.
carr_indexer = StringIndexer(inputCol='carrier', outputCol='carrier_index')
carr_encoder = OneHotEncoder(inputCol='carrier_index', outputCol='carrier_fact')

# Encode the destination column

In [22]:
# Two-step encoding of `dest`: the indexer writes `dest_index`,
# and the encoder turns that index into `dest_fact`.
dest_indexer = StringIndexer(inputCol='dest', outputCol='dest_index')
dest_encoder = OneHotEncoder(inputCol='dest_index', outputCol='dest_fact')

# Assemble a vector

In [24]:
# Columns that are combined, in this order, into the single `features` column.
input_cols = [
    'month',
    'air_time',
    'carrier_fact',
    'dest_fact',
    'plane_age',
]
vec_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

# Create the pipeline

In [26]:
# Stage order matters: each indexer precedes the encoder that reads its output,
# and the assembler comes last because it consumes both *_fact columns.
flights_pipe = Pipeline(
    stages=[dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler]
)

# Transform the data

In [27]:
# Fit the pipeline on the full dataset, then apply the fitted pipeline to that
# same dataset. Note this happens before the train/test split.
fitted_pipe = flights_pipe.fit(model_data)
piped_data = fitted_pipe.transform(model_data)

# Split the data

In [32]:
# Split rows roughly 80/20 into training and test sets.
# A fixed seed keeps the split the same when the notebook is re-run on the same
# data; without it every run draws a different split and the final metric drifts.
train, test = piped_data.randomSplit([.8, .2], seed=42)

# Create the model estimator

In [37]:
# Logistic regression estimator built with no arguments, so every setting is the
# library default; regParam and elasticNetParam are varied later by the grid.
# NOTE(review): presumably the default input columns are `features` / `label`,
# matching the columns created earlier -- confirm.
lr = LogisticRegression()

# Create the evaluator

In [39]:
# Evaluator whose metric is the area under the ROC curve; it is used both for
# model selection in cross-validation and for the final test-set score.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

# Make a grid for fine-tuning

In [41]:
# Hyperparameter grid, built as one fluent chain:
#   regParam        -> values from 0 up to (not including) 0.1 in steps of 0.01
#   elasticNetParam -> 0 and 1
grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, 0.1, 0.01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

# Make the validator

In [43]:
# Cross-validator that tries every parameter map in `grid` on `lr` and ranks
# them with `evaluator`.
# A fixed seed makes the assignment of rows to folds repeatable, so re-running
# the notebook selects the same best model on the same training data.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    seed=42)

# Fit the model

In [44]:
# Run the cross-validated grid search on the training split only.
models = cv.fit(train)

In [45]:
# Take the best model found by the cross-validation run.
best_lr = models.bestModel

# Evaluate the model

In [46]:
# Apply the selected model to the held-out test split.
test_results = best_lr.transform(test)

# `evaluator` was configured with metricName='areaUnderROC', so the number it
# returns is an AUC, not an accuracy -- label the printed value accordingly.
test_auc = evaluator.evaluate(test_results)
print(f'test AUC: {test_auc: .1%}')

test acc:  71.0%
