In [1]:
# Locations of the two sample datasets used throughout this notebook
tripdelaysFilePath = "/databricks-datasets/flights/departuredelays.csv"
airportsnaFilePath = "/databricks-datasets/flights/airport-codes-na.txt"

# Load the airport-code file: tab-delimited, with a header row, column types inferred
airportsna = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("delimiter", "\t")
    .load(airportsnaFilePath)
)
# Expose the DataFrame to SQL statements under the table name "airports_na"
airportsna.registerTempTable("airports_na")


In [2]:
# Render the airports DataFrame through the notebook's display() helper
display(airportsna)

In [3]:
# Bare expression: as the last statement of the cell, the notebook echoes this object's repr
airportsna

In [4]:
# "airports_na" is the name of a registered temp table, not a Python variable, so
# referencing it bare raises NameError. Look the table up through the SQL context;
# as the last expression of the cell, the resulting DataFrame's repr is echoed.
sqlContext.table("airports_na")


In [5]:
# Load the departure-delay CSV (header row present; no schema inference is requested here)
departureDelays = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load(tripdelaysFilePath)
)
# Make the data queryable from SQL as "departureDelays"
departureDelays.registerTempTable("departureDelays")
# Mark the DataFrame for in-memory caching (lazy); the original author notes that
# sqlContext.cacheTable is a greedy alternative.
departureDelays.cache()

In [6]:
# Available IATA codes from the departureDelays sample dataset:
# every code that appears as either an origin or a destination.
tripIATA = sqlContext.sql("select distinct iata from (select distinct origin as iata from departureDelays union all select distinct destination as iata from departureDelays) a")
tripIATA.registerTempTable("tripIATA")

# Only include airports with at least one trip from the departureDelays dataset
airports = sqlContext.sql("select f.IATA, f.City, f.State, f.Country from airports_na f join tripIATA t on t.IATA = f.IATA")
airports.registerTempTable("airports")
airports.cache()

# Join each trip with its origin and destination airport and derive typed columns.
#  - tripid:     the raw `date` field cast to int
#  - localdate:  timestamp assembled from `date` by taking two-character slices at
#                positions 1, 3, 5, 7 as month, day, hour, minute; the year is hard-coded to 2014
#  - delay_bool: 1 when delay < 0, else 0
#    NOTE(review): this makes a *negative* delay the positive class -- confirm that is intended.
#  - distance:   aliased explicitly so downstream cells can refer to the column by name
#                instead of relying on the engine's auto-generated name for an unaliased cast
departureDelays_geo = sqlContext.sql("""
    select cast(f.date as int) as tripid,
           cast(concat('2014-',
                       substr(cast(f.date as string), 1, 2), '-',
                       substr(cast(f.date as string), 3, 2), ' ',
                       substr(cast(f.date as string), 5, 2), ':',
                       substr(cast(f.date as string), 7, 2), ':00') as timestamp) as `localdate`,
           cast(f.delay as int) as delay,
           cast(f.delay < 0 as int) as delay_bool,
           cast(f.distance as int) as distance,
           f.origin as src,
           f.destination as dst,
           o.city as city_src,
           d.city as city_dst,
           o.state as state_src,
           d.state as state_dst
      from departureDelays f
      join airports o on o.iata = f.origin
      join airports d on d.iata = f.destination
""")

# RegisterTempTable
departureDelays_geo.registerTempTable("departureDelays_geo")

# Cache and Count: count() is an action, so it forces the lazily cached data to be computed
departureDelays_geo.cache()
departureDelays_geo.count()

In [7]:
# Number of distinct destination airports in the raw delays table.
# The table is referenced with the exact case it was registered under ("departureDelays")
# so the query does not depend on case-insensitive table resolution.
display(sqlContext.sql("select count(distinct destination) from departureDelays"))


In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# STRING INDEXER EXAMPLE: outputs categoryIndex after run in category
#  id | category | categoryIndex
# ----|----------|---------------
#  0  | a        | 0.0
#  1  | b        | 2.0
#  2  | c        | 1.0
#  3  | a        | 0.0
#  4  | a        | 0.0
#  5  | c        | 1.0

# ONE HOT ENCODER: changes numeric value into binary vector with a 1 in the place of value

categoricalColumns = ["src", "dst"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  # Add stages.  These are not run here, but will run all at once later on.
  stages += [stringIndexer, encoder]

# Index the target column into "label".
# NOTE(review): as the example table above shows, StringIndexer does not preserve the input
# values, so label 1.0 is not guaranteed to mean delay_bool == 1 -- confirm before
# interpreting predictions.
label_stringIdx = StringIndexer(inputCol = "delay_bool", outputCol = "label")
stages += [label_stringIdx]

# Feature vector = one-hot encoded categorical columns followed by the numeric columns.
# Requires departureDelays_geo to expose a column literally named "distance".
numericCols = ["distance"]
# A list comprehension is used instead of map(...) + list: concatenating map's result with a
# list only works where map returns a list (Python 2).
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols

assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(departureDelays_geo)
dataset = pipelineModel.transform(departureDelays_geo)

# Keep relevant columns.
# The select below is intentionally left disabled, so `dataset` still carries the
# intermediate *Index / *classVec columns and `selectedcols` is currently unused.
selectedcols = ["label", "features"] + departureDelays_geo.columns
# dataset = dataset.select(selectedcols)
display(dataset)

# 70/30 train/test split with a fixed seed
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
# print(...) with a single argument prints the same in Python 2 and also parses in Python 3
print(trainingData.count())
print(testData.count())

In [9]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the assembled "features" vector, predicting "label";
# the optimiser is capped at 10 iterations.
lr = LogisticRegression(maxIter=10, featuresCol="features", labelCol="label")

# Fit on the training split, then score the held-out test split
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show which columns the fitted model appended
predictions.printSchema()

# Side-by-side view of the label, the predicted class and the class probabilities
selected = predictions.select(*["label", "prediction", "probability"])
display(selected)

In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model on the test-set predictions.
# The metric name and value are printed explicitly: a notebook only echoes the LAST
# expression of a cell, so bare evaluate()/getMetricName() calls in the middle of the
# cell would be computed and silently discarded.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
metricValue = evaluator.evaluate(predictions)
print("%s = %s" % (evaluator.getMetricName(), metricValue))

# Dump every LogisticRegression parameter with its documentation and current value
print(lr.explainParams())