In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import udf
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from IPython.display import display
from ipywidgets import interact
from pyspark.sql.functions import *
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from operator import add
import sys
import numpy as np
import pandas as pd
import time
import datetime
from __future__ import print_function
from pyspark.mllib.tree import RandomForest, RandomForestModel


from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext(appName="local")
spark = SparkSession.builder.appName('name').config('spark.sql.shuffle.partitions',5).getOrCreate()

# 1. load data
## 1.1 big trainset

In [4]:
data=spark.read.csv('data/train_flight.csv',header=True,inferSchema=True)
data.printSchema()

root
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (nullable = true)
 |-- schedule_departure: integer (nullable = true)
 |-- NEW_DAY: integer (nullable = true)



## 1.2 small_trainset

In [2]:
data_small=spark.read.csv('data/small.csv',header=True,inferSchema=True)
data_small.count()

45616

## 1.3 choose one dataset

In [27]:
 # can be updated
#dataset=data_small
dataset=data
print(dataset.count())
dataset.printSchema()

4571729
root
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (nullable = true)
 |-- schedule_departure: integer (nullable = true)
 |-- NEW_DAY: integer (nullable = true)



# 0.1 change label to classification

# 0.2 change label to doubletype


In [10]:
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DoubleType
name = 'DEPARTURE_DELAY'

udf = UserDefinedFunction(lambda x: x*1.0, DoubleType())

new_data=dataset.select('*',udf(dataset['DEPARTURE_DELAY']).alias('double_labels'))
dataset=new_data.drop('DEPARTURE_DELAY')
dataset=dataset.withColumnRenamed('double_labels','DEPARTURE_DELAY')
dataset.printSchema()

root
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (nullable = true)
 |-- schedule_departure: integer (nullable = true)
 |-- NEW_DAY: integer (nullable = true)
 |-- DEPARTURE_DELAY: double (nullable = true)



# 2 feature transformation pipeline
## 2.1 feature selection (can be updated)

In [11]:
# dataset=data
categoricalColumns = ['ORIGIN_AIRPORT']  # to add
numericCols = ['SCHEDULED_TIME','NEW_DAY','SCHEDULED_ARRIVAL','DISTANCE']  # to add
# all_features=categoricalColumns+numericCols

## 2.2 transform and onehot

In [12]:

from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
cols=dataset.columns

stages = [] 
feature_names=[]
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, 
        outputCol=categoricalCol+"Index")
    encoder = OneHotEncoder(inputCol=categoricalCol+"Index", 
        outputCol=categoricalCol+"classVec")
    stages += [stringIndexer, encoder]

assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(dataset)
dataset_transformed = pipelineModel.transform(dataset)
selectedcols = ['DEPARTURE_DELAY', "features"] 
dataset_transformed = dataset_transformed.select(selectedcols)
dataset_transformed=dataset_transformed.select('*').withColumnRenamed('DEPARTURE_DELAY','label')
dataset_transformed.printSchema()
dataset_transformed.select('features').show()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+
|            features|
+--------------------+
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[243,627,628...|
|(631,[333,627,628...|
+--------------------+
only showing top 20 rows



## 2.7 feature importances

In [36]:
from pyspark.ml.regression import RandomForestRegressor
rf= RandomForestRegressor(numTrees=5, maxDepth=3, seed=42)
model = rf.fit(dataset_transformed)
print assemblerInputs
model.featureImportances

['ORIGIN_AIRPORTclassVec', 'SCHEDULED_TIME', 'NEW_DAY']


(548,[1,27,36,57,143,295,321,322,400,526,546,547],[0.0408834454234,0.00876645216567,0.0327657812684,0.0767878620169,0.0108964305701,0.155255353251,0.0174407157481,0.0242766147333,0.0499435380631,0.0662880595107,0.063992319078,0.452703428171])

# 4 use ML library

# 4.1 train model (RandomForest)

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

dataset_used,left_behind=dataset_transformed.randomSplit((0.01,0.99),1)

numFolds =2
trainingData,testData=dataset_used.randomSplit((0.8,0.2),1)

rf = RandomForestRegressor(labelCol="label", featuresCol="features")   
paramGrid = ParamGridBuilder()\
    .addGrid(rf.numTrees,[100]) \
    .build()
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    numFolds=numFolds)

model = crossval.fit(trainingData)

Py4JJavaError: An error occurred while calling o224.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 19.0 failed 1 times, most recent failure: Lost task 2.0 in stage 19.0 (TID 56, localhost, executor driver): org.apache.spark.util.TaskCompletionListenerException: sun.nio.ch.DirectBuffer.cleaner()Lsun/misc/Cleaner;

Previous exception in task: sun.nio.ch.DirectBuffer.cleaner()Lsun/misc/Cleaner;
	org.apache.spark.storage.StorageUtils$.cleanDirectBuffer(StorageUtils.scala:293)
	org.apache.spark.storage.StorageUtils$.dispose(StorageUtils.scala:288)
	org.apache.spark.storage.StorageUtils.dispose(StorageUtils.scala)
	org.apache.spark.io.NioBufferedFileInputStream.close(NioBufferedFileInputStream.java:130)
	java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
	org.spark_project.guava.io.Closeables.close(Closeables.java:77)
	org.apache.spark.sql.execution.python.DiskRowQueue.close(RowQueue.scala:152)
	org.apache.spark.sql.execution.python.HybridRowQueue.remove(RowQueue.scala:258)
	org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$7.apply(BatchEvalPythonExec.scala:166)
	org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$7.apply(BatchEvalPythonExec.scala:158)
	scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.sort_addToSorter1$(Unknown Source)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.sort_addToSorter$(Unknown Source)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1799)
	org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	org.apache.spark.scheduler.Task.run(Task.scala:108)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
	java.base/java.lang.Thread.run(Thread.java:844)
	at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:138)
	at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:116)
	at org.apache.spark.scheduler.Task.run(Task.scala:118)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
	at java.base/java.lang.Thread.run(Thread.java:844)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:118)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:130)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: org.apache.spark.util.TaskCompletionListenerException: sun.nio.ch.DirectBuffer.cleaner()Lsun/misc/Cleaner;

Previous exception in task: sun.nio.ch.DirectBuffer.cleaner()Lsun/misc/Cleaner;
	org.apache.spark.storage.StorageUtils$.cleanDirectBuffer(StorageUtils.scala:293)
	org.apache.spark.storage.StorageUtils$.dispose(StorageUtils.scala:288)
	org.apache.spark.storage.StorageUtils.dispose(StorageUtils.scala)
	org.apache.spark.io.NioBufferedFileInputStream.close(NioBufferedFileInputStream.java:130)
	java.base/java.io.FilterInputStream.close(FilterInputStream.java:180)
	org.spark_project.guava.io.Closeables.close(Closeables.java:77)
	org.apache.spark.sql.execution.python.DiskRowQueue.close(RowQueue.scala:152)
	org.apache.spark.sql.execution.python.HybridRowQueue.remove(RowQueue.scala:258)
	org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$7.apply(BatchEvalPythonExec.scala:166)
	org.apache.spark.sql.execution.python.BatchEvalPythonExec$$anonfun$doExecute$1$$anonfun$apply$7.apply(BatchEvalPythonExec.scala:158)
	scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.sort_addToSorter1$(Unknown Source)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.sort_addToSorter$(Unknown Source)
	org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1799)
	org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1158)
	org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2062)
	org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	org.apache.spark.scheduler.Task.run(Task.scala:108)
	org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
	java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
	java.base/java.lang.Thread.run(Thread.java:844)
	at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:138)
	at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:116)
	at org.apache.spark.scheduler.Task.run(Task.scala:118)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:641)
	... 1 more


## 4.2 performance

In [75]:
from pyspark.mllib.evaluation import RegressionMetrics
cvModel=model
trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction").rdd
validPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction").rdd
trainRegressionMetrics = RegressionMetrics(trainPredictionsAndLabels)
validRegressionMetrics = RegressionMetrics(validPredictionsAndLabels)

bestModel = cvModel.bestModel
featureImportances = bestModel.featureImportances.toArray()
print featureImportances

output = str("\n=====================================================================\n" +
      "TrainingData count: {0}\n".format(trainingData.count()) +
      "TestData count: {0}\n".format(testData.count()) +
      "=====================================================================\n" +
      "Training data MSE = {}\n".format(trainRegressionMetrics.meanSquaredError) +
      "Training data RMSE = {}\n".format(trainRegressionMetrics.rootMeanSquaredError) +
      "Training data R-squared = {}\n".format(trainRegressionMetrics.r2) +
      "Training data MAE = {}\n".format(trainRegressionMetrics.meanAbsoluteError) +
      "Training data Explained variance = {}\n".format(trainRegressionMetrics.explainedVariance) +
      "=====================================================================\n" +
      "Validation data MSE = {0}\n".format(validRegressionMetrics.meanSquaredError) +
      "Validation data RMSE = {0}\n".format(validRegressionMetrics.rootMeanSquaredError) +
      "Validation data R-squared = {0}\n".format(validRegressionMetrics.r2) +
      "Validation data MAE = {0}\n".format(validRegressionMetrics.meanAbsoluteError) +
      "Validation data Explained variance = {0}\n".format(validRegressionMetrics.explainedVariance) +
      "=====================================================================\n" +
      "CV params explained: {}\n".format(cvModel.explainParams()) +
      "RandomForest params explained: {}\n".format(bestModel.explainParams()) +
      #"RandomForest features importances:\n {0}\n".format("\n".join(map(lambda z: "{0} = {1}".format(str(z[0]),str(z[1])), zip(featureCols, featureImportances)))) +
"=====================================================================\n")
print(output)



TrainingData count: 36502
TestData count: 9114
Training data MSE = 1432.04823345
Training data RMSE = 37.8424131557
Training data R-squared = -84.3787281914
Training data MAE = 18.7357809654
Training data Explained variance = 1496.97441388
Validation data MSE = 1224.12882869
Validation data RMSE = 34.9875524822
Validation data R-squared = -188.941401709
Validation data MAE = 18.4183763529
Validation data Explained variance = 1229.21362335
CV params explained: estimator: estimator to be cross-validated (current: RandomForestRegressor_4ae499c2617bddfa4284)
estimatorParamMaps: estimator param maps (current: [{Param(parent=u'RandomForestRegressor_4ae499c2617bddfa4284', name='numTrees', doc='Number of trees to train (>= 1).'): 100}])
evaluator: evaluator used to select hyper-parameters that maximize the validator metric (current: RegressionEvaluator_4e9699f7e2bc594d0c94)
seed: random seed. (default: -4372709618522015412)
RandomForest params explained: 

Traceback (most recent call last):
 

# 3. ML model (by MLlib)
## 3.1 generate RDD

In [13]:
# change into RDD
from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
from pyspark.ml import linalg as ml_linalg

def as_mllib(v):
    if isinstance(v, ml_linalg.SparseVector):
        return MLLibVectors.sparse(v.size, v.indices, v.values)
    elif isinstance(v, ml_linalg.DenseVector):
        return MLLibVectors.dense(v.toArray())
    else:
        raise TypeError("Unsupported type: {0}".format(type(v)))
        
airlineRDD=dataset_transformed.rdd.map(lambda row: LabeledPoint(row['label'],as_mllib(row['features'])))

## 3.2 split trainset and testset 

In [17]:
#  Spliting dataset into train and test dtasets
airlineRDD.cache()
use_data,left_data=airlineRDD.randomSplit([0.01,0.99])
trainingData,testData=use_data.randomSplit([0.7,0.3])

## 3.3 use Random Forest classifier

In [18]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32)


Test Error = 1.0


In [24]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 1359.99922727


In [26]:
from pyspark.mllib.evaluation import RegressionMetrics
#valuesAndPreds = testData.map(lambda p: (model.predict(p.features), p.label))
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Instantiate metrics object
metrics = RegressionMetrics(labelsAndPredictions)
    # Squared Error
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
print("R-squared = %s" % metrics.r2)

    # Mean absolute error
print("MAE = %s" % metrics.meanAbsoluteError)

    # Explained variance
print("Explained variance = %s" % metrics.explainedVariance)
    # exampleoff

MSE = 1359.99922727
RMSE = 36.8781673524
R-squared = -88.29262504
MAE = 18.4746674913
Explained variance = 1366.23195794


## 3.4 use GBDT

In [137]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
model = GradientBoostedTrees.trainClassifier(trainingData,
                                             categoricalFeaturesInfo={}, numIterations=3)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.619792551291
Learned classification GBT model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 187 <= 235.0)
     If (feature 0 <= 0.0)
      If (feature 2 <= 0.0)
       Predict: 0.20240178145379353
      Else (feature 2 > 0.0)
       Predict: 0.5941038239692373
     Else (feature 0 > 0.0)
      If (feature 187 <= 16.0)
       Predict: 1.0702360391479562
      Else (feature 187 > 16.0)
       Predict: 0.5228811820233942
    Else (feature 187 > 235.0)
     If (feature 187 <= 355.0)
      If (feature 2 <= 0.0)
       Predict: -0.04078576685870917
      Else (feature 2 > 0.0)
       Predict: 0.412833776038026
     Else (feature 187 > 355.0)
      If (feature 3 <= 0.0)
       Predict: 0.7047692439421184
      Else (feature 3 > 0.0)
       Predict: 1.110079575596817
  Tree 1:
    If (feature 187 <= 236.0)
     If (feature 1 <= 0.0)
      If (feature 0 <= 0.0)
       Predict: -0.7436935228217824
      Else (feature 0 > 0.0)
       Predict: -1.153059822728

## 3.5 logistic 

In [141]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

# Build the model
model = LogisticRegressionWithLBFGS.train(trainingData,numClasses=3)
print('finish training')
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Test Error = 0.38938509204


## 3.6 SVM

In [143]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel

# Build the model
model = SVMWithSGD.train(trainingData, iterations=100)
print('finish training')
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Name: org.apache.toree.interpreter.broker.BrokerException
Message: Py4JJavaError: An error occurred while calling o4436.trainSVMModelWithSGD.
: org.apache.spark.SparkException: Input validation failed.
	at org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.run(GeneralizedLinearAlgorithm.scala:256)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainRegressionModel(PythonMLLibAPI.scala:92)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainSVMModelWithSGD(PythonMLLibAPI.scala:248)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.Abstrac

## 3.3 model training

In [45]:
# train models
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
model = LinearRegressionWithSGD.train(trainRDD, iterations=100, step=0.0000001)



## 3.4 model evaluation

In [None]:
# Evaluate the model on training data
valuesAndPreds = testRDD.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1])**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

##  3.5 save model

In [None]:
# Save and load model
model.save(sc, "model/pythonLinearRegressionWithSGDModel")

sameModel = LinearRegressionModel.load(sc, "model/pythonLinearRegressionWithSGDModel")