## Lecture 16: Spark Tree-Based Classification

In [1]:

import findspark

# findspark.init must run before the pyspark import below so that the Spark
# installation at the given path is the one picked up.
# NOTE(review): '/opt/spark' is a machine-specific absolute path — adjust it
# (or make it configurable) when running on another host.
findspark.init('/opt/spark')
from pyspark.sql import SparkSession



In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Random_Forest_moynihanl')
    .getOrCreate()
)


In [3]:
from pyspark import SparkFiles

# Location of the sample LIBSVM dataset used throughout this notebook.
url = (
    'https://raw.githubusercontent.com/BlueJayADAL/DS420/master/'
    'datasets/mllib/sample_libsvm_data.txt'
)


In [4]:
# Register the remote file with the Spark context so it is fetched for this job
# and can later be located through SparkFiles.
spark.sparkContext.addFile(url)



In [5]:
# Resolve the local path of the file registered with addFile above.
# The name passed here must match the last path component of `url`.
fileloc = SparkFiles.get('sample_libsvm_data.txt')
fileloc  # display the resolved path as the cell output

'/tmp/spark-c9768ffe-1524-4b1c-a2bd-a27e5e4330fa/userFiles-3c51ec17-269b-493a-acf5-889d273bca2a/sample_libsvm_data.txt'

In [6]:
# Load the downloaded file from the local filesystem using the LIBSVM reader,
# which yields a DataFrame with `label` and `features` columns.
data = (
    spark.read
    .format('libsvm')
    .load(f'file://{fileloc}')
)

In [8]:
# Preview the first 20 rows; long cell values are truncated for display.
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# 70/30 train/test split, seeded so the split can be repeated.
trainingData, testingData = data.randomSplit(weights=[0.7, 0.3], seed=101)

In [11]:
# Confirm the training split still carries the `label` and `features` columns.
trainingData.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [16]:
# Random forest with 20 trees; input/output column names are given explicitly.
rf = RandomForestClassifier(
    numTrees=20,
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [17]:
# Train the random forest on the training split only.
model = rf.fit(trainingData)

In [18]:
# Apply the fitted model to the held-out test split; the result keeps the
# input columns and adds the model's output columns.
predictions = model.transform(testingData)

In [19]:
# Inspect which columns the model appended to the test data.
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [20]:
# Side-by-side look at predicted vs. true labels for the first five test rows.
predictions.select('prediction', 'label', 'features').show(n=5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows



In [22]:
# Accuracy is the fraction of test rows whose predicted label equals the true
# label. BinaryClassificationEvaluator reports area under the ROC curve by
# default and expects raw scores rather than hard 0/1 predictions, so it cannot
# back the "test accuracy" figure printed below. Use the multiclass evaluator
# with the 'accuracy' metric, which compares `prediction` against `label`.
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction',
                                              metricName='accuracy')

In [25]:
# Score the test-set predictions with the evaluator configured above, then
# report the score together with its complement.
acc = evaluator.evaluate(predictions)
for caption, value in (('test accuracy is ', acc), ('my test error is ', 1-acc)):
    print(caption, value)

test accuracy is  1.0
my test error is  0.0


In [26]:
# Per-feature importance scores of the fitted model, displayed as a sparse
# vector indexed by feature position.
model.featureImportances

SparseVector(692, {234: 0.0076, 239: 0.0044, 243: 0.0392, 244: 0.0368, 268: 0.0024, 287: 0.0026, 291: 0.0028, 300: 0.0422, 301: 0.0043, 317: 0.0388, 324: 0.0022, 344: 0.002, 346: 0.0028, 350: 0.042, 351: 0.0051, 355: 0.0024, 371: 0.0029, 374: 0.0029, 379: 0.0179, 406: 0.0977, 407: 0.0442, 411: 0.0049, 427: 0.0424, 433: 0.0568, 434: 0.0448, 435: 0.0052, 440: 0.0418, 442: 0.0023, 461: 0.012, 466: 0.0044, 490: 0.138, 496: 0.0123, 512: 0.042, 517: 0.0476, 524: 0.004, 549: 0.0079, 554: 0.0052, 568: 0.0779, 577: 0.0022, 604: 0.0285, 625: 0.0023, 628: 0.0107, 690: 0.0036})

# Gradient-Boosted Trees

In [27]:
# Reload the same LIBSVM file so this section starts from the raw data.
data = (
    spark.read
    .format('libsvm')
    .load(f'file://{fileloc}')
)

In [31]:
from pyspark.ml.classification import GBTClassifier


In [32]:
# Same seeded 70/30 split as in the random-forest section.
trainingData, testingData = data.randomSplit(weights=[0.7, 0.3], seed=101)

In [37]:
# Confirm the training split still carries the `label` and `features` columns.
trainingData.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [35]:
# Gradient-boosted tree classifier capped at 10 boosting iterations.
gbt = GBTClassifier(
    maxIter=10,
    featuresCol='features',
    labelCol='label',
)

In [38]:
# Train the gradient-boosted trees on the training split.
# Note: this rebinds `model`, replacing the random-forest model fitted earlier.
model = gbt.fit(trainingData)

In [39]:
# Score the test split with the freshly fitted GBT model. Without this step
# `predictions` would still hold the random-forest output from the previous
# section, and the evaluation below would silently measure the wrong model.
predictions = model.transform(testingData)
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [41]:
# Evaluator that computes the F1 score of `prediction` against `label`.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='label',
    predictionCol='prediction',
)

In [42]:
# Compute the F1 score of `predictions` with the evaluator configured above.
evaluator.evaluate(predictions)

1.0