In [1]:
from pyspark import SparkContext
# initialize a new Spark Context to use for the execution of the script
sc = SparkContext(appName="MY-APP-NAME", master="local[*]")


In [2]:
import numpy as np
from pyspark.mllib.linalg import Vectors


In [3]:
# Use a NumPy array as a dense vector 
dv1 = np.array([1.0, 0.0, 3.0])
# Use a Python list as a dense vector 
dv2 = [1.0, 0.0, 3.0]
# Create a DenseVector
dv3 = Vectors.dense([1.0, 0.0, 3.0])
# Create a SparseVector
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])

In [4]:
from pyspark.mllib.linalg import SparseVector 
from pyspark.mllib.regression import LabeledPoint


In [7]:
pos = LabeledPoint(-1.0, [1.0, 0.0, 3.0])
print (pos)

(-1.0,[1.0,0.0,3.0])


In [8]:
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
print (neg)

(0.0,(3,[0,2],[1.0,3.0]))


In [10]:
from pyspark.mllib.regression import LabeledPoint 
data = sc.textFile('sample_data.txt')
#print(data)

# Sample data contains lines of floats, separated by space
numbers = data.map(lambda line: [float(x) for x in line.split()]) 
#print(numbers.take(2))

# We assume the first float if the label
# Remaining floats are features
labeled = numbers.map(lambda v: LabeledPoint(v[0], v[1:])) 

#print(labeled.take(2))

labels = labeled.map(lambda x: x.label)
features = labeled.map(lambda x: x.features)
print(labels.take(10)) 
print(features.take(1))

[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0]
[DenseVector([0.0, 2.5208, 0.0, 0.0, 0.0, 2.0047, 2.0003, 0.0, 2.2284, 2.2284, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])]


In [8]:
from pyspark.mllib.util import MLUtils
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt") 

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)
print(labels.take(10)) 
print(features.take(1))

[0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0]
[SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347:

In [13]:
import numpy as np
from pyspark.mllib.stat import Statistics

mat = sc.parallelize([
np.array([1.0, 10.0, 100.0]),
np.array([2.0, 0.0, 200.0]),
np.array([3.0, 30.0, 300.0])
]) 

# an RDD of Vectors
# Compute column summary statistics.
summary = Statistics.colStats(mat)

# A dense vector containing the mean value for each column 
print(summary.mean())

mat.map(lambda x: (x-summary.min)/(summary.max-summary.min))

# column-wise variance
print(summary.variance())
# number of nonzeros in each column 
print(summary.numNonzeros())

[  1.   0. 100.]
[1.00000000e+00 2.33333333e+02 1.00000000e+04]
[3. 2. 3.]


In [10]:
from pyspark.mllib.stat import Statistics
# a series
seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])
# seriesY must have the same number of cardinality as seriesX 
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

# Compute the correlation using Pearson's method.
# Enter "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default. 
print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

data = sc.parallelize( [
np.array([1.0, 90.0, 100.0]),
np.array([2.0, 2.0, 200.0]),
np.array([3.0, 30.0, 366.0])
] ) # an RDD of Vectors

# Compute the correlation matrix using Pearson's method.
# Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default. 
print(Statistics.corr(data, method="pearson"))


Correlation is: 0.8500286768773001
[[ 1.         -0.66727105  0.98989465]
 [-0.66727105  1.         -0.55490978]
 [ 0.98989465 -0.55490978  1.        ]]


In [22]:
from pyspark.mllib.feature import HashingTF, IDF
documents = sc.textFile('tragedies.txt').map(lambda line: line.split(' '))
hashingTF = HashingTF()
tf = hashingTF.transform(documents)
print(tf.sample(withReplacement = False, fraction = 0.001).collect())


[SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {20335: 1.0, 84860: 1.0, 110703: 1.0, 221871: 1.0, 924518: 1.0, 983645: 1.0}), SparseVector(1048576, {70882: 1.0, 110703: 1.0, 154253: 1.0, 168767: 1.0, 319591: 1.0, 374216: 1.0, 580265: 1.0, 605366: 1.0, 711033: 1.0, 803757: 1.0}), SparseVector(1048576, {6822: 1.0, 45274: 1.0, 324280: 1.0, 431618: 1.0, 960719: 1.0, 1001303: 1.0, 1045512: 1.0}), SparseVector(1048576, {548991: 1.0, 822082: 1.0, 1022968: 1.0}), SparseVector(1048576, {158236: 1.0, 357784: 1.0, 522386: 1.0, 680105: 1.0, 790201: 1.0, 880771: 1.0, 1046240: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {170731: 1.0, 250402: 1.0, 348943: 1.0, 682167: 1.0, 714823: 1.0}), SparseVector(1048576, {102483: 1.0, 247145: 1.0, 268040: 1.0, 308288: 1.0, 438276: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {0: 1.0}), SparseVector(1048576, {154253: 1.0, 

In [12]:
from pyspark.mllib.feature import StandardScaler 
from pyspark.mllib.linalg import Vectors

vectors = [Vectors.dense([-2.0, 115.0, 1.0]), Vectors.dense([ 2.0, 20.0, 1.0])]
vectors2 = [Vectors.dense([-1.0, 114.0, 1.0]), Vectors.dense([ 2.0, 12.0, 1.0])]

training = sc.parallelize(vectors)
test = sc.parallelize(vectors2)

scaler = StandardScaler(withMean=True, withStd=True) 
transformation = scaler.fit(training)
result = transformation.transform(training)

# ... model creation: i.e classifier on training

result2 = transformation.transform(test)

# ... model application: i.e classifier on test


print(result.collect())
print(result2.collect())
# Result: {[-0.7071, 0.7071, 0.0], [0.7071, -0.7071, 0.0]}

[DenseVector([-0.7071, 0.7071, 0.0]), DenseVector([0.7071, -0.7071, 0.0])]
[DenseVector([-0.3536, 0.6922, 0.0]), DenseVector([0.7071, -0.8262, 0.0])]


In [13]:
from pyspark.mllib.feature import Normalizer 
from pyspark.mllib.util import MLUtils
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt") 

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

normalizer1 = Normalizer()
normalizer2 = Normalizer(p=float("inf"))

# Each sample in data1 will be normalized using $L^2$ norm. 
data1 = labels.zip(normalizer1.transform(features))
print(data1.collect())
# Each sample in data2 will be normalized using $L^\infty$ norm. 
data2 = labels.zip(normalizer2.transform(features))

[(0.0, SparseVector(692, {127: 0.0196, 128: 0.0612, 129: 0.0974, 130: 0.0612, 131: 0.0192, 154: 0.0185, 155: 0.0916, 156: 0.097, 157: 0.097, 158: 0.097, 159: 0.0912, 181: 0.0208, 182: 0.0874, 183: 0.0974, 184: 0.097, 185: 0.092, 186: 0.0897, 187: 0.097, 188: 0.0219, 189: 0.0023, 207: 0.0038, 208: 0.0231, 209: 0.0862, 210: 0.097, 211: 0.0974, 212: 0.097, 213: 0.0777, 214: 0.0323, 215: 0.097, 216: 0.0974, 217: 0.047, 235: 0.0627, 236: 0.097, 237: 0.097, 238: 0.097, 239: 0.0974, 240: 0.097, 241: 0.097, 242: 0.0369, 243: 0.0727, 244: 0.0974, 245: 0.0643, 262: 0.0196, 263: 0.0916, 264: 0.0974, 265: 0.0974, 266: 0.0731, 267: 0.0439, 268: 0.0974, 269: 0.0878, 270: 0.0181, 271: 0.0304, 272: 0.0981, 273: 0.0647, 289: 0.0185, 290: 0.0916, 291: 0.097, 292: 0.097, 293: 0.0689, 294: 0.0046, 295: 0.0289, 296: 0.0466, 297: 0.0081, 300: 0.0974, 301: 0.0935, 302: 0.0192, 316: 0.0146, 317: 0.0635, 318: 0.0974, 319: 0.0897, 320: 0.0801, 321: 0.0323, 328: 0.0974, 329: 0.097, 330: 0.0635, 343: 0.0027, 344:

In [14]:
from pyspark.mllib.util import MLUtils
data = MLUtils.loadLibSVMFile(sc, "sample_libsvm_data.txt") 

training, test = data.randomSplit([0.8, 0.2], seed=11)

In [18]:
from pyspark.mllib.clustering import KMeans 
from numpy import array

# 12 records with height, weight data
data = array([185,72, 170,56, 168,60, 179,68, 182,72,
188,77, 180,71, 180,70, 183,84, 180,88, 180,67, 177,76]).reshape(12,2)

# Generate Kmeans
model = KMeans.train(sc.parallelize(data), 5, initializationMode="random")

test = array([185,72, 170,56, 168,60, 179,68, 182,72,
188,77, 180,71, 180,70, 183,84, 180,88, 180,67, 177,76]).reshape(12,2)
testRDD = sc.parallelize(test)

result = testRDD.map(lambda x: (x,model.predict(x)))
print(result.collect())
#print(result.zip(testRDD).collect())



[(array([185,  72]), 1), (array([170,  56]), 0), (array([168,  60]), 0), (array([179,  68]), 2), (array([182,  72]), 2), (array([188,  77]), 4), (array([180,  71]), 2), (array([180,  70]), 2), (array([183,  84]), 4), (array([180,  88]), 3), (array([180,  67]), 1), (array([177,  76]), 2)]


In [5]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LinearRegressionWithSGD

data = MLUtils.loadLibSVMFile(sc, "regression_data.txt") 

model = LinearRegressionWithSGD.train(data, iterations=10)

valuesAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))

print(valuesAndPreds.collect())

SE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2)
MSE = SE.reduce(lambda x, y: x + y) / valuesAndPreds.count()

print("Mean Squared Error = " + str(MSE)) 
print("Coefficients = " + str(model.weights)) 
print("Intercept = " + str(model.intercept))

from pyspark.mllib.regression import LinearRegressionModel 
model.save(sc, "lrmodel")

[(-9.490009878824548, 1.14700193828909), (0.2577820163584905, -0.5402104097029288), (-4.438869807456516, -0.001945404945051299), (-19.782762789614537, 0.41487461503468337), (-7.966593841555266, 1.7478682835614248), (-7.896274316726144, -1.741574255032109), (-8.464803554195287, 1.6706685083179424), (2.1214592666251364, -0.12658874112560337), (1.0720117616524107, -2.547698483916556), (-13.772441561702871, 2.810090843973237), (-5.082010756207233, -0.5282261377224253), (7.887786536531237, 1.108871981430088), (14.323146365332388, 2.2704230879930387), (-20.057482615789212, 0.35658848120652287), (-0.8995693247765151, 1.3207253854706869), (-19.16829262296376, -2.088422525780799), (5.601801561245534, -2.2078749576199828), (-3.2256352187273354, -1.6072118322988396), (1.5299675726687754, 1.6548712228957942), (-0.250102447941961, 0.5704742905307469), (12.792267926563595, -0.4560306010691579), (6.082192787194888, -0.3094069367708253), (-7.481405271455238, -0.8943375499569934), (6.739533816100517, 0

Py4JJavaError: An error occurred while calling o103.save.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/hpsa00/lrmodel/metadata already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:289)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1544)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1523)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1523)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1523)
	at org.apache.spark.mllib.regression.impl.GLMRegressionModel$SaveLoadV1_0$.save(GLMRegressionModel.scala:56)
	at org.apache.spark.mllib.regression.LinearRegressionModel.save(LinearRegression.scala:52)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [6]:
sameModel = LinearRegressionModel.load(sc, "lrmodel")
print("Coefficients = " + str(sameModel.weights)) 
print("Intercept = " + str(sameModel.intercept))


Coefficients = [0.05309042984022284,0.622717875539783,-0.6457288619775139,1.957778700366432,0.39071567094199594,0.9564530964899689,-0.2772380549078009,-0.41795253664067283,-0.5266131124623027,0.56734829561925]
Intercept = 0.0


In [7]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel 
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')] 
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("sample_data.txt")
parsedData = data.map(parsePoint)
               
# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
print(labelsAndPreds.collect())

trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count()) 
print("Training Error = " + str(trainErr))

# Save and load model
# model.save(sc, "logregmodel2")
# sameModel = LogisticRegressionModel.load(sc, "logregmodel2")

[(1.0, 1), (0.0, 1), (0.0, 0), (1.0, 1), (1.0, 0), (0.0, 1), (1.0, 1), (1.0, 1), (0.0, 0), (0.0, 0), (1.0, 1), (1.0, 0), (1.0, 0), (1.0, 0), (0.0, 0), (0.0, 1), (0.0, 0), (0.0, 1), (0.0, 0), (1.0, 1), (0.0, 1), (0.0, 0), (0.0, 0), (1.0, 0), (1.0, 1), (1.0, 0), (0.0, 1), (1.0, 1), (0.0, 1), (1.0, 1), (1.0, 1), (0.0, 1), (1.0, 0), (0.0, 0), (1.0, 1), (0.0, 1), (1.0, 1), (1.0, 0), (1.0, 1), (0.0, 1), (1.0, 1), (0.0, 0), (0.0, 1), (1.0, 1), (1.0, 1), (0.0, 1), (1.0, 1), (0.0, 0), (1.0, 1), (1.0, 1), (0.0, 0), (0.0, 0), (0.0, 0), (1.0, 0), (1.0, 1), (0.0, 1), (0.0, 1), (0.0, 0), (1.0, 0), (1.0, 1), (0.0, 0), (0.0, 1), (1.0, 1), (1.0, 1), (1.0, 1), (0.0, 0), (0.0, 1), (0.0, 1), (1.0, 0), (1.0, 1), (0.0, 0), (0.0, 0), (0.0, 0), (0.0, 1), (1.0, 1), (1.0, 1), (1.0, 1), (1.0, 1), (1.0, 1), (0.0, 0), (1.0, 1), (0.0, 1), (1.0, 0), (1.0, 0), (1.0, 1), (0.0, 0), (1.0, 1), (1.0, 1), (0.0, 0), (1.0, 1), (1.0, 1), (0.0, 1), (0.0, 0), (1.0, 0), (0.0, 1), (1.0, 0), (0.0, 0), (1.0, 1), (1.0, 1), (1.0, 1),

In [8]:
# CLASSIFICATION
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel 
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint. 
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# Split the data into training and test sets (30% held out for testing) 
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=11)

# Train a DecisionTree model.
model = DecisionTree.trainClassifier(trainingData, 
                                     numClasses=2, 
                                     categoricalFeaturesInfo={},
                                     impurity='gini', 
                                     maxDepth=5,
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count()) 

print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.09090909090909091
Learned classification tree model:
DecisionTreeModel classifier of depth 1 with 3 nodes
  If (feature 406 <= 161.0)
   Predict: 0.0
  Else (feature 406 > 161.0)
   Predict: 1.0



In [9]:
from pyspark.mllib.tree import RandomForest, RandomForestModel 
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint. 
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=11)

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(trainingData, 
                                     numClasses=2, 
                                     categoricalFeaturesInfo={},
                                     numTrees=1, 
                                     featureSubsetStrategy="auto", 
                                     impurity='gini', 
                                     maxDepth=4, 
                                     maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count()) 
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

Test Error = 0.09090909090909091
Learned classification forest model:
TreeEnsembleModel classifier with 1 trees

  Tree 0:
    If (feature 406 <= 161.0)
     Predict: 0.0
    Else (feature 406 > 161.0)
     Predict: 1.0

