In [26]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressionModel
from numpy import allclose
import pandas as pd
from pyspark.ml.linalg import Vectors
import pyspark

# Folder used by the (currently disabled) save/load examples at the end of this cell.
temp_path = "temp"

# Start (or reuse) a local PySpark session.
spark = (
    pyspark.sql.SparkSession.builder
    .master("local")
    .appName("RF model")
    # Placeholder option kept from the original cell -- TODO confirm it is needed, else remove.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Example input data. The RF model expects a two-column table:
#   label    -- the regression target
#   features -- one vector holding all feature values for one object/location
df = spark.createDataFrame(
    [
        (1.0, Vectors.dense(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)),
        (0.0, Vectors.dense(3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)),
        (0.5, Vectors.dense(1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)),
        (2.0, Vectors.dense(2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)),
    ],
    ["label", "features"],
)
df.show()

# Build the RF model.
# Notes on hyper-parameters for a real run (this demo uses small values: 3 trees, depth 5):
#   numTrees: 128 (https://link.springer.com/chapter/10.1007/978-3-642-31537-4_13) or
#             ~200 (https://link.springer.com/content/pdf/10.1007/978-0-387-84858-7_15.pdf, the RF book).
#   maxDepth: open question -- deeper trees give more leaves and potentially more accurate
#             predictions, but this is debatable because deep trees are resource expensive.
#   featureSubsetStrategy: 'onethird' is the usual choice for regression; it is set explicitly
#             here instead of relying on 'auto'.
rf = RandomForestRegressor(numTrees=3, maxDepth=5, featureSubsetStrategy='onethird')

print("Minimum population size of nodes after decision:")
print(rf.getMinInstancesPerNode())

# Fixed seed so that repeated runs train the same forest.
rf.setSeed(42)

# Model training.
model = rf.fit(df)
print("number of trees used:")
print(model.getNumTrees)


print("Is bagging performed:")
print(model.getBootstrap())

print("Decision strategy:")
print(model.getFeatureSubsetStrategy())

print("Model seed:")
print(model.getSeed())

# Ask transform() to also emit, per row, the leaf reached in every tree.
model.setLeafCol("leafId")
print("\n")
print("Feature importances (how much one feature influences the end prediction):")
print(model.featureImportances)
print("Output interpretation: Not sure if it's actually only deciding based on the first feature value or " + 
      "if this information is false")
print("\n")

# Every tree should carry weight 1.0. The original call discarded the result of allclose(),
# so nothing was verified; assert it, sized by the actual number of trees.
assert allclose(model.treeWeights, [1.0] * model.getNumTrees)

# Test on impossible values. The vector must have as many entries as the model has features;
# a shorter vector only works as long as no tree splits on one of the missing indices.
test0 = spark.createDataFrame([(Vectors.dense([-1.0] * model.numFeatures),)], ["features"])
impossible_features = test0.head().features
impossible_prediction = model.predict(impossible_features)
impossible_leaves = model.predictLeaf(impossible_features)
print("Impossible input prediction:")
print(impossible_prediction)
print("Leaf Id's obtained during prediction:")
print(impossible_leaves)
print("\n")

# Sanity checks. These replace bare expressions that were evaluated but never displayed
# (only the last expression of a cell is echoed); use print(...) instead to inspect the values.
result = model.transform(test0).head()
assert allclose(result.prediction, impossible_prediction)  # transform() agrees with predict()
assert result.leafId == impossible_leaves                  # ... and with predictLeaf()
assert model.numFeatures == len(df.head().features)        # model saw the full feature vector
assert len(model.trees) == model.getNumTrees               # one tree object per trained tree

# Prediction for a plausible input.
test1 = spark.createDataFrame([(Vectors.dense(1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0),)], ["features"])
print(test1.toPandas())
print("model prediction:")
print(model.transform(test1).head().prediction)

# save a RF instance:
#rfr_path = temp_path + "/rfr"
#rf.save(rfr_path)
# load a RF instance:
#rf2 = RandomForestRegressor.load(rfr_path)
#rf2.getNumTrees()

# save the model:
#model_path = temp_path + "/rfr_model"
#model.save(model_path)
# load the model:
#model2 = RandomForestRegressionModel.load(model_path)
#model.featureImportances == model2.featureImportances

# Release the Spark session once the demo is done.
spark.stop()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[2.0,1.0,1.0,1.0,...|
|  0.0|[3.0,2.0,1.0,1.0,...|
|  0.5|[1.0,2.0,3.0,1.0,...|
|  2.0|[2.0,2.0,2.0,1.0,...|
+-----+--------------------+

Minimum population size of nodes after decision:
1
number of trees used:
3
Is bagging performed:
True
Decision strategy:
onethird
Model seed:
42


Feature importances (how much one feature influences the end prediction):
(10,[0],[1.0])
Output interpretation: Not sure if it's actually only deciding based on the first feature value or if this information is false


Impossible input prediction:
0.8333333333333334
Leaf Id's obtained during prediction:
[0.0,0.0,0.0]


                                            features
0  [1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
model prediction:
0.8333333333333334


In [66]:
# Sparse vector of size 4 with non-zero entries 3.0 at index 1 and 4.0 at index 3
# (dict form of the constructor: {index: value}).
a = Vectors.sparse(4, {1: 3.0, 3: 4.0})
# Dense vector holding the four values 1, 2, 3, 4.
b = Vectors.dense(1, 2, 3, 4)
# Evaluated but not displayed: only the last expression of a cell is echoed.
a + b
# The index list mixes an int and a float (0 and 1.0); this expression is the cell's output.
Vectors.sparse(3, [0, 1.0], [1, 1.0])

SparseVector(3, {0: 1.0, 1: 1.0})