# Random Forest Regression

In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Local-mode Spark configuration (same master and app name as before).
spark_conf = SparkConf().setMaster("local").setAppName("sqlContext")

# getOrCreate() returns the already-running SparkContext when this cell is executed
# again, instead of failing because a second context cannot be created in the
# same process.
spark = SparkContext.getOrCreate(spark_conf)

# SQLContext wraps the SparkContext and is the DataFrame entry point used below.
sql = SQLContext(spark)

#### Load data

In [6]:
# Locate Spark's bundled MLlib sample data.
# Default: the original local installation directory. If SPARK_HOME is set and that
# installation actually contains the sample file, use it instead so the notebook
# is not tied to one machine's directory layout.
DATA_FILE = "sample_libsvm_data.txt"
path = 'D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7/data/mllib/'
spark_home = os.environ.get("SPARK_HOME")
if spark_home:
    # Normalise Windows separators so the result has the same form as the default.
    candidate = spark_home.replace("\\", "/").rstrip("/") + "/data/mllib/"
    if os.path.isfile(candidate + DATA_FILE):
        path = candidate

# Read the LIBSVM-formatted file into a DataFrame; the rendered output below shows
# its `label` and `features` columns.
df = sql.read.format("libsvm").load(path + DATA_FILE)
df.toPandas()

Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
95,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
96,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
97,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
98,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


#### Data preparation

In [8]:
# Automatically identify categorical features, and index them. Set maxCategories so
# features with > 4 distinct values are treated as continuous.
# Note: the indexer is fitted on the full DataFrame `df`, before the split below.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split repeatable, so the reported test error does not
# change from run to run merely because different rows were held out.
(df_train, df_test) = df.randomSplit([0.7, 0.3], seed=42)

#### Build the model and train

In [9]:
# Configure a RandomForest regressor that reads the indexed feature vector.
# The explicit seed pins the forest's internal randomness so that repeated runs
# on the same training data produce the same model.
rf = RandomForestRegressor(featuresCol="indexedFeatures", seed=42)

# Chain indexer and forest in a Pipeline.
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model. This also runs the indexer.
rf_fit = pipeline.fit(df_train)

In [13]:
# Make predictions on the held-out test set.
pred = rf_fit.transform(df_test)

# Select example rows to display.
pred.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error.
# The evaluator is named `evaluator` rather than `eval` so that Python's builtin
# eval() is not shadowed for the rest of the kernel session.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(pred)
print("Root Mean Squared Error (RMSE) on test data = ",  rmse)

# Stage 0 of the fitted pipeline is the feature indexer; stage 1 is the fitted forest.
rfModel = rf_fit.stages[1]
print(rfModel)  # summary only

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|      0.45|  0.0|(692,[100,101,102...|
|       0.2|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
+----------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data =  0.13153946776138092
RandomForestRegressionModel: uid=RandomForestRegressor_ca1da6e7db03, numTrees=20, numFeatures=692


In [14]:
# Shut down the SparkContext created at the top of the notebook now that the analysis is done.
spark.stop()

## Credits & Links

- [Apache Spark ML guide (2.2.0): Random forest regression](https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#random-forest-regression)