In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [3]:
# Application name handed to SparkSession.builder.appName() when the session is created.
spark_application_name = "Spark_Application_Name"

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/13 11:45:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/13 11:45:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/06/13 11:45:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Read the stock data from Parquet.
filePath = "sf-stocks-clean.parquet"
stocksDF = spark.read.parquet(filePath)

# 80/20 train/test split with a fixed seed.
trainDF, testDF = stocksDF.randomSplit([0.8, 0.2], seed=42)

                                                                                

In [6]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# String-typed columns are treated as categorical; each one gets an
# "<name>Index" output column from the StringIndexer.
categoricalCols = [name for name, sparkType in trainDF.dtypes if sparkType == "string"]
indexOutputCols = [f"{name}Index" for name in categoricalCols]

stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

# Double-typed columns are the numeric features; "Low" is excluded because it
# is the label column used by the regressor and evaluators.
numericCols = [
    name
    for name, sparkType in trainDF.dtypes
    if sparkType == "double" and name != "Low"
]

# Indexed categorical columns followed by numeric columns form the "features" vector.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

## Random Forest

In [7]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Random forest regressor with "Low" as the label column and a fixed seed.
rf = RandomForestRegressor(
    labelCol="Low",
    seed=42,
)

# Pipeline stages, in order: index string columns, assemble features, fit the forest.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

## Grid Search

In [8]:
from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid: 3 values of maxDepth x 2 values of numTrees = 6 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.numTrees, [10, 100])
    .build()
)

## Cross Validation

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

# RMSE of the "prediction" column against the "Low" label.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Low",
    predictionCol="prediction",
)

# 3-fold cross-validation of the whole pipeline over every entry in paramGrid.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [14]:
# Set the parallelism param on the cross-validator to 4, then fit it on the training split.
cv.setParallelism(4)
cvModel = cv.fit(trainDF)

22/06/13 11:52:48 WARN BlockManager: Block rdd_4426_0 already exists on this machine; not re-adding it


In [15]:
# Alternative layout: cross-validate only the random forest and use the
# CrossValidator as the final pipeline stage, after the indexer and assembler.
#
# The objects get their own names instead of rebinding `cv` and `pipeline`.
# Rebinding meant that re-running the earlier `cv.fit(trainDF)` cell would fit
# a CrossValidator whose estimator is the bare `rf`, with no indexer/assembler
# stages in front of it.
rfCv = CrossValidator(
    estimator=rf,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    parallelism=4,
    seed=42,
)

cvPipeline = Pipeline(stages=[stringIndexer, vecAssembler, rfCv])

pipelineModel = cvPipeline.fit(trainDF)

In [16]:
# Pair each parameter map from the grid with its entry in cvModel.avgMetrics.
[
    (paramMap, avgMetric)
    for paramMap, avgMetric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]

[({Param(parent='RandomForestRegressor_d1d74429f629', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_d1d74429f629', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  173.65945299918488),
 ({Param(parent='RandomForestRegressor_d1d74429f629', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 2,
   Param(parent='RandomForestRegressor_d1d74429f629', name='numTrees', doc='Number of trees to train (>= 1).'): 100},
  164.4144031848288),
 ({Param(parent='RandomForestRegressor_d1d74429f629', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4,
   Param(parent='RandomForestRegressor_d1d74429f629', nam

In [18]:
# Apply the fitted pipeline to the held-out test split.
predDF = pipelineModel.transform(testDF)

regressionEvaluator = RegressionEvaluator(
    labelCol="Low",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = regressionEvaluator.evaluate(predDF)

# Switch the same evaluator to R2 for the second evaluation.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

RMSE is 89.15791516871782
R2 is 0.9847921359842783


In [19]:
# A little bit better than the Decision Tree.