# Decision Tree

In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [3]:
# Application name passed to the SparkSession builder in the next cell.
spark_application_name = 'Spark_Application_Name'

In [4]:
# Get the already-running SparkSession if there is one, otherwise create a new
# session under the application name chosen above.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/13 11:11:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the stock data from parquet and split it 80/20 into training and test sets.
# The seed is fixed so the split can be reproduced.
filePath = "sf-stocks-clean.parquet"
stocksDF = spark.read.parquet(filePath)
trainDF, testDF = stocksDF.randomSplit(weights=[0.8, 0.2], seed=42)

                                                                                

## String Indexer

In [6]:
from pyspark.ml.feature import StringIndexer

# Every string-typed column of the training data is treated as categorical.
categoricalCols = [name for name, dtype in trainDF.dtypes if dtype == "string"]

# Each categorical column gets an indexed counterpart named "<column>Index".
indexOutputCols = [f"{name}Index" for name in categoricalCols]

# handleInvalid="skip" is set explicitly rather than relying on the indexer's default.
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

## VectorAssembler

In [15]:
from pyspark.ml.feature import VectorAssembler

# Keep only the double-typed columns, leaving out "Low" because it is the label.
numericCols = [
    name
    for name, dtype in trainDF.dtypes
    if dtype == "double" and name != "Low"
]

# The feature vector is the indexed categorical columns (StringIndexer outputs)
# followed by the numeric columns.
assemblerInputs = [*indexOutputCols, *numericCols]
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

## Decision Tree

In [16]:
from pyspark.ml.regression import DecisionTreeRegressor

# Regression tree that predicts the "Low" column from the assembled "features"
# vector; every other hyperparameter keeps its default.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="Low")

In [17]:
from pyspark.ml import Pipeline

# Chain the stages in execution order: index the string columns, assemble the
# feature vector, then train the decision tree on it.
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)

# The pipeline is fitted once, in the "Pipeline" section below. Fitting it here
# as well trained the identical model twice.

In [10]:
# MaxBins
# We did not set maxBins. It must be at least as large as the number of distinct values
# (categories) of every categorical feature - not the number of categorical features.
# Our DataFrame has only one categorical feature (Date), and the pipeline fits with the
# default of 32, so the default is sufficient.

## Pipeline

In [18]:
# Fit every stage on the training split; the fitted stages are kept on the
# returned model, with the decision tree last.
pipelineModel = pipeline.fit(dataset=trainDF)

## Visualize the Decision Tree

In [19]:
# The decision tree is the last stage of the fitted pipeline.
dtModel = pipelineModel.stages[-1]

# Print the tree's debug string (its if/else rules) as plain text.
treeRules = dtModel.toDebugString
print(treeRules)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_5fa9f3e08449, depth=5, numNodes=63, numFeatures=6
  If (feature 0 in {0.0,1.0,3.0,4.0,6.0})
   If (feature 5 <= 148.97000122070312)
    If (feature 3 <= 83.76499938964844)
     If (feature 2 <= 59.36199951171875)
      If (feature 2 <= 48.48124885559082)
       Predict: 41.82414340573874
      Else (feature 2 > 48.48124885559082)
       Predict: 54.02548275173271
     Else (feature 2 > 59.36199951171875)
      If (feature 2 <= 69.94199752807617)
       Predict: 65.86217955905859
      Else (feature 2 > 69.94199752807617)
       Predict: 75.84026235904334
    Else (feature 3 > 83.76499938964844)
     If (feature 5 <= 117.44313049316406)
      If (feature 5 <= 104.53890991210938)
       Predict: 96.86899994757482
      Else (feature 5 > 104.53890991210938)
       Predict: 113.88738145828248
     Else (feature 5 > 117.44313049316406)
      If (feature 2 <= 137.08499908447266)
       Predict: 130.8158513322661
      Else (feature 2 > 1

## Feature Importance

In [20]:
# Importance scores of the fitted tree, indexed by position in the feature vector.
fittedStages = pipelineModel.stages
dtModel = fittedStages[-1]
dtModel.featureImportances

SparseVector(6, {0: 0.7927, 1: 0.0421, 2: 0.1374, 3: 0.0201, 4: 0.0007, 5: 0.007})

## Interpreting Feature Importance

In [21]:
# Pair every assembler input column with its importance score and rank them.
# pandas is already imported as `pd` in the imports cell at the top of the notebook.
dtModel = pipelineModel.stages[-1]

# Convert the sparse importance vector to a dense array with one score per feature.
importanceScores = dtModel.featureImportances.toArray()

# Building the frame column-wise makes pandas raise if the number of column names
# and the number of scores ever differ, instead of silently truncating as zip() does.
featureImp = pd.DataFrame(
    {
        "feature": vecAssembler.getInputCols(),
        "importance": importanceScores,
    }
)
featureImp.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
0,company_nameIndex,0.792699
2,Open,0.137401
1,Low,0.042075
3,Close,0.020082
5,AdjClose,0.007049
4,Volume,0.000694


In [22]:
## Apply model to test set

In [24]:
# Run the held-out rows through the fitted pipeline.
predDF = pipelineModel.transform(testDF)

# Show the feature vector, the actual "Low" value and the prediction side by side,
# with the highest actual values first.
(
    predDF
    .select("features", "Low", "prediction")
    .orderBy("Low", ascending=False)
    .show()
)

+--------------------+-----------------+------------------+
|            features|             High|        prediction|
+--------------------+-----------------+------------------+
|[5.0,3378.0,3450....|           3453.0|2870.1531218317627|
|[5.0,3288.8798828...|3366.800048828125|2870.1531218317627|
|[5.0,3068.3898925...|  3344.2900390625|2870.1531218317627|
|[5.0,3153.3000488...| 3282.97998046875|2870.1531218317627|
|[5.0,3192.0100097...|           3266.0|2870.1531218317627|
|[5.0,3157.1799316...|3248.949951171875|2870.1531218317627|
|[5.0,3135.6999511...|           3215.0|2870.1531218317627|
|[5.0,3140.8500976...|3202.530029296875|2870.1531218317627|
|[5.0,3135.2600097...|          3189.25|2870.1531218317627|
|[5.0,3108.9199218...| 3175.02001953125|2870.1531218317627|
|[5.0,3101.4199218...|3174.389892578125|2870.1531218317627|
|[5.0,3101.2099609...|3167.239990234375|2870.1531218317627|
|[5.0,3065.4599609...|          3139.75|2870.1531218317627|
|[5.0,2950.0,3089....|  3127.3798828125|

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator serves both metrics: it starts out computing RMSE.
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction", labelCol="Low", metricName="rmse"
)
rmse = regressionEvaluator.evaluate(predDF)

# setMetricName reconfigures the same evaluator, so it reports R2 from here on.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

RMSE is 90.76386060381067
R2 is 0.9842393424872098


In [27]:
# Way better than LogScale