# Decision Tree

In [18]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [20]:
# Application name handed to SparkSession.builder.appName() when the session is created.
spark_application_name = "Spark_Application_Name"

In [21]:
# Build the Spark session for this notebook under the configured application name.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

In [22]:
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

# Load the prepared stock data from parquet.
filePath = "stocks-final.parquet"
stocksDF = spark.read.parquet(filePath)

# Chronological split: rank every row by Date (percent_rank over one global,
# unpartitioned window), then send the earliest 80% to training and the rest
# to testing. The helper "rank" column is dropped from both splits.
dateOrderedWindow = Window.partitionBy().orderBy("Date")
stocksDF = stocksDF.withColumn("rank", percent_rank().over(dateOrderedWindow))
trainDF = stocksDF.filter("rank <= .8").drop("rank")
testDF = stocksDF.filter("rank > .8").drop("rank")

## String Indexer

In [23]:
# StringIndexer is not used: only one DataFrame is read, so there is only one categorical column (company_name) and no need to index it.

## VectorAssembler

In [24]:
from pyspark.ml.feature import VectorAssembler

# Feature columns: every double-typed column except the label ("Next").
# Plain booleans are combined with the logical `and`, not the bitwise `&`.
numericCols = [
    field
    for (field, dataType) in trainDF.dtypes
    if dataType == "double" and field != "Next"
]

# Pack the selected columns into a single vector column named "features".
vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

## Decision Tree

In [25]:
from pyspark.ml.regression import DecisionTreeRegressor

# Regression tree whose label is the "Next" column. No other parameter is set,
# so the estimator relies on its defaults (including the default features column,
# which is expected to match the assembler's "features" output - confirm if changed).
decisionTree = DecisionTreeRegressor(labelCol="Next")

In [26]:
from pyspark.ml import Pipeline

# Combine stages into pipeline: assemble the feature vector, then train the tree.
# The pipeline is only defined here; it is fitted once, in the "Pipeline" section
# further down, instead of being trained twice on the same data.
stages = [vecAssembler, decisionTree]
pipeline = Pipeline(stages=stages)

In [27]:
# MaxBins
# maxBins was left at its default (32). It must be at least the number of categories in any
# categorical feature; our df has only 1 categorical column (Date), and it is not part of the
# assembled feature vector, which contains only the double-typed columns.

## Pipeline

In [28]:
# Fit the whole pipeline (feature assembly + decision tree) on the training split.
pipelineModel = pipeline.fit(trainDF)

## Visualize the Decision Tree

In [29]:
# The fitted tree is the last stage of the fitted pipeline.
dtModel = pipelineModel.stages[-1]
# Printed (rather than displayed as the cell value) so the multi-line
# if/else rule dump renders with real line breaks.
print(dtModel.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_f00db7cbd26e, depth=5, numNodes=61, numFeatures=5
  If (feature 3 <= 1080.9400024414062)
   If (feature 3 <= 958.4499816894531)
    If (feature 1 <= 845.6400146484375)
     If (feature 3 <= 818.2799987792969)
      If (feature 0 <= 821.2944946289062)
       Predict: 807.487132196841
      Else (feature 0 > 821.2944946289062)
       Predict: 821.7799987792969
     Else (feature 3 > 818.2799987792969)
      If (feature 3 <= 830.5450134277344)
       Predict: 829.1996663411459
      Else (feature 3 > 830.5450134277344)
       Predict: 841.1006005859375
    Else (feature 1 > 845.6400146484375)
     If (feature 3 <= 920.6299743652344)
      If (feature 0 <= 852.125)
       Predict: 853.4000244140625
      Else (feature 0 > 852.125)
       Predict: 912.344248453776
     Else (feature 3 > 920.6299743652344)
      If (feature 3 <= 941.010009765625)
       Predict: 934.9957138372928
      Else (feature 3 > 941.010009765625)
       Predict: 

## Feature Importance

In [30]:
# Grab the fitted tree (last pipeline stage) and show its per-feature importances.
# Entries are indexed by position in the assembled feature vector.
decisionTreeModel = pipelineModel.stages[-1]
decisionTreeModel.featureImportances

SparseVector(5, {0: 0.0265, 1: 0.0354, 2: 0.0, 3: 0.9381})

## Interpreting Feature Importance

In [31]:
import pandas as pd
# Pair each assembled input column with its importance score.
# featureImportances is a SparseVector; toArray() converts it to a dense array
# explicitly instead of relying on implicit element-by-element iteration.
featureImp = pd.DataFrame({
    "feature": vecAssembler.getInputCols(),
    "importance": decisionTreeModel.featureImportances.toArray(),
})
# Most influential features first.
featureImp.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
3,Close,0.938105
1,Low,0.035369
0,High,0.026484
2,Open,4.2e-05
4,AdjClose,0.0


## Apply model to test set

In [32]:
# Run the fitted pipeline on the held-out split; this adds the assembled
# "features" column and the model's "prediction" column.
predictionDF = pipelineModel.transform(testDF)

# Preview the feature vector, the true label ("Next") and the prediction side by side.
predictionDF.select("features", "Next", "prediction").show()

+--------------------+------------------+------------------+
|            features|              Next|        prediction|
+--------------------+------------------+------------------+
|[1436.96997070312...|1438.1400146484375|1379.6908624822443|
|[1438.14001464843...| 1415.699951171875|1379.6908624822443|
|[1415.69995117187...|1371.7039794921875|1379.6908624822443|
|[1371.70397949218...|1341.1400146484375|1322.2032114664714|
|[1341.14001464843...|1390.8699951171875|1298.9300537109375|
|[1390.86999511718...|1410.1500244140625|1322.2032114664714|
|[1410.15002441406...|1388.0899658203125|1322.2032114664714|
|[1388.08996582031...|1358.9100341796875|1379.6908624822443|
|[1358.91003417968...| 1306.219970703125|1322.2032114664714|
|[1306.21997070312...| 1254.760009765625|1298.9300537109375|
|[1254.76000976562...|1281.1500244140625|1237.4631596779336|
|[1281.15002441406...|   1260.9599609375|1237.4631596779336|
|[1260.9599609375,...|1193.8699951171875|1237.4631596779336|
|[1193.86999511718...| 1

In [33]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "prediction" column against the "Next" label; RMSE by default.
regEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Next",
    metricName="rmse",
)

rmse = regEvaluator.evaluate(predictionDF)
# Override the metric for this call only. Unlike setMetricName("r2"), this does
# not mutate regEvaluator, so it still computes RMSE if reused in a later cell.
r2 = regEvaluator.evaluate(predictionDF, {regEvaluator.metricName: "r2"})
print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

RMSE is 112.42707182860805
R2 is 0.5727359194946977
