In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [3]:
# Application name handed to SparkSession.builder.appName() when the session is created.
spark_application_name: str = "Spark_Application_Name"

In [4]:
# Configure the builder with the application name, then obtain the session
# (getOrCreate returns the already-running session when there is one).
session_builder = SparkSession.builder.appName(spark_application_name)
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/13 11:32:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/13 11:32:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Read the Parquet dataset at filePath into a Spark DataFrame.
filePath = "sf-stocks-clean.parquet"
stocks_reader = spark.read
stocksDF = stocks_reader.parquet(filePath)

                                                                                

## Train/Test Split

In [6]:
# 80% of the rows go to training and 20% to testing; the seed is fixed at 42
# so the split is not re-randomised on every run.
split_weights = [0.8, 0.2]
trainDF, testDF = stocksDF.randomSplit(split_weights, seed=42)

## StringIndexer, OneHotEncoder, and VectorAssembler

In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Every string-typed column of the training frame is treated as categorical.
categoricalCols = [name for name, kind in trainDF.dtypes if kind == "string"]

# Output column names: "<col>Index" for the label index, "<col>OHE" for the one-hot vector.
indexOutputCols = [f"{name}Index" for name in categoricalCols]
oheOutputCols = [f"{name}OHE" for name in categoricalCols]

# StringIndexer encodes a string column of labels as a column of label indices
# (a numeric input column is cast to string first and the string values are indexed).
# handleInvalid="skip" filters out rows with invalid data instead of raising an error.
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

# One-hot encoding converts a categorical attribute into a binary vector: the encoder
# maps each column of category indices to a column of binary vectors with at most a
# single one-value per row, marking the input category index.
oheEncoder = OneHotEncoder(
    inputCols=indexOutputCols,
    outputCols=oheOutputCols,
)

In [8]:
from pyspark.ml.feature import VectorAssembler

# Numeric features: every double-typed column except the label column "Low".
numericCols = [
    name
    for name, kind in trainDF.dtypes
    if kind == "double" and name != "Low"
]

# One-hot encoded categoricals come first, followed by the numeric columns.
assemblerInputs = [*oheOutputCols, *numericCols]

# VectorAssembler is a transformer that merges the listed columns into one vector
# column, here named "features".
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)

## Linear Regression

In [9]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: learns to predict "Low" from the assembled "features" vector.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Low",
)

## Pipeline

In [10]:
# Option 1: StringIndexer + OHE + VectorAssembler
from pyspark.ml import Pipeline

# Stage order matters: index the strings, one-hot encode the indices,
# assemble the feature vector, then fit the regressor on it.
stages = [
    stringIndexer,
    oheEncoder,
    vecAssembler,
    lr,
]
pipeline = Pipeline(stages=stages)

# Fit all stages on the training split only, then score the test split.
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)

# Preview the feature vector next to the actual and the predicted label.
preview_columns = ["features", "Low", "prediction"]
predDF.select(*preview_columns).show(10)

22/06/13 11:32:17 WARN Instrumentation: [1c247cd9] regParam is zero, which might cause numerical instability and overfitting.


+--------------------+------------------+------------------+
|            features|              High|        prediction|
+--------------------+------------------+------------------+
|(11,[0,6,7,8,9,10...| 45.49599838256836|46.304193414515446|
|(11,[0,6,7,8,9,10...| 45.99599838256836| 46.55924680098951|
|(11,[0,6,7,8,9,10...| 47.56999969482422| 47.74904183179527|
|(11,[0,6,7,8,9,10...|50.178001403808594|   50.284607650766|
|(11,[0,6,7,8,9,10...|51.178001403808594|50.876756405420636|
+--------------------+------------------+------------------+
only showing top 5 rows



## Evaluate the Model: RMSE and R2

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "prediction" column with the "Low" label; RMSE is the configured metric.
regressionEvaluator = RegressionEvaluator(
    labelCol="Low",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the test-set predictions and report the value rounded to two decimals.
raw_rmse = regressionEvaluator.evaluate(predDF)
rmse = round(raw_rmse, 2)
print(f"RMSE is {rmse}")

RMSE is 6.35


In [12]:
# In finance, an R-Squared above 0.7 would generally be seen as showing a high level of
# correlation, whereas a measure below 0.4 would show a low correlation.

# Request R2 through a per-call param map instead of setMetricName("r2"): the setter
# mutates the shared evaluator in place, so the evaluator configured for RMSE would
# report R2 from then on. The param map overrides the metric for this call only and
# leaves regressionEvaluator unchanged.
r2_override = {regressionEvaluator.metricName: "r2"}
r2 = round(regressionEvaluator.evaluate(predDF, r2_override), 2)
print(f"R2 is {r2}")

R2 is 1.0


In [15]:
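# IT IS OVERFITTING! (original conclusion)
# Caveat: the RMSE and R2 above are computed on predictions for the held-out testDF, so a
# near-perfect R2 is not by itself evidence of overfitting. A likelier explanation is target
# leakage: every double column other than "Low" is used as a feature -- TODO confirm which
# columns those are and whether they would be known at prediction time.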

## Saving and Loading Models

In [13]:
# Persist the fitted pipeline; overwrite() lets the save replace anything already at the path.
pipelinePath = "lr-pipeline-model"
model_writer = pipelineModel.write().overwrite()
model_writer.save(pipelinePath)

In [14]:
from pyspark.ml import PipelineModel

# Restore the fitted pipeline from the path it was saved to
# (PipelineModel.load(path) is a shortcut for read().load(path)).
model_reader = PipelineModel.read()
savedPipelineModel = model_reader.load(pipelinePath)

                                                                                