In [41]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
from pyspark.sql import SparkSession

In [43]:
# Application name handed to SparkSession.builder.appName(...) when the session is built.
spark_application_name: str = "Spark_Application_Name"

In [44]:
# getOrCreate() hands back an existing session when one is already active,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

In [45]:
# Parquet file with the prepared stock data; the path is relative to the
# notebook's working directory.
filePath = "stocks-final.parquet"
stocksDF = spark.read.format("parquet").load(filePath)

# display() on a Spark DataFrame shows only its schema repr here, not rows.
display(stocksDF)

DataFrame[Date: date, High: double, Low: double, Open: double, Close: double, Volume: int, AdjClose: double, company_name: string, Next: double]

## Train/Test Split

In [46]:
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

# Rank every row by Date over one global (unpartitioned) window; percent_rank
# yields a value in [0, 1] that the split cells below threshold at 0.8.
# NOTE(review): an unpartitioned window typically makes Spark gather all rows
# into a single partition — fine for a small dataset, confirm for larger inputs.
dateOrderedWindow = Window.orderBy("Date")
stocksDF = stocksDF.withColumn("rank", percent_rank().over(dateOrderedWindow))

In [47]:
# Training set: rows whose date rank is at most 0.8. The helper "rank" column
# is dropped so it cannot be picked up as a feature.
trainDF = (
    stocksDF
    .filter("rank <= .8")
    .drop("rank")
)
trainDF.show()

+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+-----------------+
|      Date|             High|              Low|             Open|            Close| Volume|         AdjClose|company_name|             Next|
+----------+-----------------+-----------------+-----------------+-----------------+-------+-----------------+------------+-----------------+
|2017-01-03|789.6300048828125|775.7999877929688|778.8099975585938|786.1400146484375|1657300|786.1400146484375|      GOOGLE|791.3400268554688|
|2017-01-04|791.3400268554688|783.1599731445312|788.3599853515625|786.9000244140625|1073000|786.9000244140625|      GOOGLE|  794.47998046875|
|2017-01-05|  794.47998046875|  785.02001953125|786.0800170898438|  794.02001953125|1335200|  794.02001953125|      GOOGLE|807.9000244140625|
|2017-01-06|807.9000244140625|792.2039794921875| 795.260009765625|806.1500244140625|1640200|806.1500244140625|      GOOGLE|809.9660034179688|
|2017-

In [48]:
# Test set: rows whose date rank is above 0.8, i.e. the complement of the
# training filter. The helper "rank" column is dropped here as well.
testDF = (
    stocksDF
    .filter("rank > .8")
    .drop("rank")
)
testDF.show()

+----------+------------------+------------------+------------------+------------------+-------+------------------+------------+------------------+
|      Date|              High|               Low|              Open|             Close| Volume|          AdjClose|company_name|              Next|
+----------+------------------+------------------+------------------+------------------+-------+------------------+------------+------------------+
|2020-02-24| 1436.969970703125|1411.3900146484375|1426.1099853515625|1421.5899658203125|2867100|1421.5899658203125|      GOOGLE|1438.1400146484375|
|2020-02-25|1438.1400146484375|1382.4000244140625|            1433.0| 1388.449951171875|2478300| 1388.449951171875|      GOOGLE| 1415.699951171875|
|2020-02-26| 1415.699951171875|            1379.0|1396.1400146484375|1393.1800537109375|2202400|1393.1800537109375|      GOOGLE|1371.7039794921875|
|2020-02-27|1371.7039794921875|1317.1700439453125|  1362.06005859375|1318.0899658203125|2978300|1318.08996582031

## Vector Assembler

In [49]:
from pyspark.ml.feature import VectorAssembler

# Feature columns: every column typed "double" except the label column "Next".
# Plain Python booleans are combined with `and`; the bitwise `&` form is only
# needed for Column expressions.
# NOTE(review): Volume is typed int in the loaded schema, so this filter leaves
# it out of the feature vector — confirm that is intended.
numericCols = [
    field
    for (field, dataType) in trainDF.dtypes
    if dataType == "double" and field != "Next"
]

# Pack the selected columns into a single vector column named "features".
vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

# Training frame with the assembled "features" column added.
vecTrainDF = vecAssembler.transform(trainDF)

## Linear Regression

In [50]:
from pyspark.ml.regression import LinearRegression

# Linear regression that predicts the "Next" column from the assembled
# "features" vector, with regParam set to 0.01.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Next",
    regParam=0.01,
)

# Fit directly on the already-assembled training frame.
lrModel = lr.fit(vecTrainDF)

## Pipeline

In [51]:
from pyspark.ml import Pipeline

# Chain feature assembly and the regressor so both steps are applied together;
# fitting takes the raw training frame because the assembler runs as stage one.
pipelineStages = [vecAssembler, lr]
pipeline = Pipeline(stages=pipelineStages)
pipelineModel = pipeline.fit(trainDF)

## Apply to Test Set

In [52]:
# Run the fitted pipeline over the held-out rows; this adds the "features"
# and "prediction" columns.
predictionDF = pipelineModel.transform(testDF)

# Show the label next to the prediction for the first ten rows.
previewCols = ["features", "Next", "prediction"]
predictionDF.select(previewCols).show(10)

+--------------------+------------------+------------------+
|            features|              Next|        prediction|
+--------------------+------------------+------------------+
|[1436.96997070312...|1438.1400146484375|1433.3858691797398|
|[1438.14001464843...| 1415.699951171875|1408.5239465590419|
|[1415.69995117187...|1371.7039794921875|  1408.25904452282|
|[1371.70397949218...|1341.1400146484375|1339.6268740697183|
|[1341.14001464843...|1390.8699951171875|1356.9669645018753|
|[1390.86999511718...|1410.1500244140625|1403.0954723388015|
|[1410.15002441406...|1388.0899658203125|1367.4898097392918|
|[1388.08996582031...|1358.9100341796875|1398.3019850759447|
|[1358.91003417968...| 1306.219970703125| 1337.199784601689|
|[1306.21997070312...| 1254.760009765625|1311.6375892531723|
+--------------------+------------------+------------------+
only showing top 10 rows



## Baseline Model

In [55]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the test-set predictions against the "Next" label.
regEvaluator = RegressionEvaluator(labelCol="Next", predictionCol="prediction")

# setMetricName mutates the evaluator, so metrics are computed one after the
# other in a fixed order: "rmse" first, then "r2".
metricValues = {
    metricName: regEvaluator.setMetricName(metricName).evaluate(predictionDF)
    for metricName in ("rmse", "r2")
}
rmse = metricValues["rmse"]
r2 = metricValues["r2"]

print(f"RMSE is {round(rmse, 2)}")
print(f"R2 is {round(r2, 2)}")

RMSE is 26.95
R2 is 0.98
