# Log Scale

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 6.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=47a31ad081b8fd69b001bc09c12064c2cc0406c106234b923d62bfd37c2af603
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
from IPython.display import display

In [3]:
# Name under which this notebook's Spark application is registered.
spark_application_name: str = "Spark_Application_Name"

In [4]:
# Obtain the session for this notebook: getOrCreate() returns the already-active
# SparkSession when there is one, otherwise it builds a new one with this app name.
spark = (
    SparkSession.builder
    .appName(spark_application_name)
    .getOrCreate()
)

In [5]:
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

filePath = "stocks-final.parquet"

# A window with no partition columns, ordered by "Date": percent_rank() over it
# gives every row its relative position (0.0 .. 1.0) in the date ordering.
dateOrderedWindow = Window.partitionBy().orderBy("Date")

# Load the data, attach the rank, and discard the "company_name" column.
stocksDF = (
    spark.read.parquet(filePath)
    .withColumn("rank", percent_rank().over(dateOrderedWindow))
    .drop("company_name")
)

# Date-ordered split: rows ranked at or below 0.8 form the training set, the
# remainder form the test set. The helper "rank" column is removed from both.
trainDF = stocksDF.filter("rank <= .8").drop("rank")
testDF = stocksDF.filter("rank > .8").drop("rank")

In [6]:
from pyspark.sql.functions import col, log
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

# Natural log of the "Next" column, added to both splits as the "log_next" column.
logOfNext = log(col("Next"))
logTrainDF = trainDF.withColumn("log_next", logOfNext)
logTestDF = testDF.withColumn("log_next", logOfNext)

from pyspark.ml.feature import VectorAssembler

# Feature columns: every "double"-typed column of the training frame except the
# raw target "Next". The list is built from trainDF, which has no "log_next" column.
numericCols = [
    name
    for name, sparkType in trainDF.dtypes
    if sparkType == "double" and name != "Next"
]

# Stage 1: pack the feature columns into a single "features" vector column.
vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

# Stage 2: linear regression on the log-scale label; predictions go to "log_pred".
lr = LinearRegression(labelCol="log_next", predictionCol="log_pred")

# Fit both stages on the training split, then score the test split.
pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(logTrainDF)
predictionDF = pipelineModel.transform(logTestDF)

## Exponentiate

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, exp

# Undo the log transform: exp(log_pred) puts predictions back on the scale of "Next".
expDF = predictionDF.withColumn("prediction", exp(col("log_pred")))

# Score the back-transformed predictions against the raw "Next" values.
regEvaluator = RegressionEvaluator(labelCol="Next", predictionCol="prediction")

regEvaluator.setMetricName("rmse")
rmse = regEvaluator.evaluate(expDF)

regEvaluator.setMetricName("r2")
r2 = regEvaluator.evaluate(expDF)

print(f"RMSE is {rmse}")
print(f"R2 is {r2}")

RMSE is 107.45571599482726
R2 is 0.6096864492272256


## Inspect Predictions on the Test Set

In [8]:
# Print the first 10 rows: assembled features, actual "Next", and the prediction.
previewDF = expDF.select("features", "Next", "prediction")
previewDF.show(10)

+--------------------+------------------+------------------+
|            features|              Next|        prediction|
+--------------------+------------------+------------------+
|[1436.96997070312...|1438.1400146484375|1477.1218308939633|
|[1438.14001464843...| 1415.699951171875|1460.0566315762965|
|[1415.69995117187...|1371.7039794921875|1450.0198583333124|
|[1371.70397949218...|1341.1400146484375|1371.7448274763638|
|[1341.14001464843...|1390.8699951171875|1399.9846185911151|
|[1390.86999511718...|1410.1500244140625|1455.2577405579018|
|[1410.15002441406...|1388.0899658203125|1418.3932186939137|
|[1388.08996582031...|1358.9100341796875|1439.6753633039857|
|[1358.91003417968...| 1306.219970703125| 1367.325193292934|
|[1306.21997070312...| 1254.760009765625|  1331.44913530293|
+--------------------+------------------+------------------+
only showing top 10 rows

