In [212]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [213]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [214]:
import pyspark
from pyspark.sql.types import StructField
from pyspark.sql.types import *

custom_schema = StructType([
        StructField("Date", StringType(), True),
        StructField("Open", FloatType(), True),
        StructField("High", FloatType(), True),
        StructField("Low", FloatType(), True), 
        StructField("Close", FloatType(), True),
        StructField("Negativity", FloatType(), True),
        StructField("Positivity", FloatType(), True)
    ])

In [215]:
df = spark.read.csv("/content/drive/MyDrive/data/Tech_Industry_consolidated.csv", sep=',', schema = custom_schema, header=True)

In [216]:
df = df.dropna()
df.show()

+---------+-----+-----+-----+-----+----------+----------+
|     Date| Open| High|  Low|Close|Negativity|Positivity|
+---------+-----+-----+-----+-----+----------+----------+
| 4/1/2020|839.9|840.4|839.3|839.8|     12.71|     16.98|
| 4/2/2020|837.1|837.8|836.6|837.1|     15.67|     15.28|
| 4/3/2020|854.0|854.4|853.5|853.9|     15.75|     15.48|
| 4/6/2020|857.0|857.4|856.6|857.0|     12.03|     11.66|
| 4/7/2020|889.3|889.8|888.7|889.2|     12.46|     15.75|
| 4/8/2020|878.6|879.0|878.2|878.6|     15.68|     15.53|
| 4/9/2020|905.8|906.3|905.4|905.8|     13.43|     16.51|
|4/13/2020|929.6|930.1|929.1|929.6|     14.07|     14.52|
|4/15/2020|999.2|999.8|998.6|999.2|     12.82|     18.43|
|4/21/2020|949.0|949.8|948.1|949.0|     12.28|     16.93|
|4/22/2020|952.5|953.0|952.0|952.5|     14.26|     15.48|
|4/24/2020|954.5|954.9|954.1|954.5|     11.82|     16.67|
|4/27/2020|992.1|992.5|991.6|992.1|      9.13|     10.46|
|4/28/2020|984.0|984.6|983.4|984.0|     13.59|     13.47|
|5/27/2020|987

In [217]:
from pyspark.ml.linalg import Vector  
from pyspark.ml.feature import VectorAssembler

In [218]:
vector_assmebler=VectorAssembler(inputCols=['Negativity', 'Positivity'] 
                                ,outputCol='features')
df_transformed=vector_assmebler.transform(df)

df_transformed.show()

+---------+-----+-----+-----+-----+----------+----------+--------------------+
|     Date| Open| High|  Low|Close|Negativity|Positivity|            features|
+---------+-----+-----+-----+-----+----------+----------+--------------------+
| 4/1/2020|839.9|840.4|839.3|839.8|     12.71|     16.98|[12.7100000381469...|
| 4/2/2020|837.1|837.8|836.6|837.1|     15.67|     15.28|[15.6700000762939...|
| 4/3/2020|854.0|854.4|853.5|853.9|     15.75|     15.48|[15.75,15.4799995...|
| 4/6/2020|857.0|857.4|856.6|857.0|     12.03|     11.66|[12.0299997329711...|
| 4/7/2020|889.3|889.8|888.7|889.2|     12.46|     15.75|[12.4600000381469...|
| 4/8/2020|878.6|879.0|878.2|878.6|     15.68|     15.53|[15.6800003051757...|
| 4/9/2020|905.8|906.3|905.4|905.8|     13.43|     16.51|[13.4300003051757...|
|4/13/2020|929.6|930.1|929.1|929.6|     14.07|     14.52|[14.0699996948242...|
|4/15/2020|999.2|999.8|998.6|999.2|     12.82|     18.43|[12.8199996948242...|
|4/21/2020|949.0|949.8|948.1|949.0|     12.28|     1

In [219]:
df_neg_model=df_transformed.select('features','Close')
df_neg_model.show( truncate= False)

+---------------------------------------+-----+
|features                               |Close|
+---------------------------------------+-----+
|[12.710000038146973,16.979999542236328]|839.8|
|[15.670000076293945,15.279999732971191]|837.1|
|[15.75,15.479999542236328]             |853.9|
|[12.029999732971191,11.65999984741211] |857.0|
|[12.460000038146973,15.75]             |889.2|
|[15.680000305175781,15.529999732971191]|878.6|
|[13.430000305175781,16.510000228881836]|905.8|
|[14.069999694824219,14.520000457763672]|929.6|
|[12.819999694824219,18.43000030517578] |999.2|
|[12.279999732971191,16.93000030517578] |949.0|
|[14.260000228881836,15.479999542236328]|952.5|
|[11.819999694824219,16.670000076293945]|954.5|
|[9.130000114440918,10.460000038146973] |992.1|
|[13.59000015258789,13.470000267028809] |984.0|
|[13.09000015258789,14.779999732971191] |987.6|
|[13.699999809265137,12.59000015258789] |995.6|
+---------------------------------------+-----+



In [220]:
(training, test) = df_neg_model.randomSplit([0.8, 0.2], seed = 1234)
print (training.first())

Row(features=DenseVector([9.13, 10.46]), Close=992.0999755859375)


In [221]:
#Linear regression model
from pyspark.ml.regression import LinearRegression
Linear_Regression=LinearRegression(labelCol='Close')
regression = Linear_Regression.fit(training)

In [222]:
print(f"intercept: {regression.intercept}")
print(f"coefficiencts: {regression.coefficients}")

intercept: 1141.07137578496
coefficiencts: [-16.343478459274024,0.301810121813221]


In [223]:
training_predictions=regression.evaluate(training)

print("Training R squared:", training_predictions.r2)

Training R squared: 0.20296121453774474


In [224]:
test_prediction  = regression.evaluate(test)

In [225]:
print("Testing MSE: ", test_prediction.meanSquaredError)
print("Root Mean Square Error (RMSE) using Linear Regression: ", test_prediction.rootMeanSquaredError)

Testing MSE:  1404.9560734500637
Root Mean Square Error (RMSE) using Linear Regression:  37.48274367559109


In [226]:
#Predictions 
df_prediction = test_prediction.predictions
df_prediction.orderBy('prediction',ascending=False).show(10, truncate = False)

+---------------------------------------+-----+-----------------+
|features                               |Close|prediction       |
+---------------------------------------+-----+-----------------+
|[11.819999694824219,16.670000076293945]|954.5|952.9226401376276|
|[12.460000038146973,15.75]             |889.2|942.1851429775097|
+---------------------------------------+-----+-----------------+



In [227]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'Close')
dt_model = dt.fit(training)
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) using Decision Tree Model = %g" % rmse)

Root Mean Squared Error (RMSE) using Decision Tree Model = 80.8773


In [228]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'Close', maxIter=10)
gbt_model = gbt.fit(training)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select('prediction', 'Close', 'features').show(5)

+----------+-----+--------------------+
|prediction|Close|            features|
+----------+-----+--------------------+
|     857.0|954.5|[11.8199996948242...|
|     949.0|889.2|[12.4600000381469...|
+----------+-----+--------------------+



In [229]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) using Gradient Booting Decision Tree model = %g" % rmse)

Root Mean Squared Error (RMSE) using Gradient Booting Decision Tree model = 80.8773
