In [31]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [32]:
import pyspark
from pyspark.sql.types import StructField
from pyspark.sql.types import *

# Explicit schema for the CSV: a string Date column followed by six float columns.
# Every field is declared nullable (third StructField argument is True).
float_columns = ["Open", "High", "Low", "Close", "Negativity", "Positivity"]
custom_schema = StructType(
    [StructField("Date", StringType(), True)]
    + [StructField(column_name, FloatType(), True) for column_name in float_columns]
)

In [33]:
# Read the price/sentiment CSV using the explicit schema instead of inferring types.
# NOTE(review): the path is under /content/drive — presumably requires Google Drive
# to be mounted by an earlier step that is not shown here; confirm.
df = spark.read.csv(
    "/content/drive/MyDrive/data/NegPosData.csv",
    schema=custom_schema,
    header=True,
    sep=',',
)

In [34]:
# Discard every row that has a null in any column, then preview the result.
df = df.na.drop()
df.show()

+----------+----+----+----+-----+----------+----------+
|      Date|Open|High| Low|Close|Negativity|Positivity|
+----------+----+----+----+-----+----------+----------+
|2020-04-01|66.0|66.0|66.0| 66.0|      0.67|      0.27|
|2020-04-02|66.0|66.0|66.0| 66.0|       0.0|       0.5|
|2020-04-03|67.0|67.0|67.0| 67.0|      0.26|     0.115|
|2020-04-07|69.0|69.0|69.0| 69.0|       0.0|     0.275|
|2020-04-08|69.0|69.0|69.0| 69.0|       0.0|       0.0|
|2020-04-09|66.0|66.0|66.0| 66.0|     0.201|       0.0|
|2020-04-13|71.0|71.0|71.0| 71.0|       0.0|       0.0|
|2020-04-16|76.0|76.0|76.0| 76.0|    0.1178|      0.76|
|2020-04-17|75.0|75.0|75.0| 75.0|     0.251|    0.1233|
|2020-04-21|74.0|74.0|74.0| 74.0|     0.203|      0.36|
|2020-04-23|75.0|75.0|75.0| 75.0|       0.0|       0.0|
|2020-04-28|72.0|72.0|72.0| 72.0|     0.416|       0.0|
|2020-04-29|72.0|72.0|72.0| 72.0|     0.668|       0.0|
|2020-04-30|72.0|72.0|72.0| 72.0|     0.127|       0.0|
|2020-05-01|72.0|72.0|71.0| 72.0|     0.459|    

In [35]:
from pyspark.ml.linalg import Vector  
from pyspark.ml.feature import VectorAssembler

In [36]:
# Pack the two sentiment scores into the single 'features' vector column that the
# models below are fit on.
# (The variable's spelling is left unchanged so any other cell using it keeps working.)
vector_assmebler = VectorAssembler(
    inputCols=['Negativity', 'Positivity'],
    outputCol='features',
)
df_transformed = vector_assmebler.transform(df)

df_transformed.show()

+----------+----+----+----+-----+----------+----------+--------------------+
|      Date|Open|High| Low|Close|Negativity|Positivity|            features|
+----------+----+----+----+-----+----------+----------+--------------------+
|2020-04-01|66.0|66.0|66.0| 66.0|      0.67|      0.27|[0.67000001668930...|
|2020-04-02|66.0|66.0|66.0| 66.0|       0.0|       0.5|           [0.0,0.5]|
|2020-04-03|67.0|67.0|67.0| 67.0|      0.26|     0.115|[0.25999999046325...|
|2020-04-07|69.0|69.0|69.0| 69.0|       0.0|     0.275|[0.0,0.2750000059...|
|2020-04-08|69.0|69.0|69.0| 69.0|       0.0|       0.0|           (2,[],[])|
|2020-04-09|66.0|66.0|66.0| 66.0|     0.201|       0.0|[0.20100000500679...|
|2020-04-13|71.0|71.0|71.0| 71.0|       0.0|       0.0|           (2,[],[])|
|2020-04-16|76.0|76.0|76.0| 76.0|    0.1178|      0.76|[0.11779999732971...|
|2020-04-17|75.0|75.0|75.0| 75.0|     0.251|    0.1233|[0.25099998712539...|
|2020-04-21|74.0|74.0|74.0| 74.0|     0.203|      0.36|[0.20299999415874...|

In [37]:
# Keep only what the models need: the assembled feature vector and the Close label.
df_neg_model = df_transformed.select(['features', 'Close'])
df_neg_model.show(truncate=False)

+-----------------------------------------+-----+
|features                                 |Close|
+-----------------------------------------+-----+
|[0.6700000166893005,0.27000001072883606] |66.0 |
|[0.0,0.5]                                |66.0 |
|[0.25999999046325684,0.11500000208616257]|67.0 |
|[0.0,0.2750000059604645]                 |69.0 |
|(2,[],[])                                |69.0 |
|[0.20100000500679016,0.0]                |66.0 |
|(2,[],[])                                |71.0 |
|[0.11779999732971191,0.7599999904632568] |76.0 |
|[0.25099998712539673,0.12330000102519989]|75.0 |
|[0.2029999941587448,0.36000001430511475] |74.0 |
|(2,[],[])                                |75.0 |
|[0.41600000858306885,0.0]                |72.0 |
|[0.6679999828338623,0.0]                 |72.0 |
|[0.12700000405311584,0.0]                |72.0 |
|[0.45899999141693115,0.0]                |72.0 |
|[0.0,0.5019999742507935]                 |70.0 |
|[0.0,0.19099999964237213]                |72.0 |


In [38]:
# Seeded 80/20 split into training and test sets; print one training row as a sanity check.
training, test = df_neg_model.randomSplit(weights=[0.8, 0.2], seed=1234)
print(training.first())

Row(features=SparseVector(2, {}), Close=67.0)


In [39]:
# Linear regression model: fit Close against the assembled feature vector.
from pyspark.ml.regression import LinearRegression
# featuresCol is spelled out for readability; 'features' is also the estimator's default.
Linear_Regression = LinearRegression(featuresCol='features', labelCol='Close')
regression = Linear_Regression.fit(training)

In [40]:
# Report the fitted intercept and the per-feature coefficients, one per line.
# NOTE(review): "coefficiencts" in the label is a typo for "coefficients"; left
# as-is here so the printed text still matches the recorded output.
print(
    f"intercept: {regression.intercept}",
    f"coefficiencts: {regression.coefficients}",
    sep="\n",
)

intercept: 75.81966933332883
coefficiencts: [-1.4196434934190554,1.418872987280451]


In [41]:
# Evaluate the fitted model on the training split and report its R squared.
# Despite its name, training_predictions holds the evaluation summary object
# (it exposes metrics such as .r2), not a predictions DataFrame.
training_predictions = regression.evaluate(training)

training_r2 = training_predictions.r2
print("Training R squared:", training_r2)

Training R squared: 0.005408814463458866


In [42]:
# Evaluate the fitted linear model on the held-out test split; the returned summary
# is read by the following cells for MSE, RMSE and the predictions DataFrame.
test_prediction = regression.evaluate(dataset=test)

In [43]:
# Mean squared error of the linear model on the test split.
test_mse = test_prediction.meanSquaredError
print("Testing MSE: ", test_mse)

Testing MSE:  38.42201552450965


In [44]:
# Root mean squared error of the linear model on the test split.
# NOTE(review): "Lindear" in the message is a typo for "Linear"; left as-is here
# so the printed text still matches the recorded output.
rmse = test_prediction.rootMeanSquaredError
rmse_message = "Root Mean Squared Error (RMSE) on Lindear Regression test data = %g"
print(rmse_message % rmse)

Root Mean Squared Error (RMSE) on Lindear Regression test data = 6.19855


In [45]:
# Predictions: show the ten test rows with the highest predicted Close.
df_prediction = test_prediction.predictions
(
    df_prediction
    .orderBy('prediction', ascending=False)
    .show(10, truncate=False)
)

+----------------------------------------+-----+-----------------+
|features                                |Close|prediction       |
+----------------------------------------+-----+-----------------+
|[0.0,0.9539999961853027]                |70.0 |77.1732741577818 |
|[0.0,0.7710000276565552]                |73.0 |76.9136204457632 |
|[0.11060000211000443,0.8700000047683716]|78.0 |76.89707626566094|
|[0.11779999732971191,0.7599999904632568]|76.0 |76.73077879039664|
|[0.0,0.6079999804496765]                |69.0 |76.68234408185592|
|[0.20999999344348907,0.781000018119812] |81.0 |76.62968403779448|
|[0.4359999895095825,0.9700000286102295] |71.0 |76.57701162334709|
|[0.0,0.5139999985694885]                |83.0 |76.54897004676127|
|[0.0,0.45399999618530273]               |76.0 |76.4638376641416 |
|[0.0,0.4519999921321869]                |83.0 |76.46099991241617|
+----------------------------------------+-----+-----------------+
only showing top 10 rows



In [46]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
# Decision-tree regressor: fit on the training split, predict on the test split,
# and score the predictions with RMSE.
dt = DecisionTreeRegressor(labelCol='Close', featuresCol='features')
dt_model = dt.fit(training)
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Close",
    predictionCol="prediction",
)
# Note: this rebinds `rmse`, replacing any value an earlier cell stored under that name.
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 6.48074


In [47]:
from pyspark.ml.regression import GBTRegressor
# Gradient-boosted trees regressor (10 boosting iterations): fit on the training
# split, predict on the test split, and preview five predictions next to the label.
gbt = GBTRegressor(labelCol='Close', featuresCol='features', maxIter=10)
gbt_model = gbt.fit(training)
gbt_predictions = gbt_model.transform(test)
gbt_preview = gbt_predictions.select('prediction', 'Close', 'features')
gbt_preview.show(5)

+-----------------+-----+---------+
|       prediction|Close| features|
+-----------------+-----+---------+
|75.59253229853176| 67.0|(2,[],[])|
|75.59253229853176| 69.0|(2,[],[])|
|75.59253229853176| 72.0|(2,[],[])|
|75.59253229853176| 72.0|(2,[],[])|
|75.59253229853176| 72.0|(2,[],[])|
+-----------------+-----+---------+
only showing top 5 rows



In [48]:
# Score the gradient-boosted-trees predictions on the test split with RMSE.
gbt_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Close",
    predictionCol="prediction",
)
# Note: this rebinds `rmse`, replacing any value an earlier cell stored under that name.
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 6.6151
