In [19]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import pyspark
from pyspark.sql.types import StructField
from pyspark.sql.types import *

custom_schema = StructType([
        StructField("Date", StringType(), True),
        StructField("Open", FloatType(), True),
        StructField("High", FloatType(), True),
        StructField("Low", FloatType(), True), 
        StructField("Close", FloatType(), True),
        StructField("Negativity", FloatType(), True),
        StructField("Positivity", FloatType(), True)
    ])

In [22]:
df = spark.read.csv("/content/drive/MyDrive/data/Finance_industry_consolidated.csv", sep=',', schema = custom_schema, header=True)

In [23]:
df = df.dropna()
df.show()

+---------+-----+-----+-----+-----+----------+----------+
|     Date| Open| High|  Low|Close|Negativity|Positivity|
+---------+-----+-----+-----+-----+----------+----------+
| 4/2/2020| 38.4| 38.5| 38.4| 38.4|      15.7|      15.3|
| 4/3/2020|119.2|119.3|119.1|119.2|      15.7|      15.5|
| 4/6/2020|129.7|129.8|129.6|129.7|      12.0|      11.7|
| 4/7/2020|144.3|144.5|144.2|144.3|      12.5|      15.8|
| 4/8/2020|138.1|138.2|138.0|138.1|      15.7|      15.5|
| 4/9/2020|137.5|137.6|137.4|137.5|      13.4|      16.5|
|4/13/2020|135.7|135.8|135.6|135.7|      14.1|      14.5|
|4/14/2020|136.4|136.5|136.3|136.4|      12.0|      14.8|
|4/15/2020|133.3|133.4|133.2|133.3|      12.8|      18.4|
|4/16/2020|131.8|131.8|131.7|131.8|      21.1|      21.7|
|4/17/2020|132.6|132.7|132.5|132.6|      22.7|      33.2|
|4/20/2020|128.5|128.6|128.4|128.5|      10.9|      12.0|
|4/21/2020|123.8|123.8|123.7|123.8|      12.3|      16.9|
|4/22/2020|131.9|132.0|131.8|131.9|      14.3|      15.5|
|4/23/2020|135

In [24]:
from pyspark.ml.linalg import Vector  
from pyspark.ml.feature import VectorAssembler

In [25]:
vector_assmebler=VectorAssembler(inputCols=['Open', 'High', 'Low', 'Close'] 
                                ,outputCol='features')
df_transformed=vector_assmebler.transform(df)

df_transformed.show()

+---------+-----+-----+-----+-----+----------+----------+--------------------+
|     Date| Open| High|  Low|Close|Negativity|Positivity|            features|
+---------+-----+-----+-----+-----+----------+----------+--------------------+
| 4/2/2020| 38.4| 38.5| 38.4| 38.4|      15.7|      15.3|[38.4000015258789...|
| 4/3/2020|119.2|119.3|119.1|119.2|      15.7|      15.5|[119.199996948242...|
| 4/6/2020|129.7|129.8|129.6|129.7|      12.0|      11.7|[129.699996948242...|
| 4/7/2020|144.3|144.5|144.2|144.3|      12.5|      15.8|[144.300003051757...|
| 4/8/2020|138.1|138.2|138.0|138.1|      15.7|      15.5|[138.100006103515...|
| 4/9/2020|137.5|137.6|137.4|137.5|      13.4|      16.5|[137.5,137.600006...|
|4/13/2020|135.7|135.8|135.6|135.7|      14.1|      14.5|[135.699996948242...|
|4/14/2020|136.4|136.5|136.3|136.4|      12.0|      14.8|[136.399993896484...|
|4/15/2020|133.3|133.4|133.2|133.3|      12.8|      18.4|[133.300003051757...|
|4/16/2020|131.8|131.8|131.7|131.8|      21.1|      

In [26]:
df_neg_model=df_transformed.select('features','Negativity')
df_neg_model.show( truncate= False)

+-----------------------------------------------------------------------------+----------+
|features                                                                     |Negativity|
+-----------------------------------------------------------------------------+----------+
|[38.400001525878906,38.5,38.400001525878906,38.400001525878906]              |15.7      |
|[119.19999694824219,119.30000305175781,119.0999984741211,119.19999694824219] |15.7      |
|[129.6999969482422,129.8000030517578,129.60000610351562,129.6999969482422]   |12.0      |
|[144.3000030517578,144.5,144.1999969482422,144.3000030517578]                |12.5      |
|[138.10000610351562,138.1999969482422,138.0,138.10000610351562]              |15.7      |
|[137.5,137.60000610351562,137.39999389648438,137.5]                          |13.4      |
|[135.6999969482422,135.8000030517578,135.60000610351562,135.6999969482422]   |14.1      |
|[136.39999389648438,136.5,136.3000030517578,136.39999389648438]              |12.0      |

In [27]:
df_pos_model=df_transformed.select('features','Positivity')
df_pos_model.show( truncate= False)

+-----------------------------------------------------------------------------+----------+
|features                                                                     |Positivity|
+-----------------------------------------------------------------------------+----------+
|[38.400001525878906,38.5,38.400001525878906,38.400001525878906]              |15.3      |
|[119.19999694824219,119.30000305175781,119.0999984741211,119.19999694824219] |15.5      |
|[129.6999969482422,129.8000030517578,129.60000610351562,129.6999969482422]   |11.7      |
|[144.3000030517578,144.5,144.1999969482422,144.3000030517578]                |15.8      |
|[138.10000610351562,138.1999969482422,138.0,138.10000610351562]              |15.5      |
|[137.5,137.60000610351562,137.39999389648438,137.5]                          |16.5      |
|[135.6999969482422,135.8000030517578,135.60000610351562,135.6999969482422]   |14.5      |
|[136.39999389648438,136.5,136.3000030517578,136.39999389648438]              |14.8      |

In [28]:
(training, test) = df_neg_model.randomSplit([0.8, 0.2], seed = 1234)
print (training.first())

Row(features=DenseVector([38.4, 38.5, 38.4, 38.4]), Negativity=15.699999809265137)


In [29]:
#Linear regression model
from pyspark.ml.regression import LinearRegression
Linear_Regression=LinearRegression(labelCol='Negativity')
regression = Linear_Regression.fit(training)

In [30]:
print(f"intercept: {regression.intercept}")
print(f"coefficiencts: {regression.coefficients}")

intercept: 15.329727057615207
coefficiencts: [0.03847862667219166,-3.944818926548029,-5.010474844323829,8.896465244640801]


In [31]:
training_predictions=regression.evaluate(training)

print("Training R squared:", training_predictions.r2)

Training R squared: 0.05668574059551579


In [32]:
test_prediction  = regression.evaluate(test)

In [33]:
print("Testing MSE: ", test_prediction.meanSquaredError)

Testing MSE:  5.349744448151289


In [34]:
#Predictions 
df_prediction = test_prediction.predictions
df_prediction.orderBy('prediction',ascending=False).show(10, truncate = False)

+----------------------------------------------------------------------------+----------+------------------+
|features                                                                    |Negativity|prediction        |
+----------------------------------------------------------------------------+----------+------------------+
|[119.19999694824219,119.30000305175781,119.0999984741211,119.19999694824219]|15.7      |13.010552961437304|
|[147.39999389648438,147.39999389648438,147.3000030517578,147.39999389648438]|14.1      |12.831153599009701|
|[129.6999969482422,129.8000030517578,129.60000610351562,129.6999969482422]  |12.0      |12.79684078917993 |
|[151.89999389648438,151.89999389648438,151.8000030517578,151.89999389648438]|12.6      |12.739579050994768|
|[152.10000610351562,152.10000610351562,152.0,152.10000610351562]            |12.7      |12.735585276449944|
|[153.5,153.5,153.39999389648438,153.5]                                      |11.4      |12.707095541273281|
|[137.3999938964843

In [35]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'Negativity')
dt_model = dt.fit(training)
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(labelCol="Negativity", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on negative test data = %g" % rmse)

Root Mean Squared Error (RMSE) on negative test data = 2.29333


In [36]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'Negativity', maxIter=10)
gbt_model = gbt.fit(training)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select('prediction', 'Negativity', 'features').show(5)

+------------------+----------+--------------------+
|        prediction|Negativity|            features|
+------------------+----------+--------------------+
|14.127029355596385|      15.7|[119.199996948242...|
|14.127029355596385|      12.0|[129.699996948242...|
| 11.95657473066898|      11.6|[137.399993896484...|
| 11.95657473066898|      10.7|[137.699996948242...|
| 11.95657473066898|      16.0|[137.699996948242...|
+------------------+----------+--------------------+
only showing top 5 rows



In [37]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Negativity", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on negative test data = %g" % rmse)

Root Mean Squared Error (RMSE) on negative test data = 2.33668


In [38]:
(training, test) = df_pos_model.randomSplit([0.8, 0.2], seed = 1234)
print (training.first())

Row(features=DenseVector([38.4, 38.5, 38.4, 38.4]), Positivity=15.300000190734863)


In [39]:
#Linear regression model
from pyspark.ml.regression import LinearRegression
Linear_Regression=LinearRegression(labelCol='Positivity')
regression = Linear_Regression.fit(training)

In [40]:
print(f"intercept: {regression.intercept}")
print(f"coefficiencts: {regression.coefficients}")

intercept: 15.124311079803125
coefficiencts: [4.507992847177579,-0.09980922453324602,1.396303737353174,-5.813558554808753]


In [41]:
training_predictions=regression.evaluate(training)

print("Training R squared:", training_predictions.r2)

Training R squared: 0.003804751746369406


In [42]:
test_prediction  = regression.evaluate(test)

In [43]:
print("Testing MSE: ", test_prediction.meanSquaredError)

Testing MSE:  5.465836408626888


In [44]:
#Predictions 
df_prediction = test_prediction.predictions
df_prediction.orderBy('prediction',ascending=False).show(10, truncate = False)

+----------------------------------------------------------------------------+----------+------------------+
|features                                                                    |Positivity|prediction        |
+----------------------------------------------------------------------------+----------+------------------+
|[119.19999694824219,119.30000305175781,119.0999984741211,119.19999694824219]|15.5      |13.893414911200399|
|[129.6999969482422,129.8000030517578,129.60000610351562,129.6999969482422]  |11.7      |13.79817801863435 |
|[183.6999969482422,183.8000030517578,183.5,183.60000610351562]              |16.6      |13.74999723349281 |
|[137.39999389648438,137.5,137.3000030517578,137.39999389648438]             |14.7      |13.7283298462708  |
|[137.6999969482422,137.8000030517578,137.60000610351562,137.6999969482422]  |13.5      |13.725608460144304|
|[137.6999969482422,137.8000030517578,137.60000610351562,137.6999969482422]  |16.5      |13.725608460144304|
|[138.1000061035156

In [45]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'Positivity')
dt_model = dt.fit(training)
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(labelCol="Positivity", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on positive test data = %g" % rmse)

Root Mean Squared Error (RMSE) on positive test data = 2.35281


In [46]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'Positivity', maxIter=10)
gbt_model = gbt.fit(training)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select('prediction', 'Positivity', 'features').show(5)

+-----------------+----------+--------------------+
|       prediction|Positivity|            features|
+-----------------+----------+--------------------+
|17.06882772393525|      15.5|[119.199996948242...|
|17.06882772393525|      11.7|[129.699996948242...|
|13.61524709223899|      14.7|[137.399993896484...|
|13.61524709223899|      13.5|[137.699996948242...|
|13.61524709223899|      16.5|[137.699996948242...|
+-----------------+----------+--------------------+
only showing top 5 rows



In [47]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Positivity", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on positive test data = %g" % rmse)

Root Mean Squared Error (RMSE) on positive test data = 2.39576
