In [169]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [170]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [171]:
import pyspark
from pyspark.sql.types import StructField
from pyspark.sql.types import *

custom_schema = StructType([
        StructField("Date", StringType(), True),
        StructField("Open", FloatType(), True),
        StructField("High", FloatType(), True),
        StructField("Low", FloatType(), True), 
        StructField("Close", FloatType(), True),
        StructField("Negativity", FloatType(), True),
        StructField("Positivity", FloatType(), True)
    ])

In [172]:
df = spark.read.csv("/content/drive/MyDrive/data/Manufacturing_Industry_consolidated.csv", sep=',', schema = custom_schema, header=True)

In [173]:
df = df.dropna()
df.show()

+----------+----+----+----+-----+----------+----------+
|      Date|Open|High| Low|Close|Negativity|Positivity|
+----------+----+----+----+-----+----------+----------+
|04/02/2020|54.8|54.9|54.8| 54.8|      15.7|      15.3|
|04/03/2020|56.2|56.2|56.2| 56.2|      15.7|      15.5|
|04/06/2020|58.5|58.6|58.5| 58.5|      12.0|      11.7|
|04/07/2020|59.2|59.3|59.2| 59.2|      12.5|      15.8|
|04/08/2020|59.4|59.4|59.3| 59.4|      15.7|      15.5|
|04/09/2020|60.3|60.3|60.2| 60.3|      13.4|      16.5|
|04/13/2020|59.9|59.9|59.8| 59.9|      14.1|      14.5|
|04/14/2020|64.1|64.1|64.0| 64.1|      12.0|      14.8|
|04/15/2020|62.2|62.3|62.2| 62.2|      12.8|      18.4|
|04/16/2020|60.5|60.6|60.5| 60.5|      21.1|      21.7|
|04/17/2020|61.1|61.2|61.1| 61.1|      22.7|      33.2|
|04/20/2020|61.9|62.0|61.9| 61.9|      10.9|      12.0|
|04/21/2020|60.4|60.4|60.4| 60.4|      12.3|      16.9|
|04/22/2020|59.7|59.7|59.6| 59.7|      14.3|      15.5|
|04/23/2020|59.3|59.3|59.2| 59.3|      11.4|    

In [174]:
from pyspark.ml.linalg import Vector  
from pyspark.ml.feature import VectorAssembler

In [175]:
vector_assmebler=VectorAssembler(inputCols=['Negativity', 'Positivity'] 
                                ,outputCol='features')
df_transformed=vector_assmebler.transform(df)

df_transformed.show()

+----------+----+----+----+-----+----------+----------+--------------------+
|      Date|Open|High| Low|Close|Negativity|Positivity|            features|
+----------+----+----+----+-----+----------+----------+--------------------+
|04/02/2020|54.8|54.9|54.8| 54.8|      15.7|      15.3|[15.6999998092651...|
|04/03/2020|56.2|56.2|56.2| 56.2|      15.7|      15.5|[15.6999998092651...|
|04/06/2020|58.5|58.6|58.5| 58.5|      12.0|      11.7|[12.0,11.69999980...|
|04/07/2020|59.2|59.3|59.2| 59.2|      12.5|      15.8|[12.5,15.80000019...|
|04/08/2020|59.4|59.4|59.3| 59.4|      15.7|      15.5|[15.6999998092651...|
|04/09/2020|60.3|60.3|60.2| 60.3|      13.4|      16.5|[13.3999996185302...|
|04/13/2020|59.9|59.9|59.8| 59.9|      14.1|      14.5|[14.1000003814697...|
|04/14/2020|64.1|64.1|64.0| 64.1|      12.0|      14.8|[12.0,14.80000019...|
|04/15/2020|62.2|62.3|62.2| 62.2|      12.8|      18.4|[12.8000001907348...|
|04/16/2020|60.5|60.6|60.5| 60.5|      21.1|      21.7|[21.1000003814697...|

In [176]:
df_neg_model=df_transformed.select('features','Close')
df_neg_model.show( truncate= False)

+---------------------------------------+-----+
|features                               |Close|
+---------------------------------------+-----+
|[15.699999809265137,15.300000190734863]|54.8 |
|[15.699999809265137,15.5]              |56.2 |
|[12.0,11.699999809265137]              |58.5 |
|[12.5,15.800000190734863]              |59.2 |
|[15.699999809265137,15.5]              |59.4 |
|[13.399999618530273,16.5]              |60.3 |
|[14.100000381469727,14.5]              |59.9 |
|[12.0,14.800000190734863]              |64.1 |
|[12.800000190734863,18.399999618530273]|62.2 |
|[21.100000381469727,21.700000762939453]|60.5 |
|[22.700000762939453,33.20000076293945] |61.1 |
|[10.899999618530273,12.0]              |61.9 |
|[12.300000190734863,16.899999618530273]|60.4 |
|[14.300000190734863,15.5]              |59.7 |
|[11.399999618530273,14.300000190734863]|59.3 |
|[11.800000190734863,16.700000762939453]|58.6 |
|[9.100000381469727,10.5]               |59.2 |
|[13.600000381469727,13.5]              

In [177]:
(training, test) = df_neg_model.randomSplit([0.8, 0.2], seed = 1234)
print (training.first())

Row(features=DenseVector([3.7, 7.9]), Close=108.5)


In [178]:
#Linear regression model
from pyspark.ml.regression import LinearRegression
Linear_Regression=LinearRegression(labelCol='Close')
regression = Linear_Regression.fit(training)

In [179]:
print(f"intercept: {regression.intercept}")
print(f"coefficiencts: {regression.coefficients}")

intercept: 102.72730198947187
coefficiencts: [-1.7140186666955828,0.41044576882241146]


In [180]:
training_predictions=regression.evaluate(training)

print("Training R squared:", training_predictions.r2)

Training R squared: 0.0340722098977283


In [181]:
test_prediction  = regression.evaluate(test)

In [182]:
print("Testing MSE: ", test_prediction.meanSquaredError)
print("Root Mean Square Error (RMSE) using Linear Regression: ", test_prediction.rootMeanSquaredError)

Testing MSE:  465.2644826021804
Root Mean Square Error (RMSE) using Linear Regression:  21.569990324573176


In [183]:
#Predictions 
df_prediction = test_prediction.predictions
df_prediction.orderBy('prediction',ascending=False).show(10, truncate = False)

+--------------------------------------+-----+-----------------+
|features                              |Close|prediction       |
+--------------------------------------+-----+-----------------+
|[4.699999809265137,8.699999809265137] |105.5|98.2422926933944 |
|[5.900000095367432,7.599999904632568] |86.8 |95.73397949641354|
|[7.699999809265137,11.5]              |100.5|94.24948492429672|
|[9.800000190734863,19.100000381469727]|135.8|93.76943307001274|
|[8.0,10.0]                            |71.2 |93.11961034413132|
|[9.199999809265137,14.800000190734863]|46.2 |93.03292803965363|
|[8.0,9.699999809265137]               |138.8|92.99647653519828|
|[9.0,13.699999809265137]              |88.0 |92.92424094379234|
|[9.0,13.399999618530273]              |99.7 |92.80110713485931|
|[11.0,21.5]                           |99.3 |92.6976806855023 |
+--------------------------------------+-----+-----------------+
only showing top 10 rows



In [184]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'Close')
dt_model = dt.fit(training)
dt_predictions = dt_model.transform(test)
dt_evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) using Decision Tree Model = %g" % rmse)

Root Mean Squared Error (RMSE) using Decision Tree Model = 23.4461


In [186]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'Close', maxIter=10)
gbt_model = gbt.fit(training)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select('prediction', 'Close', 'features').show(5)

+------------------+-----+--------------------+
|        prediction|Close|            features|
+------------------+-----+--------------------+
| 93.83487936201642|105.5|[4.69999980926513...|
| 93.83487936201642| 86.8|[5.90000009536743...|
| 77.61572639544791|100.5|[7.69999980926513...|
| 93.10887762251447|108.6|[8.0,8.3999996185...|
|101.69677964351654|138.8|[8.0,9.6999998092...|
+------------------+-----+--------------------+
only showing top 5 rows



In [187]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) using Gradient Booting Decision Tree model = %g" % rmse)

Root Mean Squared Error (RMSE) using Gradient Booting Decision Tree model = 23.82
