# Train Model

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.ml.regression import *
from pyspark.ml.feature import VectorAssembler
import pandas as pd

# Create the Spark session for this notebook, or reuse one that is already
# running, registered under the application name below.
appName = "Training"
spark = (
    SparkSession.builder
    .appName(appName)
    .getOrCreate()
)

In [None]:
# Load the training CSV; the first row holds the column names and the column
# types are inferred from the data.
dataset = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../datasets/training_integrated_data.csv")
)
# Preview the first three rows.
dataset.show(n=3)

In [None]:
# Randomly split the rows 70% / 30% into training and testing sets.
# The fixed seed (24) keeps the split reproducible between runs.
dividedData = dataset.randomSplit([0.7, 0.3], seed=24)
trainingData, testingData = dividedData

print(
    "Training data rows:", trainingData.count(),
    "; Testing data rows:", testingData.count(),
)

In [None]:
# Name of the column the regressor is trained to predict.
chose_column = "valence"

# Packs the numeric input columns into the single vector column "features"
# that Spark ML estimators consume.
feature_assembler = VectorAssembler(
    inputCols=[
        "index",
        "cases",
        "newCases",
        "deaths",
        "positive",
        "negative",
    ],
    outputCol="features",
)

def Assemble(dataset, label: str):
    """Build the two-column frame (``features``, ``label``) used for training.

    The module-level ``feature_assembler`` is applied to ``dataset`` to add
    the ``features`` vector column, and the column named by ``label`` is
    exposed under the fixed name ``"label"``.

    Args:
        dataset: Spark DataFrame containing every input column of
            ``feature_assembler`` plus the ``label`` column.
        label: Name of the column to use as the regression target.

    Returns:
        A DataFrame with exactly two columns: ``features`` and ``label``.
    """
    featured = feature_assembler.transform(dataset)
    # Use the caller-supplied column name; the previous version ignored the
    # parameter and always read the global ``chose_column``.
    return featured.select("features", col(label).alias("label"))

# Gradient-boosted tree regressor, limited to 20 boosting iterations.
algorithm = GBTRegressor(maxIter=20, featuresCol="features", labelCol="label")

# Reduce both splits to the (features, label) layout the regressor expects.
trainingDataFinal = Assemble(trainingData, chose_column)
testingDataFinal = Assemble(testingData, chose_column)

# Fit on the training split only; the testing split is kept for evaluation.
model = algorithm.fit(trainingDataFinal)

In [None]:
#predict testing data using our model
prediction = model.transform(testingDataFinal)
#show some prediction results
prediction.show(10)

In [None]:
#import evaluator module for regression
from pyspark.ml.evaluation import RegressionEvaluator

#define our evaluator
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
#calculate RMSE of our trained model
rmse = evaluator.evaluate(prediction)
print ("Root Mean Square Error (RMSE):", rmse)

In [None]:
# Load the epidemic forecast data (header row, inferred types), discard the
# "date" column, and shift "index" up by 580.
# NOTE(review): the 580 offset presumably continues the index range of the
# training data - confirm against the training set.
pred_dataset = (
    spark.read
    .csv("../datasets/epidemic_pred.csv", header=True, inferSchema=True)
    .drop("date")
    .withColumn("index", col("index") + 580)
)
# Preview the first three rows.
pred_dataset.show(n=3)

In [None]:
# Vectorise the forecast rows with the same assembler used for training and
# keep only the resulting "features" column.
pred_data = (
    feature_assembler
    .transform(pred_dataset)
    .select("features")
)
# Preview the first three feature vectors.
pred_data.show(n=3)

In [None]:
import numpy as np

# Run the trained model on the forecast features and bring the predictions
# back to the driver as a 1-D float64 array.
# The array is built from the "prediction" field of each Row rather than
# coercing the Row list with the np.float64 scalar constructor and slicing
# [:, 0]; that form raised IndexError when the result was empty.
predicted_sentiment = np.array(
    [
        row["prediction"]
        for row in model.transform(pred_data).select("prediction").collect()
    ],
    dtype=np.float64,
)

In [None]:
import matplotlib.pyplot as plt

# On-screen figures render at 600 dpi; saved figures at 300 dpi.
plt.rcParams['figure.dpi'] = 600
plt.rcParams['savefig.dpi'] = 300

fig, ax = plt.subplots()
# Axes.plot_date is deprecated; the documented replacement is to switch the
# axis to date units and then call plot(), which keeps the same rendering.
# NOTE(review): the x values are plain row positions, so the axis shows them
# as days counted from Matplotlib's date epoch - confirm whether real dates
# were intended here.
ax.xaxis.axis_date()
ax.plot(range(len(predicted_sentiment)), predicted_sentiment, marker='', linestyle='-')
ax.set_aspect('auto')
# Rotate and align the date tick labels so they do not overlap.
fig.autofmt_xdate()

plt.show()