### Installing Dependencies

**Step 1: Install Dependencies**
We need to install the following components to run pyspark seamlessly:
OpenJDK 8,
Spark Environment,
FindSpark package

In [None]:
# Install the headless OpenJDK 8 package; apt's stdout is discarded to keep the cell quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.2.0 (Hadoop 3.2 build) tarball from the Apache archive.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
# Unpack it into the current working directory.
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
# NOTE(review): findspark is installed without a version pin — consider pinning for reproducibility.
!pip install -q findspark

**Step 2: Add environment variables**
After installing the dependencies, we need to add some variables to the environment so that pyspark knows where to find them. We can do that using the following commands:

In [None]:
import os

# Tell Spark where the Java installation lives.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve SPARK_HOME to an absolute path: the Spark tarball was unpacked into
# the current working directory, and the notebook later changes directory with
# `%cd`, which would silently invalidate a relative path.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

**Step 3: Initialize pyspark**
Finally, we just need to initialize pyspark, which can easily be done using the third-party package findspark, as shown below:

In [None]:
import findspark
# Called without arguments, so it relies on the SPARK_HOME variable exported in the previous step.
findspark.init()

You can run the following commands to check whether pyspark is properly installed:

In [None]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session whose master URL is "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
# Bare expression so the notebook displays the SparkContext as the cell output.
sc

## Mounting Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so later cells can read the project files from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive;
# relative paths used by later cells (e.g. the saved .h5 models) resolve against it.
%cd /content/drive/MyDrive/cloud_computing/Project

/content/drive/MyDrive/cloud_computing/Project


In [None]:
import pandas as pd

# Read the merged tweets/stock-price CSV using pandas' pure-Python parser (engine='python').
merged_twts_stcks_final = pd.read_csv('/content/drive/MyDrive/cloud_computing/Project/Merged Data/merged_twts_stcks.csv', engine='python')

In [None]:
# (rows, columns) of the loaded frame
merged_twts_stcks_final.shape

(1809, 14)

In [None]:
# Count missing values per column
merged_twts_stcks_final.isna().sum()

date                             0
weighted_avg_sentiment_score     0
daywise_median_positive_score    0
daywise_median_negative_score    0
daywise_positive_tweet_count     0
daywise_negative_tweet_count     0
daywise_avg_replyCount           0
daywise_avg_retweetCount         0
daywise_avg_likeCount            0
daywise_avg_quoteCount           0
company                          0
stock_price                      0
next_day_stock_price             0
trend                            0
dtype: int64

## Checking for Data Imbalance

In [None]:
# Number of rows for each value of the `trend` label
merged_twts_stcks_final['trend'].value_counts()

0    950
1    859
Name: trend, dtype: int64

The data is almost balanced (950 rows with trend 0 vs. 859 rows with trend 1).

In [None]:
# Convert the pandas DataFrame to a PySpark DataFrame using the `spark` session created earlier.
merged_twts_stcks_final_df = spark.createDataFrame(merged_twts_stcks_final)

  for column, series in pdf.iteritems():


In [None]:
# Row count of the Spark DataFrame, as a check against the shape of the pandas frame it was built from.
merged_twts_stcks_final_df.count()

1809

## Traditional Machine Learning Classification Models

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, NaiveBayes, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Columns assembled into the model input vector. `date`,
# `next_day_stock_price` and the label `trend` are not included.
# NOTE(review): `company` is assembled as a numeric feature — this assumes it
# is already numerically encoded in the CSV; confirm.
FEATURE_COLUMNS = [
    'weighted_avg_sentiment_score',
    'daywise_median_positive_score',
    'daywise_median_negative_score',
    'daywise_positive_tweet_count',
    'daywise_negative_tweet_count',
    'daywise_avg_replyCount',
    'daywise_avg_retweetCount',
    'daywise_avg_likeCount',
    'daywise_avg_quoteCount',
    'company',
    'stock_price',
]

# Make the cell safe to re-run: this cell overwrites its own input frame, so on
# a second execution the output columns already exist and the transformers
# would refuse to write them. drop() silently ignores columns that are absent.
features_input_df = merged_twts_stcks_final_df.drop("unscaled_features", "features")

# Assemble the individual columns into a single vector column
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol="unscaled_features")
assembled_df = assembler.transform(features_input_df)

# Rescale the assembled features with MinMaxScaler so they are non-negative.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed in a later cell, so test-set minima/maxima influence the
# training features. Fitting on the training split only would avoid this.
scaler = MinMaxScaler(inputCol="unscaled_features", outputCol="features")
scaler_model = scaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df)

# Update the trend (label) column to be of double type
merged_twts_stcks_final_df = scaled_df.withColumn("trend", scaled_df["trend"].cast("double"))


In [None]:
# Randomly split the rows with weights 0.8 / 0.2; the fixed seed is passed so the split can be repeated.
train_data, test_data = merged_twts_stcks_final_df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Preview the first 10 rows of the training split
train_data.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+-----------+--------------------+-----+--------------------+--------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|company|stock_price|next_day_stock_price|trend|   unscaled_features|            features|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+-----------+--------------------+-----+-----------------

In [None]:
# Preview the first 10 rows of the test split
test_data.show(10)

+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+-----------------+--------------------+-----+--------------------+--------------------+
|      date|weighted_avg_sentiment_score|daywise_median_positive_score|daywise_median_negative_score|daywise_positive_tweet_count|daywise_negative_tweet_count|daywise_avg_replyCount|daywise_avg_retweetCount|daywise_avg_likeCount|daywise_avg_quoteCount|company|      stock_price|next_day_stock_price|trend|   unscaled_features|            features|
+----------+----------------------------+-----------------------------+-----------------------------+----------------------------+----------------------------+----------------------+------------------------+---------------------+----------------------+-------+-----------------+--------------------+-----

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Number of folds for k-fold cross-validation
num_folds = 4
# Seed shared by the tree-based estimators and the fold assignment so that
# re-running this cell reproduces the same models (same value as the seed used
# for the train/test split).
cv_seed = 42

# Logistic Regression (takes no seed parameter)
log_reg = LogisticRegression(featuresCol='features', labelCol='trend')
# Decision Tree
dt = DecisionTreeClassifier(featuresCol='features', labelCol='trend', seed=cv_seed)
# Random Forest
rf = RandomForestClassifier(featuresCol='features', labelCol='trend', seed=cv_seed)
# Gradient-Boosted Tree
gbt = GBTClassifier(featuresCol='features', labelCol='trend', seed=cv_seed)


# Hyper-parameter grid for each classifier (3 x 3 = 9 candidates each)
log_reg_param_grid = ParamGridBuilder() \
    .addGrid(log_reg.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(log_reg.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.minInstancesPerNode, [1, 2, 5]) \
    .build()

rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.numTrees, [10, 20, 50]) \
    .build()

gbt_param_grid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10, 15]) \
    .addGrid(gbt.stepSize, [0.01, 0.1, 0.5]) \
    .build()

# One evaluator shared by all cross-validators. "f1" is the evaluator's default
# metric; it is spelled out so the model-selection criterion is visible.
# NOTE(review): this is the evaluator's overall F1, whereas the evaluation cell
# below reports the F1 of label 1 only — the two numbers are not the same metric.
cv_evaluator = MulticlassClassificationEvaluator(labelCol='trend', metricName='f1')

# Set up cross-validator for each classifier
log_reg_cv = CrossValidator(estimator=log_reg, estimatorParamMaps=log_reg_param_grid, evaluator=cv_evaluator, numFolds=num_folds, seed=cv_seed)
dt_cv = CrossValidator(estimator=dt, estimatorParamMaps=dt_param_grid, evaluator=cv_evaluator, numFolds=num_folds, seed=cv_seed)
rf_cv = CrossValidator(estimator=rf, estimatorParamMaps=rf_param_grid, evaluator=cv_evaluator, numFolds=num_folds, seed=cv_seed)
gbt_cv = CrossValidator(estimator=gbt, estimatorParamMaps=gbt_param_grid, evaluator=cv_evaluator, numFolds=num_folds, seed=cv_seed)


# Fit the cross-validated models (each fit trains every grid candidate on every fold)
log_reg_cv_model = log_reg_cv.fit(train_data)
dt_cv_model = dt_cv.fit(train_data)
rf_cv_model = rf_cv.fit(train_data)
gbt_cv_model = gbt_cv.fit(train_data)


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

def evaluate_model(model, test_data, model_name):
    """Score a fitted model on `test_data` and print its confusion matrix and F1.

    Args:
        model: Fitted model exposing ``transform``; its output must contain a
            ``prediction`` column.
        test_data: Spark DataFrame holding the ``trend`` label column plus
            whatever input columns the model reads.
        model_name: Heading printed above the metrics.

    Returns:
        None. The results are printed only.
    """
    scored = model.transform(test_data)

    # Expose the label under the name "label" and cast both columns to float
    # before handing them to MulticlassMetrics.
    scored = scored.withColumn("label", scored["trend"].cast(FloatType()))
    scored = scored.withColumn("prediction", scored["prediction"].cast(FloatType()))

    # MulticlassMetrics consumes an RDD of (prediction, label) pairs
    prediction_label_pairs = scored.select("prediction", "label").rdd
    metrics = MulticlassMetrics(prediction_label_pairs)
    confusion = metrics.confusionMatrix().toArray()
    f1_for_label_1 = metrics.fMeasure(1.0)  # F-measure restricted to label 1.0

    print(f"{model_name}:")
    print("Confusion Matrix:")
    print(confusion)
    print(f"F1 Score: {f1_for_label_1}")
    print("\n")

# Evaluate the best model selected by each cross-validator on the test split
for cv_model, heading in [
    (log_reg_cv_model, "Logistic Regression K fold "),
    (dt_cv_model, "Decision Tree K fold"),
    (rf_cv_model, "Random Forest K fold"),
    (gbt_cv_model, "Gradient-Boosted Tree K fold"),
]:
    evaluate_model(cv_model.bestModel, test_data, heading)

Logistic Regression K fold :
Confusion Matrix:
[[152.  24.]
 [117.  35.]]
F1 Score: 0.3317535545023697


Decision Tree K fold:
Confusion Matrix:
[[88. 88.]
 [76. 76.]]
F1 Score: 0.48101265822784817


Random Forest K fold:
Confusion Matrix:
[[104.  72.]
 [ 93.  59.]]
F1 Score: 0.4169611307420495


Gradient-Boosted Tree K fold:
Confusion Matrix:
[[94. 82.]
 [69. 83.]]
F1 Score: 0.5236593059936909




In [None]:
# Baseline models: the same four estimators fitted once on the training split
# with no hyper-parameters set beyond the feature/label columns (no grid search).
# Note that this re-binds log_reg, dt, rf and gbt from the cross-validation cell.

# Logistic Regression
log_reg = LogisticRegression(featuresCol='features', labelCol='trend')
log_reg_model = log_reg.fit(train_data)

# Decision Tree
dt = DecisionTreeClassifier(featuresCol='features', labelCol='trend')
dt_model = dt.fit(train_data)

# Random Forest
rf = RandomForestClassifier(featuresCol='features', labelCol='trend')
rf_model = rf.fit(train_data)

# Gradient-Boosted Tree
gbt = GBTClassifier(featuresCol='features', labelCol='trend')
gbt_model = gbt.fit(train_data)

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

# NOTE(review): this re-defines the evaluate_model helper already defined in
# the cross-validation evaluation cell, with the same behavior.
def evaluate_model(model, test_data, model_name):
    """Score a fitted model on `test_data` and print its confusion matrix and F1.

    Args:
        model: Fitted model exposing ``transform``; its output must contain a
            ``prediction`` column.
        test_data: Spark DataFrame holding the ``trend`` label column plus
            whatever input columns the model reads.
        model_name: Heading printed above the metrics.

    Returns:
        None. The results are printed only.
    """
    scored = model.transform(test_data)

    # Expose the label under the name "label" and cast both columns to float
    # before handing them to MulticlassMetrics.
    scored = scored.withColumn("label", scored["trend"].cast(FloatType()))
    scored = scored.withColumn("prediction", scored["prediction"].cast(FloatType()))

    # MulticlassMetrics consumes an RDD of (prediction, label) pairs
    prediction_label_pairs = scored.select("prediction", "label").rdd
    metrics = MulticlassMetrics(prediction_label_pairs)
    confusion = metrics.confusionMatrix().toArray()
    f1_for_label_1 = metrics.fMeasure(1.0)  # F-measure restricted to label 1.0

    print(f"{model_name}:")
    print("Confusion Matrix:")
    print(confusion)
    print(f"F1 Score: {f1_for_label_1}")
    print("\n")

# Evaluate each baseline model on the test split.
# These must be the models fitted in the previous cell (dt_model, rf_model,
# gbt_model) — not the cross-validated *_cv_model objects, which would simply
# reproduce the k-fold results under the baseline headings.
evaluate_model(log_reg_model, test_data, "Logistic Regression")
evaluate_model(dt_model, test_data, "Decision Tree")
evaluate_model(rf_model, test_data, "Random Forest")
evaluate_model(gbt_model, test_data, "Gradient-Boosted Tree")

Logistic Regression:
Confusion Matrix:
[[150.  26.]
 [113.  39.]]
F1 Score: 0.35944700460829493


Decision Tree:
Confusion Matrix:
[[88. 88.]
 [76. 76.]]
F1 Score: 0.48101265822784817


Random Forest:
Confusion Matrix:
[[104.  72.]
 [ 93.  59.]]
F1 Score: 0.4169611307420495


Gradient-Boosted Tree:
Confusion Matrix:
[[94. 82.]
 [69. 83.]]
F1 Score: 0.5236593059936909




## Deep Learning Models for classification

In [None]:
! pip install horovod

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting horovod
  Downloading horovod-0.27.0.tar.gz (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: horovod
  Building wheel for horovod (setup.py) ... [?25l[?25hdone
  Created wheel for horovod: filename=horovod-0.27.0-cp39-cp39-linux_x86_64.whl size=41074742 sha256=0dc5774d009c44fdbb653cdf4be028e552dd9b57e8e26137d3f452631a44f2d2
  Stored in directory: /root/.cache/pip/wheels/13/26/fa/a124a538cf1fb851ccf00e0011eee9aa825ec79b6048238506
Successfully built horovod
Installing collected packages: horovod
Successfully installed horovod-0.27.0


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Length of the `features` vector, read from the first row of the training split.
feature_count = len(train_data.select("features").first().features)

## Feed Forward Neural Network

def create_FFN(input_dim=None):
    """Build an uncompiled feed-forward binary classifier.

    Args:
        input_dim: Number of input features per sample. When omitted, the
            notebook-level ``feature_count`` (computed from the training data
            earlier in this cell) is used.

    Returns:
        A Keras ``Sequential`` model with Dense layers of 128, 64 and 32 ReLU
        units followed by a single sigmoid output unit.
    """
    if input_dim is None:
        input_dim = feature_count
    # Use `models`, which this cell imports; the bare `keras` name is only
    # imported in a later cell, so relying on it here is order-dependent.
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model



# CNN model
def create_cnn_model(input_dim=None):
    """Build an uncompiled 1-D convolutional binary classifier.

    Args:
        input_dim: Number of input features per sample. When omitted, the
            notebook-level ``feature_count`` (computed from the training data
            earlier in this cell) is used.

    Returns:
        A Keras ``Sequential`` model: the flat feature vector is reshaped to
        ``(input_dim, 1)``, passed through Conv1D (32 filters, kernel size 1),
        max-pooling (pool size 2), a 50-unit ReLU Dense layer and a single
        sigmoid output unit.
    """
    if input_dim is None:
        input_dim = feature_count
    # Use `models`, which this cell imports; the bare `keras` name is only
    # imported in a later cell, so relying on it here is order-dependent.
    model = models.Sequential()
    # Conv1D needs a trailing channel axis, so add one to the flat input
    model.add(layers.Reshape((input_dim, 1), input_shape=(input_dim,)))
    model.add(layers.Conv1D(filters=32, kernel_size=1, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=2))
    model.add(layers.Flatten())
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

# LSTM model
def create_lstm_model(input_dim=None):
    """Build an uncompiled LSTM binary classifier.

    Args:
        input_dim: Number of input features per timestep. When omitted, the
            notebook-level ``feature_count`` (computed from the training data
            earlier in this cell) is used.

    Returns:
        A Keras ``Sequential`` model taking input of shape ``(1, input_dim)``
        (a single timestep): a 64-unit LSTM, a 32-unit ReLU Dense layer and a
        single sigmoid output unit.
    """
    if input_dim is None:
        input_dim = feature_count
    # Use `models`, which this cell imports; the bare `keras` name is only
    # imported in a later cell, so relying on it here is order-dependent.
    model = models.Sequential()
    model.add(layers.LSTM(64, activation='tanh', input_shape=(1, input_dim)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model



In [None]:
from sklearn.metrics import accuracy_score
import numpy as np
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers, models
import horovod.keras as hvd
from tensorflow.keras.callbacks import ModelCheckpoint

# Initialize Horovod; this runs before the hvd.size() / hvd.rank() calls used by the training cells below.
hvd.init()

def preprocess_data(data, feature_count):
    """Materialise a Spark DataFrame as NumPy feature and label arrays.

    Args:
        data: Spark DataFrame with a vector column ``features`` and a numeric
            label column ``trend``.
        feature_count: Expected length of every feature vector; the feature
            matrix is reshaped to ``(n_rows, feature_count)``.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a float array of shape
        ``(n_rows, feature_count)`` and ``y`` is a float array of shape
        ``(n_rows,)``.

    Raises:
        ValueError: If the collected vectors cannot be arranged into
            ``(n_rows, feature_count)``.
    """
    # Collect features and labels in a single pass so each label is guaranteed
    # to come from the same row as its feature vector.
    rows = data.select("features", "trend").collect()
    # toArray() yields a dense NumPy array for both dense and sparse vectors
    X = np.array([row["features"].toArray() for row in rows], dtype=float)
    X = X.reshape(len(rows), feature_count)
    y = np.array([row["trend"] for row in rows], dtype=float)
    return X, y


# Convert both Spark splits into NumPy feature/label arrays for the Keras models
train_data_preprocessed, train_labels = preprocess_data(train_data, feature_count)
test_data_preprocessed, test_labels = preprocess_data(test_data, feature_count)


In [None]:
# Training function for Horovod
def train_model(model, train_data, train_labels, test_data, test_labels, epochs=10, batch_size=32):
    """Compile `model` with a Horovod-wrapped Adam optimizer and fit it in place.

    Args:
        model: Uncompiled Keras model; it is compiled and trained in place.
        train_data: Training inputs passed to ``model.fit``.
        train_labels: Training targets passed to ``model.fit``.
        test_data: Accepted for interface symmetry; not used by this function.
        test_labels: Accepted for interface symmetry; not used by this function.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        None.
    """
    # The base learning rate of 0.001 is multiplied by the Horovod world size
    scaled_lr = 0.001 * hvd.size()
    dist_optimizer = hvd.DistributedOptimizer(
        tf.keras.optimizers.legacy.Adam(learning_rate=scaled_lr)
    )
    model.compile(loss='binary_crossentropy', optimizer=dist_optimizer, metrics=['accuracy'])

    # Only rank 0 prints training progress
    fit_verbosity = 1 if hvd.rank() == 0 else 0
    model.fit(
        train_data,
        train_labels,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[
            hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),
            hvd.callbacks.MetricAverageCallback(),
        ],
        verbose=fit_verbosity,
    )

## FFN Model Training

In [None]:
# Train and evaluate the feed-forward (FFN) model with Horovod
FFN_model = create_FFN()
train_model(FFN_model, train_data_preprocessed, train_labels, test_data_preprocessed, test_labels, epochs=20)

# Only rank 0 evaluates and reports the result
if hvd.rank() == 0:
    # The model is compiled with metrics=['accuracy'], so index 1 of evaluate()'s result is the accuracy
    FNN_accuracy = FFN_model.evaluate(test_data_preprocessed, test_labels, verbose=0)[1]
    print("FNN Model Accuracy: {:.2f}%".format(FNN_accuracy * 100))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
FNN Model Accuracy: 47.56%


In [None]:
# Prediction 

from sklearn.metrics import f1_score

# Predict the sigmoid outputs (probabilities) for the test dataset
test_predictions = FFN_model.predict(test_data_preprocessed)

# Convert the predicted probabilities to binary labels by rounding to the nearest integer
test_predicted_labels = np.round(test_predictions)

# Calculate the F1 score against the true test labels
f1 = f1_score(test_labels, test_predicted_labels)

print("F1 Score of the FFN model: ", f1)

F1 Score of the FFN model:  0.6055045871559632


In [None]:
# Save the trained model; the relative path resolves against the current working directory.
FFN_model.save('FFN_model.h5')

## CNN Model Training

In [None]:
# Train and evaluate CNN model with Horovod
cnn_model = create_cnn_model()
train_model(cnn_model, train_data_preprocessed, train_labels, test_data_preprocessed, test_labels, epochs=15, batch_size = 50)

# Only rank 0 evaluates and reports the result
if hvd.rank() == 0:
    # The model is compiled with metrics=['accuracy'], so index 1 of evaluate()'s result is the accuracy
    cnn_accuracy = cnn_model.evaluate(test_data_preprocessed, test_labels, verbose=0)[1]
    print("CNN Model Accuracy: {:.2f}%".format(cnn_accuracy * 100))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CNN Model Accuracy: 52.74%


In [None]:
# Prediction 

from sklearn.metrics import f1_score

# Predict the sigmoid outputs (probabilities) for the test dataset
test_predictions = cnn_model.predict(test_data_preprocessed)

# Convert the predicted probabilities to binary labels by rounding to the nearest integer
test_predicted_labels = np.round(test_predictions)

# Calculate the F1 score against the true test labels
f1 = f1_score(test_labels, test_predicted_labels)

print("F1 Score of the cnn_model: ", f1)

F1 Score of the cnn_model:  0.2857142857142857


In [None]:
# Save the trained model; the relative path resolves against the current working directory.
cnn_model.save('CNN_model.h5')

## Training LSTM model

In [None]:
# Train and evaluate LSTM model with Horovod

# Reshape the data for LSTM: it expects 3-D input, so each sample becomes a
# sequence with a single timestep. The feature width is taken from the array
# itself instead of being hard-coded.
n_features = train_data_preprocessed.shape[1]
train_data_reshaped = train_data_preprocessed.reshape(-1, 1, n_features)
test_data_reshaped = test_data_preprocessed.reshape(-1, 1, n_features)

lstm_model = create_lstm_model()
# Pass the reshaped test set so that both splits handed to train_model share
# the layout the LSTM expects.
train_model(lstm_model, train_data_reshaped, train_labels, test_data_reshaped, test_labels, epochs=40)

# Only rank 0 evaluates and reports the result
if hvd.rank() == 0:
    # The model is compiled with metrics=['accuracy'], so index 1 of evaluate()'s result is the accuracy
    lstm_accuracy = lstm_model.evaluate(test_data_reshaped, test_labels, verbose=0)[1]
    print("LSTM Model Accuracy: {:.2f}%".format(lstm_accuracy * 100))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
LSTM Model Accuracy: 52.74%


In [None]:
# Prediction 

from sklearn.metrics import f1_score

# Predict the sigmoid outputs (probabilities) for the reshaped test dataset
test_predictions = lstm_model.predict(test_data_reshaped)

# Convert the predicted probabilities to binary labels by rounding to the nearest integer
test_predicted_labels = np.round(test_predictions)

# Calculate the F1 score against the true test labels
f1 = f1_score(test_labels, test_predicted_labels)

print("F1 Score of the LSTM model: ", f1)

F1 Score of the LSTM model:  0.42804428044280446


In [None]:
# Save the trained model; the relative path resolves against the current working directory.
lstm_model.save('LSTM_model.h5')