In [1]:
import os
import pandas as pd
!pip install textblob
from textblob import TextBlob
import json
!pip install torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Function to get all of the stock price data
def get_stock_prices(stocks_folder_path):
    stock_dfs = []

    # Iterate through each CSV file in the folder
    for filename in os.listdir(stocks_folder_path):
        if filename.endswith(".csv"):

            # Read the CSV file and append to the list
            stock_df = pd.read_csv(os.path.join(stocks_folder_path, filename))

            #Add name of the stock
            stock_name = filename[:-4]
            stock_df['Stock'] = stock_name

            stock_dfs.append(stock_df)

    # Concatenate all DataFrames into a single DataFrame
    all_stocks_df = pd.concat(stock_dfs, ignore_index=True)
    return all_stocks_df


In [3]:
# Function to preprocess stock price data
def preprocess_stock_prices(stock_price_df):
    stock_price = stock_price_df[['Stock', 'Date', 'Open', 'Close', 'Adj Close']].copy()
    return stock_price

In [4]:

# Function to preprocess tweet data
def preprocess_tweets(tweets_folder_path):
    processed_tweets = []
    for root, dirs, files in os.walk(tweets_folder_path):
        for file in files:
            stock_name = os.path.basename(root)
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    for line in f:
                        try:
                            tweet = json.loads(line)
                            processed_tweet = {
                                "Stock": stock_name,
                                "Date": tweet.get("created_at", ""),
                                "text": tweet.get("text", ""),
                                "user": tweet.get("user", {}).get("screen_name", ""),
                                "lang": tweet.get("lang", ""),
                                "sentiment": get_sentiment(tweet["text"])
                            }

                            processed_tweets.append(processed_tweet)
                        except json.JSONDecodeError:
                            # Handle invalid JSON lines
                            print(f"Invalid JSON in file: {file_path}")
                            continue

    tweets_df = pd.DataFrame(processed_tweets)

    # Convert the 'datetime' column to datetime format
    tweets_df['Date'] = pd.to_datetime(tweets_df['Date'])

    # Create separate 'date' and 'time' columns
    tweets_df['Time'] = tweets_df['Date'].dt.time
    tweets_df['Date'] = tweets_df['Date'].dt.date


    return tweets_df


In [5]:
# Function to perform sentiment analysis on tweet text
def get_sentiment(tweet):
    analysis = TextBlob(tweet)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

In [6]:
# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])
        return output


In [7]:
# Define Gru model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        gru_out, _ = self.gru(x)
        output = self.fc(gru_out[:, -1, :])  # Get the last time step output
        return output

In [8]:
# Define Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_heads, num_encoder_layers, num_decoder_layers, output_size):
        super(TransformerModel, self).__init__()
        self.transformer = nn.Transformer(d_model=hidden_size, nhead=num_heads, num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers, batch_first=True)
        self.fc_in = nn.Linear(input_size, hidden_size)  # Adjust input size to hidden size
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, src):
        src = self.fc_in(src)
        output = self.transformer(src, src)  # For simplicity, using src as both src and tgt
        output = self.fc_out(output[:, -1, :])  # Output of the last token
        return output

In [9]:
# Define function to train LSTM model
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            try:

                optimizer.zero_grad()
                output = model(inputs)

                # Flatten labels to ensure they have the correct shape
                labels = labels.view(-1, 1)

                loss = criterion(output, labels)

                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            except Exception as e:
                print(f'Error in batch {i+1} of epoch {epoch+1}: {e}')
                raise e  # Reraise the exception after printing to stop execution

        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, running_loss / len(train_loader)))


In [25]:
# Define function to evaluate model
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predictions.extend(outputs.detach().numpy())
            actuals.extend(labels.numpy())
    return np.array(predictions), np.array(actuals)


In [11]:
# Define paths
stocks_folder_path = "price/raw/"
tweets_folder_path = "tweet/raw/"

stock_prices = get_stock_prices(stocks_folder_path)

# Preprocess data
stock_prices = preprocess_stock_prices(stock_prices)
tweet_data = preprocess_tweets(tweets_folder_path)

  tweets_df['Date'] = pd.to_datetime(tweets_df['Date'])


In [12]:
# Merge tweet data with stock price data based on Stock name
stock_prices['Date'] = pd.to_datetime(stock_prices['Date'])
tweet_data['Date'] = pd.to_datetime(tweet_data['Date'])
combined_data = pd.merge(stock_prices, tweet_data, on=['Stock','Date'], how='inner')

In [13]:
X = combined_data[['sentiment', 'Open', 'Adj Close']]  # Include other features along with sentiment
y = combined_data['Close']

In [14]:

# Convert categorical sentiment labels to numerical values using one-hot encoding
X = pd.get_dummies(X, columns=['sentiment','Adj Close', 'Open'], drop_first=True)


In [15]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Reshape data to add sequence dimension
sequence_length = 10  # Example sequence length
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        seq = data[i:i + seq_length]
        sequences.append(seq)
    return np.array(sequences)

In [17]:
X_train_seq = create_sequences(X_train.values, sequence_length)
y_train_seq = y_train.values[sequence_length:]
X_test_seq = create_sequences(X_test.values, sequence_length)
y_test_seq = y_test.values[sequence_length:]

In [18]:
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_seq.reshape(-1, 1), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_seq.reshape(-1, 1), dtype=torch.float32)


In [19]:
# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [20]:
# Initialize LSTM model
input_size = X_train_seq.shape[2]
hidden_size = 64
num_layers = 2
output_size = 1
lstm_model = LSTMModel(input_size, hidden_size, num_layers, output_size)

In [21]:
# Initialize Gru model
input_size = X_train_seq.shape[2]
hidden_size = 64
num_layers = 2
output_size = 1
gru_model = GRUModel(input_size, hidden_size, num_layers, output_size)

In [22]:
# Initialize Transformer model
input_size = X_train_seq.shape[2]
hidden_size = 64
num_headers = 4
num_encoder_layers = 2
num_decoder_layers = 2
output_size = 1
transformer_model = TransformerModel(input_size, hidden_size, num_headers, num_encoder_layers, num_decoder_layers, output_size)

In [23]:
# Define loss function and optimizer
criterion = nn.MSELoss()
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
transformer_optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)

threshold = 0.5

In [27]:

# Train model
train_model(lstm_model, train_loader, criterion, lstm_optimizer, num_epochs=10)

# Evaluate model
evaluate_model(lstm_model, test_loader)

predictions, actuals = evaluate_model(lstm_model, test_loader)
predictions_class = (predictions > threshold).astype(int)
actuals_binary = (actuals > 0).astype(int)
accuracy = accuracy_score(actuals_binary, predictions_class)
precision = precision_score(actuals_binary, predictions_class)
recall = recall_score(actuals_binary, predictions_class)
print("LSTM Model:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Epoch [1/10], Loss: 11434363.2202
Epoch [2/10], Loss: 11432437.5950
Epoch [3/10], Loss: 11430331.9378
Epoch [4/10], Loss: 11428285.1878
Epoch [5/10], Loss: 11426287.5385
Epoch [6/10], Loss: 11424458.0627
Epoch [7/10], Loss: 11422533.5552
Epoch [8/10], Loss: 11420961.2485
Epoch [9/10], Loss: 11419151.0085
Epoch [10/10], Loss: 11417375.4761
LSTM Model:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [30]:
# Train model
train_model(gru_model, train_loader, criterion, gru_optimizer, num_epochs=10)

# Evaluate model
evaluate_model(gru_model, test_loader)

# GRU Model Evaluation
predictions, actuals = evaluate_model(gru_model, test_loader)
predictions_class = (predictions > threshold).astype(int)
actuals_binary = (actuals > 0).astype(int)
accuracy = accuracy_score(actuals_binary, predictions_class)
precision = precision_score(actuals_binary, predictions_class)
recall = recall_score(actuals_binary, predictions_class)
print("GRU Model:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Epoch [1/10], Loss: 11459132.5494
Epoch [2/10], Loss: 11455969.8388
Epoch [3/10], Loss: 11453247.1326
Epoch [4/10], Loss: 11450561.7688
Epoch [5/10], Loss: 11447612.9988
Epoch [6/10], Loss: 11445147.0122
Epoch [7/10], Loss: 11442612.8270
Epoch [8/10], Loss: 11440519.3640
Epoch [9/10], Loss: 11438273.5848
Epoch [10/10], Loss: 11436262.0297
GRU Model:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [31]:
# Train model
train_model(transformer_model, train_loader, criterion, transformer_optimizer, num_epochs=10)

# Evaluate model
evaluate_model(transformer_model, test_loader)

predictions, actuals = evaluate_model(transformer_model, test_loader)
predictions_class = (predictions > threshold).astype(int)
actuals_binary = (actuals > 0).astype(int)
accuracy = accuracy_score(actuals_binary, predictions_class)
precision = precision_score(actuals_binary, predictions_class)
recall = recall_score(actuals_binary, predictions_class)
print("Transformer Model:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Epoch [1/10], Loss: 11475297.3849
Epoch [2/10], Loss: 11474380.7110
Epoch [3/10], Loss: 11472495.1914
Epoch [4/10], Loss: 11473241.1761
Epoch [5/10], Loss: 11473160.5497
Epoch [6/10], Loss: 11473071.7157
Epoch [7/10], Loss: 11473094.4728
Epoch [8/10], Loss: 11472898.8207
Epoch [9/10], Loss: 11472698.1195
Epoch [10/10], Loss: 11472458.0236
Transformer Model:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
