# GitHub repository

https://github.com/noong99/stats507-coursework/tree/main/Project

## Dataset
Datasets: finance-financialmodelingprep-stock-news-sentiments-rss-feed  
https://huggingface.co/datasets/NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed

In [None]:
import pandas as pd
import scipy
import seaborn as sns
import matplotlib.pyplot as plt

from datasets import load_dataset

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import  numpy as np
# from pytorch_pretrained_bert import BertTokenizr
# from bertModel import BertClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from transformers import AutoTokenizer

## 1. Data Preparation

In [None]:
# Relative path of the CSV file to load (resolved against the current working directory).
file_path = '../data/news_data_sampled.csv'
# Read the CSV into a DataFrame; later cells use its Title_Text, sentiment and
# sentimentScore columns.
df = pd.read_csv(file_path)

## 2. LSTM using BERT embedding

### 2-1. Data Preparation with LSTM Model

Split the data into training, validation, and test sets.  
Use the training set to fit the model, the validation set to monitor the model's performance during training, and the test set to measure how the model performs on unseen data.

In [None]:
# Use Title_Text as the model input, sentiment as the class label, and
# sentimentScore as an additional numeric feature.
texts = df['Title_Text'].values
labels = df['sentiment'].values
scores = df['sentimentScore'].values

# Split the data into train : validation : test = 6 : 2 : 2, stratified on the label.
# Step 1: 60% train, 40% held out.
xtrain, xtemp, ytrain, ytemp, scores_train, scores_temp = train_test_split(
    texts, labels, scores, test_size=0.4, random_state=129, stratify=labels
)
# Step 2: halve the held-out part. For every input array train_test_split returns
# the pair (first part, second part), so all three pairs must be unpacked in the
# same order. Here the first part is the test split and the second the validation
# split; unpacking the scores the other way round would attach the test rows'
# scores to the validation texts and vice versa.
xtest, xvalid, ytest, yvalid, scores_test, scores_valid = train_test_split(
    xtemp, ytemp, scores_temp, test_size=0.5, random_state=129, stratify=ytemp
)

# Report how many samples ended up in each split.
print(f"Train size: {len(xtrain)}")
print(f"Validation size: {len(xvalid)}")
print(f"Test size: {len(xtest)}")

Train size: 2400
Validation size: 800
Train size: 800


### 2-2. PyTorch Dataset and DataLoader

In [None]:

class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label and score.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of class labels; each one is converted to a
            ``torch.long`` tensor, so it must be numeric.
        scores: Indexable collection of sentiment scores; each one is converted to
            a ``torch.float32`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")`` that
            returns a mapping with "input_ids" and "attention_mask" tensors.
        max_len: Length every text is truncated/padded to.
    """

    def __init__(self, texts, labels, scores, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.x = texts  # input texts
        self.y = labels  # sentiment labels
        self.scores = scores  # sentiment scores

    def __len__(self):
        """Return the number of texts."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the tokenized text, label and score stored at ``index``."""
        text = self.x[index]
        target = self.y[index]
        sentiment_score = self.scores[index]

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading dimension of the tokenizer output
        # (presumably the size-1 batch dimension added by return_tensors="pt").
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(target, dtype=torch.long)
        item["score"] = torch.tensor(sentiment_score, dtype=torch.float32)
        return item

# Tokenizer matching the BERT checkpoint used by the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# One dataset per split; each pairs the texts with their labels and scores.
train_dataset = SentimentDataset(
    texts=xtrain, labels=ytrain, scores=scores_train, tokenizer=tokenizer
)
valid_dataset = SentimentDataset(
    texts=xvalid, labels=yvalid, scores=scores_valid, tokenizer=tokenizer
)
test_dataset = SentimentDataset(
    texts=xtest, labels=ytest, scores=scores_test, tokenizer=tokenizer
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)


### 2-3. Define Model

Define an LSTM model that runs on top of BERT embeddings.

In [None]:
from transformers import AutoModel

class LSTMWithBERT(nn.Module):
    """BERT encoder followed by an LSTM and a linear classifier.

    The last hidden state of the LSTM is concatenated with the per-sample
    sentiment score and projected to ``output_dim`` logits.

    NOTE(review): the sentiment score is used as an input feature. If the
    sentiment label is derived from that score, this leaks the target into
    the model -- confirm against the dataset.

    Args:
        bert_model_name: Name or path passed to ``AutoModel.from_pretrained``.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Number of output logits (classes).
    """

    def __init__(self, bert_model_name, hidden_dim, output_dim):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim + 1, output_dim)  # +1 for sentimentScore

    def forward(self, input_ids, attention_mask, scores):
        """Return class logits of shape (batch_size, output_dim).

        Args:
            input_ids: Token ids, (batch_size, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch_size, seq_len).
                Assumed to be right-padded (real tokens first), which is what
                packing by length requires.
            scores: Sentiment scores, (batch_size,).
        """
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = bert_output.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Pack the sequences so the LSTM stops at each sample's last real token.
        # Without packing, the final hidden state is taken after the LSTM has
        # also consumed every padding position, so it depends on the padding.
        # pack_padded_sequence needs the lengths as an int64 tensor on the CPU;
        # clamp guards against an all-zero mask row, which packing rejects.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            sequence_output, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden is (num_layers, batch_size, hidden_dim); take the last layer.
        last_hidden = hidden[-1]

        # Append the sentiment score as one extra feature column.
        score_column = scores.unsqueeze(1).to(last_hidden.dtype)
        hidden_with_score = torch.cat((last_hidden, score_column), dim=1)
        logits = self.fc(hidden_with_score)
        return logits


In [None]:
# Instantiate the model.
bert_model_name = "bert-base-uncased"
hidden_dim = 128
# One output logit per distinct value found in the label array.
output_dim = len(set(labels))

model_lstmbert = LSTMWithBERT(
    bert_model_name=bert_model_name,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
# Trailing expression: displays the module structure as the cell output.
model_lstmbert

LSTMWithBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

### 2-4. Fine-tuning

In [None]:
# Mark every BERT weight as trainable so the encoder is fine-tuned together
# with the LSTM and the classifier head.
for weight in model_lstmbert.bert.parameters():
    weight.requires_grad_(True)

Fine-tuning often benefits from a learning-rate scheduler, which can improve convergence and help reduce overfitting.  
So let's use a scheduler, with cross-entropy loss as the criterion and AdamW as the optimizer.

In [None]:
# Loss function, optimizer and scheduler.
# Cross-entropy is the classification loss; AdamW is used instead of Adam
# to help prevent overfitting.
criterion = nn.CrossEntropyLoss()

# Optimize only the parameters that require gradients, with a learning rate
# commonly used for BERT fine-tuning.
optimizer = torch.optim.AdamW(
    (p for p in model_lstmbert.parameters() if p.requires_grad),
    lr=2e-5,
)

# Multiply the learning rate by 0.1 after every 2 scheduler steps
# (the scheduler is stepped once per epoch by train_epoch).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=2)

Use the same training and evaluation functions, but pass attention_mask as an additional input.

In [None]:
# Model Training
def train_epoch(model, data_loader, optimizer, criterion, scheduler=None, max_grad_norm=1.0):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, scores)``.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            "input_ids", "attention_mask", "label" and "score".
        optimizer: Optimizer updating the model parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        scheduler: Optional learning-rate scheduler, stepped once at the end of
            the epoch.
        max_grad_norm: Maximum gradient norm used for gradient clipping.

    Returns:
        Mean of the per-batch loss values (``nan`` if ``data_loader`` is empty).
    """
    model.train()

    # Run every batch on the device the model lives on, so the function also
    # works when the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    losses = []
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        scores = batch["score"].to(device)

        outputs = model(input_ids, attention_mask, scores)
        loss = criterion(outputs, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()

    # Scheduler step (if provided): update the learning rate after each epoch
    if scheduler is not None:
        scheduler.step()

    return np.mean(losses)

# Model evaluation
def evaluate_model(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask, scores)``.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            "input_ids", "attention_mask", "label" and "score".
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses and the
        accuracy of the argmax predictions over all samples.
    """
    model.eval()

    # Run every batch on the device the model lives on, so the function also
    # works when the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    losses = []
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            scores = batch["score"].to(device)

            outputs = model(input_ids, attention_mask, scores)
            loss = criterion(outputs, labels)
            losses.append(loss.item())

            # The predicted class is the index of the largest logit; results are
            # moved back to the CPU before being converted to NumPy.
            predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return np.mean(losses), accuracy_score(true_labels, predictions)


### 2-5. Model training and evaluation

In [None]:
# Train for 5 epochs (a commonly used value) and report the validation
# loss and accuracy after every epoch.
for epoch in range(5):
    train_loss = train_epoch(
        model_lstmbert,
        train_loader,
        optimizer,
        criterion,
        scheduler=scheduler,
    )
    val_loss, val_acc = evaluate_model(
        model_lstmbert,
        valid_loader,
        criterion,
    )
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Epoch 1, Train Loss: 0.3619, Val Loss: 0.3704, Val Acc: 0.8925
Epoch 2, Train Loss: 0.3554, Val Loss: 0.3652, Val Acc: 0.8925
Epoch 3, Train Loss: 0.3516, Val Loss: 0.3648, Val Acc: 0.8925
Epoch 4, Train Loss: 0.3492, Val Loss: 0.3644, Val Acc: 0.8925
Epoch 5, Train Loss: 0.3492, Val Loss: 0.3644, Val Acc: 0.8925


In [None]:
# Final evaluation: compute loss and accuracy of the trained model on the
# held-out test loader and print them.
test_loss, test_acc = evaluate_model(model_lstmbert, test_loader, criterion)
print(f"Fine-tuned BERT-LSTM Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Fine-tuned BERT-LSTM Test Loss: 0.3687, Test Accuracy: 0.8912
