# **Dataset Preparation**

In [None]:
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Dataset obtained from Kaggle
# https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news

# The CSV is read without a header row, so the two columns are named explicitly.
path = 'all-data.csv'
column_names = ['label', 'text']
df = pd.read_csv(path, header=None, names=column_names, encoding='latin-1')
data = df

# Pull both columns out as plain Python lists (label first, then text).
labels, sentences = (data[column].tolist() for column in column_names)

In [None]:
# Split the data

# Convert labels to numeric: 0 for 'negative', 1 for every other label.
# The encoding reads from data['label'] instead of from `labels` itself, so re-running
# this cell cannot re-encode already-numeric values (which would turn every label into 1).
# NOTE(review): any class other than 'negative' (e.g. a neutral class, if the CSV has one)
# is also encoded as 1 and is later reported as "positive" - confirm this is intended.
labels = [0 if label == 'negative' else 1 for label in data['label']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

In [None]:
# Tokenize and Encode

# Pretrained tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with identical settings: truncate over-long inputs,
# pad within the split, and return PyTorch tensors.
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(train_sentences, **tokenizer_kwargs)
test_encodings = tokenizer(test_sentences, **tokenizer_kwargs)

In [None]:
# Convert to TensorDataset
# Each dataset item is a (input_ids, attention_mask, label) triple.
train_label_tensor = torch.tensor(train_labels)
test_label_tensor = torch.tensor(test_labels)

train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_label_tensor,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_label_tensor,
)

# **Model Training**

In [None]:
# Data Loaders
batch_size = 16

# Training batches are drawn in random order; test batches keep the dataset order.
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Load BERT model for sequence classification

# Seed torch before the model is built: the classifier head is newly initialised at random,
# and batch shuffling / dropout also draw from torch's RNG. 42 matches the split's random_state.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) #pos, neg

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and weight_decay are
# passed explicitly because the torch defaults (1e-8, 0.01) differ from the values the
# transformers implementation used by default (1e-6, 0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training

# Prefer the GPU when one is visible; `device` is reused by the evaluation and inference cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 5

for epoch in range(epochs):
    model.train()  # training mode (enables dropout)
    running_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Batch-specific name: unpacking into `labels` would overwrite the notebook-level
        # `labels` list with a tensor and break re-running earlier cells.
        input_ids, attention_mask, batch_labels = [tensor.to(device) for tensor in batch]

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report progress once per epoch so a diverging run is visible.
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f'Epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}')


In [None]:
# Model Evaluation

model.eval()  # inference mode (disables dropout)

# Per-batch arrays of raw logits and of the matching ground-truth labels.
predictions, true_labels = [], []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    # Batch-specific name: unpacking into `labels` would overwrite the notebook-level
    # `labels` list with a tensor and break re-running earlier cells.
    input_ids, attention_mask, batch_labels = batch

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

# **Results**

In [None]:
# Calculate Metrics

# Stack the per-batch arrays under new names. Overwriting `predictions` / `true_labels`
# in place would make this cell fail when re-run, because concatenating the already
# stacked array flattens it and argmax(axis=1) then has no second axis.
all_logits = np.concatenate(predictions, axis=0)
all_labels = np.concatenate(true_labels, axis=0)

# Predicted class = index of the largest logit in each row.
preds_flat = np.argmax(all_logits, axis=1).flatten()
labels_flat = all_labels.flatten()

accuracy = accuracy_score(labels_flat, preds_flat)
# average='binary' reports precision/recall/F1 for the class encoded as 1.
precision, recall, f1, _ = precision_recall_fscore_support(labels_flat, preds_flat, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9587628865979382
Precision: 0.9648526077097506
Recall: 0.9895348837209302
F1 Score: 0.9770378874856488


# **Testing on Examples**

In [None]:
import requests
from bs4 import BeautifulSoup
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Function to scrape and process text from a URL

def scrape_text_from_url(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on an unresponsive host.

    Returns:
        One string containing the text of every <p> element on the page,
        joined by single spaces (an empty string when the page has none).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never silently treated as article text.
        requests.RequestException: For connection failures and timeouts raised
            by requests.get.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Assumes the main content is in <p> tags; the right selector may vary by website.
    paragraphs = soup.find_all('p')
    return ' '.join(para.get_text() for para in paragraphs)


In [None]:
# Article URLs that are scraped and classified in the cells below.
urls = [
    'https://finance.yahoo.com/news/google-is-deleting-old-gmail-accounts-heres-how-to-save-yours-192619365.html',
    'https://finance.yahoo.com/news/1-canada-google-reach-deal-173201247.html',
    'https://www.cnbc.com/2023/12/12/us-pension-funds-heavily-invested-in-china-despite-crackdown.html',
    'https://www.fnlondon.com/articles/banks-stalling-pay-overhaul-after-bonus-cap-scrap-no-one-wants-to-be-an-outlier-20231211',
    'https://www.goldmansachs.com/intelligence/pages/the-global-economy-will-perform-better-than-many-expect-in-2024.html',
]

In [None]:
# Fetch the text of every article in `urls`
texts = list(map(scrape_text_from_url, urls))

# Tokenize and encode the text
# truncation=True shortens any text that exceeds the tokenizer's maximum length.
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

# No labels here: each dataset item is an (input_ids, attention_mask) pair.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])

# Batch size of 1 for individual processing
dataloader = DataLoader(dataset, batch_size=1)

In [None]:
# Predict sentiment
# Move the model to the target device and switch it to inference mode.
# Both calls return the model, so the cell still displays its repr.
model.to(device).eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Results

# One predicted class index per article, in the same order as `urls`.
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in dataloader:
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        # Each batch holds a single article, so take element 0 of the argmax result.
        prediction = torch.argmax(logits, dim=1).cpu().numpy()[0]
        predictions.append(prediction)

# Map numeric predictions back to sentiment labels
predicted_labels = []
for prediction in predictions:
    predicted_labels.append('positive' if prediction == 1 else 'negative')

for url, sentiment in zip(urls, predicted_labels):
    print(f"URL: {url}\nPredicted Sentiment: {sentiment}\n")

URL: https://finance.yahoo.com/news/google-is-deleting-old-gmail-accounts-heres-how-to-save-yours-192619365.html
Predicted Sentiment: positive

URL: https://finance.yahoo.com/news/1-canada-google-reach-deal-173201247.html
Predicted Sentiment: positive

URL: https://www.cnbc.com/2023/12/12/us-pension-funds-heavily-invested-in-china-despite-crackdown.html
Predicted Sentiment: negative

URL: https://www.fnlondon.com/articles/banks-stalling-pay-overhaul-after-bonus-cap-scrap-no-one-wants-to-be-an-outlier-20231211
Predicted Sentiment: positive

URL: https://www.goldmansachs.com/intelligence/pages/the-global-economy-will-perform-better-than-many-expect-in-2024.html
Predicted Sentiment: positive

