In [1]:
# imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import re

In [None]:
# load data set
df = pd.read_csv("data.csv", encoding='latin-1') 
df.columns = ['label', 'text']
print(df.head())
print(df.columns)

      label                                               text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...
Index(['label', 'text'], dtype='object')


: 

In [3]:
# clean text
def clean_text(text):
    text = text.lower() # convert string to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove links
    text = re.sub(r'\@w+|\#','', text)  # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove numbers and punctuation
    return text
df['text'] = df['text'].astype(str).apply(clean_text) # updates text column to string and cleans

# encode labels
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])  

print("Label classes:", encoder.classes_)

Label classes: ['negative' 'neutral' 'positive']


In [4]:
# train/tests split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)
# here were training it so it can predict what label each text goes under
print("Unique labels:", sorted(set(train_labels)))


Unique labels: [0, 1, 2]


In [5]:
# create data set class
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from collections import Counter
import pickle


# splits text at any whitespace
def tokenizer(text):
    return text.split()

# goes through all of the text and yeilds the tokens
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)


def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        counter.update(tokenizer(text))
    # start indices at 2 so we can reserve 0 for PAD and 1 for UNK
    vocab = {word: i+2 for i, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab["<pad>"] = 0
    vocab["<unk>"] = 1
    return vocab

vocab_dict = build_vocab(train_texts, min_freq=2)  
print("Vocab size:", len(vocab_dict))

def numericalize(text, vocab):
    tokens = tokenizer(text)  # split into words
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]



with open("vocab_dict.pkl", "wb") as f:
    pickle.dump(vocab_dict, f)



Vocab size: 4039


In [6]:
# import numpy as np
# import torch

# def load_glove_embeddings(glove_file_path, vocab, embedding_dim = 50):
#     embeddings_index = {}
#     with open(glove_file_path, encoding="utf8") as f:
#         for line in f:
#             values = line.split() #split at every space
#             word = values[0] #the word is the first thing in each line
#             vector = np.asarray(values[1:], dtype="float32") #convert rest of inputs in line to numpy array
#             embeddings_index[word] = vector
    
#     embedding_matrix = np.zeros((max(vocab.values()) + 1, embedding_dim)) #nump array filled with 0's initially and has a dimention of 50 for each word
#     # function adds word's vector into our matrix
#     for word, idx in vocab.items():
#         vector = embeddings_index.get(word)
#         if vector is not None:
#             embedding_matrix[idx] = vector
#         else:
#             embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

#     return torch.tensor(embedding_matrix, dtype=torch.float32)

# embedding_matrix = load_glove_embeddings("glove.6B.50d.txt", vocab_dict, embedding_dim=50)




In [7]:

class NewsDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab

    # number of samples in dataset
    def __len__(self):
        return len(self.texts)

    # call when we want only one sample
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        tokens = torch.tensor(numericalize(text, self.vocab), dtype=torch.long) #self.vocab is for mapping words to numbers
        return tokens, torch.tensor(label, dtype=torch.long) #returns tokens and labels
    
# make data into dataset object
train_dataset = NewsDataset(train_texts, train_labels, vocab_dict)
test_dataset = NewsDataset(test_texts, test_labels, vocab_dict)

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=0) #makes all lists same length by adding zeros at end
    labels = torch.tensor(labels, dtype=torch.long)
    return texts, labels

# train 32 healines per batch
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch) # reshuffle every epoch
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_batch)

print("Train batches:", len(train_loader))
print("Test batches:", len(test_loader))





Train batches: 122
Test batches: 31


In [8]:
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  #turns vocab id into dense vector
        #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.fc = nn.Linear(embed_dim, num_classes) # input is average embedding of a sentence and output is num_classes(2)cwhich is buy or sell
    
    def forward(self, x):
        # x = (batch size, seq len)
        embedded = self.embedding(x) # gives us original output and embed_dim
        pooling = embedded.mean(dim = 1) # average embedding across all words in teh sentence, used to get one vec per sentence
        output = self.fc(pooling) # passes vector into classifier
        return output

In [None]:
#initialize model, loss, optimizer
vocab_size = max(vocab_dict.values()) + 1 # amount of unique words
embed_dim = 50 # size of each word vector, the larger the number the more expressive the word is
num_classes = len(set(train_labels)) # number of classes -> 2(buy/sell)
model = TextClassifier(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss() # diff between prediction and target
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001) # optimizer that updates weights using gradients


In [10]:
# model training loop
def train_model(self, train_loader, criterion, optimizer, epochs = 5):
    model.train() # put in training mode
    for epoch in range(epochs):
        total_loss = 0
        for texts, label in train_loader:
            optimizer.zero_grad() # reset gradients
            outputs = model(texts) # forward pass
            loss = criterion(outputs, label) # calculate error
            loss.backward() # back propagation
            optimizer.step() # update weights
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

In [11]:
def evaluate_model(self, test_loader):
    model.eval() # put in evaluation mode
    correct, total = 0, 0
    with torch.no_grad(): # gradients not needed in eval mode
        for texts, labels in test_loader:
            outputs = model(texts) # forward pass
            _, predicted = torch.max(outputs, 1) # get prediction index 0 = sell 1 = buy
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

In [12]:
# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=30)

torch.save(model.state_dict(), "sentiment_model.pth")
checkpoint = torch.load("sentiment_model.pth", map_location="cpu")
for key, value in checkpoint.items():
    print(key, value.shape)

# Evaluate on test set
evaluate_model(model, test_loader)


Epoch 1/30, Loss: 114.2123
Epoch 2/30, Loss: 110.9316
Epoch 3/30, Loss: 107.5460
Epoch 4/30, Loss: 103.7880
Epoch 5/30, Loss: 100.0297
Epoch 6/30, Loss: 96.4197
Epoch 7/30, Loss: 92.7011
Epoch 8/30, Loss: 88.9235
Epoch 9/30, Loss: 85.2008
Epoch 10/30, Loss: 81.9970
Epoch 11/30, Loss: 78.3204
Epoch 12/30, Loss: 75.2570
Epoch 13/30, Loss: 72.4520
Epoch 14/30, Loss: 69.0984
Epoch 15/30, Loss: 65.8315
Epoch 16/30, Loss: 63.0408
Epoch 17/30, Loss: 59.9910
Epoch 18/30, Loss: 57.7102
Epoch 19/30, Loss: 54.6150
Epoch 20/30, Loss: 52.2284
Epoch 21/30, Loss: 50.3947
Epoch 22/30, Loss: 47.7134
Epoch 23/30, Loss: 45.6004
Epoch 24/30, Loss: 43.5257
Epoch 25/30, Loss: 41.5999
Epoch 26/30, Loss: 40.1510
Epoch 27/30, Loss: 38.5052
Epoch 28/30, Loss: 36.9869
Epoch 29/30, Loss: 35.6788
Epoch 30/30, Loss: 33.9636
embedding.weight torch.Size([8404, 50])
fc.weight torch.Size([3, 50])
fc.bias torch.Size([3])
Test Accuracy: 75.67%


In [13]:
# use model
import torch.nn.functional as F
def prediction(text, model, vocab, max_len = 50):
    model.eval()
    with torch.no_grad():
        # tokenize
        tokens = text.lower().split()
        token_ids = [vocab_dict.get(word, vocab["<unk>"]) for word in tokens]

        # pad
        if len(tokens) < max_len:
            token_ids += [vocab["<pad>"]] * (max_len - len(token_ids))
        else:
            token_ids = token_ids[:max_len]

        input_tensor = torch.tensor([token_ids])
        output = model(input_tensor)

        probs = F.softmax(output, dim=1) # rescales input and has it sum to 1, probability distribution
        predicted_class = torch.argmax(probs, dim=1).item() # picks class with highest probability
        return predicted_class, probs.numpy()

In [14]:
# # Recreate vocab + model
# model = TextClassifier(vocab_size, embed_dim, num_classes)
# model.load_state_dict(torch.load("sentiment_model.pth"))
# model.eval()

# # Then predict
# sentiment = prediction("Tesla stock decreases alot. ", model, vocab_dict)
# print(sentiment)

In [15]:
sample_text = "Stock prices soar after company reports record earnings"
pred_class, pred_probs = prediction(sample_text, model, vocab_dict)

print("Prediction:", pred_class)
print("Probabilities:", pred_probs)
example_texts = [
    "With the new production plant the company would increase its capacity to meet the expected increase",
    "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing",
    "The international electronic industry company Elcoteq has laid off tens of employees from its Tallin"

]

for txt in example_texts:
    pred_class, probs = prediction(txt, model, vocab_dict)
    print(f"Text: {txt}")
    print(f"Predicted Class: {pred_class}, Probabilities: {probs}\n")


Prediction: 2
Probabilities: [[0.16540395 0.41472107 0.41987497]]
Text: With the new production plant the company would increase its capacity to meet the expected increase
Predicted Class: 2, Probabilities: [[0.0046184  0.08148839 0.9138933 ]]

Text: Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing
Predicted Class: 2, Probabilities: [[0.12834062 0.0231664  0.848493  ]]

Text: The international electronic industry company Elcoteq has laid off tens of employees from its Tallin
Predicted Class: 0, Probabilities: [[0.69263464 0.12259816 0.18476723]]



In [16]:
# If you used sklearn LabelEncoder earlier and saved it as `encoder`:
print("Label encoder classes (index -> label):")
for i, lbl in enumerate(encoder.classes_):
    print(i, lbl)

# If you used manual mapping like {"negative":0, "neutral":1, "positive":2}:
label_map = {0: "negative", 1: "neutral", 2: "positive"}
print(label_map)

Label encoder classes (index -> label):
0 negative
1 neutral
2 positive
{0: 'negative', 1: 'neutral', 2: 'positive'}


In [17]:
import torch, torch.nn.functional as F, numpy as np

def debug_predict(text, model, vocab, label_map, max_len=50):
    model.eval()
    tokens = text.lower().split()
    token_ids = [vocab.get(w, vocab.get("<unk>", 1)) for w in tokens]
    print("Text:", text)
    print("Tokens:", tokens)
    print("Token IDs:", token_ids)
    unk_count = sum(1 for w in tokens if w not in vocab)
    print(f"UNKs: {unk_count}/{len(tokens)} ({unk_count/len(tokens):.2%})")

    # pad/truncate
    if len(token_ids) < max_len:
        token_ids += [vocab.get("<pad>", 0)] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]

    with torch.no_grad():
        out = model(torch.tensor([token_ids]))
        probs = F.softmax(out, dim=1).cpu().numpy()[0]
        pred = int(np.argmax(probs))

    print("Predicted index:", pred, "Label:", label_map[pred])
    print("Probabilities:", {label_map[i]: float(probs[i]) for i in range(len(probs))})
    return pred, probs

# Use it:
debug_predict("buy", model, vocab_dict, label_map)


Text: buy
Tokens: ['buy']
Token IDs: [700]
UNKs: 0/1 (0.00%)
Predicted index: 1 Label: neutral
Probabilities: {'negative': 0.06489437818527222, 'neutral': 0.7436578869819641, 'positive': 0.19144777953624725}


(1, array([0.06489438, 0.7436579 , 0.19144778], dtype=float32))

In [None]:
import pandas as pd
from collections import Counter

# Load your processed dataset
df = pd.read_csv("data.csv", encoding='latin-1')  # has 'label' and 'text' columns
df.columns = ['label', 'text']
# Optional: only keep positive/negative labels if you want
finance_df = df[df['label'] != 'neutral']

# Get all words from the text
all_words = " ".join(finance_df['text'].tolist()).lower().split()

# Count frequency
word_counts = Counter(all_words)

# Remove common stopwords
stopwords = set(["the","a","and","of","to","in","for","on","with","as","at","is","has","that", ".", ",", "eur", "'s", "its'", "said", "(', ')",
                 "it", "2009", "was", "2008", "2010", "2007", ":", "its", "-", "``", "2006"])
finance_words = [word for word, count in word_counts.most_common(1000) if word not in stopwords]

print(finance_words[:50])  # preview top words




FileNotFoundError: [Errno 2] No such file or directory: 'all-data.csv'

In [None]:
import requests

API_KEY = "f726ca01832b44599b281c99e7a3d0b8"
query = "NVDA"
url = f"https://newsapi.org/v2/everything?q={query}&language=en&sortBy=publishedAt&apiKey={API_KEY}"

response = requests.get(url)
articles = response.json()["articles"]



for a in articles[:5]:
    print(a["title"])
    sentiment = prediction(a["title"], model, vocab_dict)
    print(sentiment)


Legendary fund manager has surprising take on AI
(1, array([[0.04693203, 0.8269804 , 0.12608758]], dtype=float32))
Veteran analyst turns heads with new AMD stock target
(1, array([[0.07726745, 0.48009095, 0.4426416 ]], dtype=float32))
Legendary fund manager has surprising take on AI
(1, array([[0.04693203, 0.8269804 , 0.12608758]], dtype=float32))
Top Stock Movers Now: Nvidia, AMD, Dell, Fair Isaac, and More
(1, array([[0.07136565, 0.611557  , 0.31707728]], dtype=float32))
Nvidia CEO Jensen Huang Calls AMD's 10% OpenAI Stake Offer Surprising but Clever
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))


In [None]:
import requests

API_KEY = "f726ca01832b44599b281c99e7a3d0b8"
query = "NVDA"
url = f"https://newsapi.org/v2/everything?q={query}&language=en&sortBy=publishedAt&apiKey={API_KEY}"

response = requests.get(url).json()
headlines = [article['title'] for article in response['articles']]

# Keep only headlines with finance words
filtered_headlines = [h for h in headlines if any(word in h.lower() for word in finance_words)]
# more important words i want to include
important_words = ["profit", "loss", "earnings", "revenue", "merger", "acquisition", "downgrade", "lawsuit", "bankruptcy", "ipo", "interest", "rate"]
filtered_headlines = [h for h in filtered_headlines if any(word in h.lower() for word in important_words)]
for line in filtered_headlines[:5]:
    print(line)
    sentiment = prediction(a["title"], model, vocab_dict)
    print(sentiment)
    


Analyst Says Concerns About NVIDIA (NVDA) ‘Circle of CapEx Spending’ Are Not ‘Accurate’
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))
Analyst Says Concerns About NVIDIA (NVDA) ‘Circle of CapEx Spending’ Are Not ‘Accurate’
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))
Goldman Sachs strategist: No stock market bubble, yet
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))
Bank of America Reiterates “Buy” on NVIDIA (NVDA), Calls It Top AI Pick
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))
Intel Stock (INTC) Gets Price Target Lift on Strategic Moves and Foundry Expansion
(1, array([[0.00897364, 0.9599625 , 0.03106392]], dtype=float32))
