In [1]:
# imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import re

In [18]:
# load data set
df = pd.read_csv("data.csv", encoding='latin-1') 
df.columns = ['label', 'text']

df2 = pd.read_csv("data.csv", encoding='latin-1') 
df2 = df2.rename(columns={'Sentence': 'text', 'Sentiment': 'label'})
# Combine datasets
df = pd.concat([df, df2], ignore_index=True)
print(df.head())
print(df.columns)

      label                                               text neutral  "Hi"
0   neutral  According to Gran , the company has no plans t...     NaN   NaN
1   neutral  Technopolis plans to develop in stages an area...     NaN   NaN
2  negative  The international electronic industry company ...     NaN   NaN
3  positive  With the new production plant the company woul...     NaN   NaN
4  positive  According to the company 's updated strategy f...     NaN   NaN
Index(['label', 'text', 'neutral', ' "Hi"'], dtype='object')


In [19]:
# clean text
def clean_text(text):
    text = text.lower() # convert string to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove links
    text = re.sub(r'\@w+|\#','', text)  # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove numbers and punctuation
    return text
df['text'] = df['text'].astype(str).apply(clean_text) # updates text column to string and cleans

# encode labels
df = df.dropna(subset=['label'])
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])  

print("Label classes:", encoder.classes_)

Label classes: ['negative' 'neutral' 'positive']


In [20]:
# train/tests split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)
# here were training it so it can predict what label each text goes under
print("Unique labels:", sorted(set(train_labels)))


Unique labels: [0, 1, 2]


In [21]:
# create data set class
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from collections import Counter
import pickle


# splits text at any whitespace
def tokenizer(text):
    return text.split()

# goes through all of the text and yeilds the tokens
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)


def build_vocab(texts, min_freq=2):

    counter = Counter()
    for text in texts:
        counter.update(tokenizer(text))
    # start indices at 2 so we can reserve 0 for PAD and 1 for UNK
    vocab = {word: i+2 for i, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab["<pad>"] = 0
    vocab["<unk>"] = 1
    return vocab

vocab_dict = build_vocab(train_texts, min_freq=2)  

words_to_keep = list(vocab_dict.keys())

# Create a new dictionary assigning completely new sequential indices
new_vocab_dict = {}
for i, word in enumerate(words_to_keep):
    if word not in ["<pad>", "<unk>"]:
        new_vocab_dict[word] = i + 2

new_vocab_dict["<pad>"] = 0
new_vocab_dict["<unk>"] = 1
vocab_dict = new_vocab_dict
final_length = len(vocab_dict)
final_max_index = max(vocab_dict.values())


def numericalize(text, vocab):
    tokens = tokenizer(text)  # split into words
    return [vocab.get(token, vocab["<unk>"]) for token in tokens]

with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab_dict, f)


In [22]:

class NewsDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab

    # number of samples in dataset
    def __len__(self):
        return len(self.texts)

    # call when we want only one sample
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        tokens = torch.tensor(numericalize(text, self.vocab), dtype=torch.long) #self.vocab is for mapping words to numbers
        return tokens, torch.tensor(label, dtype=torch.long) #returns tokens and labels
    
# make data into dataset object
train_dataset = NewsDataset(train_texts, train_labels, vocab_dict)
test_dataset = NewsDataset(test_texts, test_labels, vocab_dict)

def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=0) #makes all lists same length by adding zeros at end
    labels = torch.tensor(labels, dtype=torch.long)
    return texts, labels

# train 32 healines per batch
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch) # reshuffle every epoch
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_batch)

print("Train batches:", len(train_loader))
print("Test batches:", len(test_loader))

Train batches: 122
Test batches: 31


In [23]:
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  #turns vocab id into dense vector
        #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.fc = nn.Linear(embed_dim, num_classes) # input is average embedding of a sentence and output is num_classes(2)cwhich is buy or sell
    
    def forward(self, x):
        # x = (batch size, seq len)
        embedded = self.embedding(x) # gives us original output and embed_dim
        pooling = embedded.mean(dim = 1) # average embedding across all words in teh sentence, used to get one vec per sentence
        output = self.fc(pooling) # passes vector into classifier
        return output

In [24]:
#initialize model, loss, optimizer
vocab_size = final_max_index + 1 # amount of unique words
embed_dim = 50 # size of each word vector, the larger the number the more expressive the word is
num_classes = len(set(train_labels)) # number of classes -> 2(buy/sell)

model = TextClassifier(vocab_size, embed_dim, num_classes)

criterion = nn.CrossEntropyLoss() # diff between prediction and target
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001) # optimizer that updates weights using gradients


In [25]:
# model training loop
def train_model(self, train_loader, criterion, optimizer, epochs = 5):
    model.train() # put in training mode
    for epoch in range(epochs):
        total_loss = 0
        for texts, label in train_loader:
            optimizer.zero_grad() # reset gradients
            outputs = model(texts) # forward pass
            loss = criterion(outputs, label) # calculate error
            loss.backward() # back propagation
            optimizer.step() # update weights
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

In [26]:
def evaluate_model(self, test_loader):
    model.eval() # put in evaluation mode
    correct, total = 0, 0
    with torch.no_grad(): # gradients not needed in eval mode
        for texts, labels in test_loader:
            outputs = model(texts) # forward pass
            _, predicted = torch.max(outputs, 1) # get prediction index 0 = sell 1 = buy
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

In [27]:
# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=40)

torch.save(model.state_dict(), "sentiment_model.pth")


# Evaluate on test set
evaluate_model(model, test_loader)


Epoch 1/40, Loss: 123.9223
Epoch 2/40, Loss: 111.7427
Epoch 3/40, Loss: 108.0877
Epoch 4/40, Loss: 105.0638
Epoch 5/40, Loss: 100.8953
Epoch 6/40, Loss: 97.0212
Epoch 7/40, Loss: 93.5396
Epoch 8/40, Loss: 89.9924
Epoch 9/40, Loss: 87.0621
Epoch 10/40, Loss: 84.4632
Epoch 11/40, Loss: 80.8204
Epoch 12/40, Loss: 77.9808
Epoch 13/40, Loss: 74.7600
Epoch 14/40, Loss: 71.8023
Epoch 15/40, Loss: 69.1842
Epoch 16/40, Loss: 66.3111
Epoch 17/40, Loss: 63.6448
Epoch 18/40, Loss: 61.0526
Epoch 19/40, Loss: 58.5524
Epoch 20/40, Loss: 55.4626
Epoch 21/40, Loss: 53.4188
Epoch 22/40, Loss: 51.2652
Epoch 23/40, Loss: 48.2278
Epoch 24/40, Loss: 46.7174
Epoch 25/40, Loss: 44.4867
Epoch 26/40, Loss: 42.6324
Epoch 27/40, Loss: 40.5574
Epoch 28/40, Loss: 38.8397
Epoch 29/40, Loss: 37.2100
Epoch 30/40, Loss: 35.9975
Epoch 31/40, Loss: 34.3856
Epoch 32/40, Loss: 33.1798
Epoch 33/40, Loss: 31.8744
Epoch 34/40, Loss: 31.3598
Epoch 35/40, Loss: 29.2919
Epoch 36/40, Loss: 28.0347
Epoch 37/40, Loss: 27.2143
Epoch

In [28]:
checkpoint = torch.load("sentiment_model.pth", map_location="cpu")
for key, value in checkpoint.items():
    print(key, value.shape)

embedding.weight torch.Size([4039, 50])
fc.weight torch.Size([3, 50])
fc.bias torch.Size([3])


In [29]:
# use model
import torch.nn.functional as F
def prediction(text, model, vocab, max_len = 50):
    model.eval()
    with torch.no_grad():
        # tokenize
        tokens = text.lower().split()
        token_ids = [vocab_dict.get(word, vocab["<unk>"]) for word in tokens]

        # pad
        if len(tokens) < max_len:
            token_ids += [vocab["<pad>"]] * (max_len - len(token_ids))
        else:
            token_ids = token_ids[:max_len]

        input_tensor = torch.tensor([token_ids])
        output = model(input_tensor)

        probs = F.softmax(output, dim=1) # rescales input and has it sum to 1, probability distribution
        predicted_class = torch.argmax(probs, dim=1).item() # picks class with highest probability
        return predicted_class, probs.numpy()

In [30]:
sample_text = "Stock prices soar after company reports record earnings"
pred_class, pred_probs = prediction(sample_text, model, vocab_dict)

print("Prediction:", pred_class)
print("Probabilities:", pred_probs)
example_texts = [
    "With the new production plant the company would increase its capacity to meet the expected increase",
    "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing",
    "The international electronic industry company Elcoteq has laid off tens of employees from its Tallin"
]

for txt in example_texts:
    pred_class, probs = prediction(txt, model, vocab_dict)
    print(f"Text: {txt}")
    print(f"Predicted Class: {pred_class}, Probabilities: {probs}\n")


Prediction: 2
Probabilities: [[0.21469253 0.27129132 0.5140162 ]]
Text: With the new production plant the company would increase its capacity to meet the expected increase
Predicted Class: 2, Probabilities: [[0.0039546  0.09592258 0.9001228 ]]

Text: Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing
Predicted Class: 2, Probabilities: [[0.14368698 0.01943773 0.8368752 ]]

Text: The international electronic industry company Elcoteq has laid off tens of employees from its Tallin
Predicted Class: 0, Probabilities: [[0.6123134  0.11277669 0.27490994]]



In [31]:
# If you used sklearn LabelEncoder earlier and saved it as `encoder`:
print("Label encoder classes (index -> label):")
for i, lbl in enumerate(encoder.classes_):
    print(i, lbl)

# If you used manual mapping like {"negative":0, "neutral":1, "positive":2}:
label_map = {0: "negative", 1: "neutral", 2: "positive"}
print(label_map)

Label encoder classes (index -> label):
0 negative
1 neutral
2 positive
{0: 'negative', 1: 'neutral', 2: 'positive'}
