In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from collections import Counter


In [2]:
# Load pre-cleaned datasets
df_train = pd.read_csv("sent_train.csv")
df_valid = pd.read_csv("sent_valid.csv")

train_texts = df_train['text'].tolist()
train_labels = df_train['label'].tolist()

val_texts = df_valid['text'].tolist()
val_labels = df_valid['label'].tolist()


In [3]:
tokenized = [text.split() for text in train_texts]
word_counts = Counter(word for sentence in tokenized for word in sentence)
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

def encode(text):
    return [vocab.get(word, 1) for word in text.split()]


In [4]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(encode(text)) for text in texts]
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)

train_ds = SentimentDataset(train_texts, train_labels)
val_ds = SentimentDataset(val_texts, val_labels)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [5]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

model = LSTMClassifier(vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=3)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    # Training
    model.train()
    for batch in train_loader:
        x, y = [b.to(device) for b in batch]
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in val_loader:
            x, y = [b.to(device) for b in batch]
            output = model(x)
            loss = criterion(output, y)
            val_loss += loss.item() * y.size(0)
            preds = output.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

    val_acc = correct / total
    val_loss /= total
    print(f"Epoch {epoch+1}: Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")


Epoch 1: Val Loss = 0.9470, Val Acc = 0.5369
Epoch 2: Val Loss = 0.8642, Val Acc = 0.5867
Epoch 3: Val Loss = 1.1153, Val Acc = 0.5352
Epoch 4: Val Loss = 1.0341, Val Acc = 0.5360
Epoch 5: Val Loss = 1.4325, Val Acc = 0.5419


In [7]:
print("\nFinal Evaluation on Validation Set:")
print(classification_report(y_true, y_pred, target_names=["Bearish", "Bullish", "Neutral"]))


Final Evaluation on Validation Set:
              precision    recall  f1-score   support

     Bearish       0.33      0.36      0.35       347
     Bullish       0.31      0.72      0.44       475
     Neutral       0.90      0.53      0.66      1566

    accuracy                           0.54      2388
   macro avg       0.52      0.54      0.48      2388
weighted avg       0.70      0.54      0.57      2388

