In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
# AdamW has been moved to torch.optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re



class SSTDataset(Dataset):
    def __init__(self, filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            data = f.readlines()

        self.texts = []
        self.labels = []

        for line in data:
            # Extract sentiment score (0-4)
            score = int(re.search(r'\((\d+)', line).group(1))

            # PROPERLY extract text from parse tree
            text = re.sub(r'\(\d+', '', line)    # Remove score markers
            text = re.sub(r'[()]', '', text)      # Remove parentheses
            text = re.sub(r'\s{2,}', ' ', text)   # Collapse multiple spaces
            text = text.strip()

            if 0 <= score <= 4 and len(text) > 0:
                self.labels.append(score)
                self.texts.append(text)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
    encoding = tokenizer(
        self.texts[idx],
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'  # Already returns (1, seq_len)
    )
    return {
        'input_ids': encoding['input_ids'].squeeze(0),  # Shape: (seq_len,)
        'attention_mask': encoding['attention_mask'].squeeze(0),
        'label': torch.tensor(self.labels[idx], dtype=torch.long)
    }
# =====================
# 2. Modified Training Setup
# =====================

def train_model(model, train_loader, val_loader, optimizer, device, num_epochs=10):
    model = model.to(device)
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    # Add label smoothing to prevent overconfidence
    criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        epoch_train_loss = 0
        train_correct = 0
        train_total = 0

        for batch in train_loader:
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)  # Use modified criterion
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Calculate metrics
            _, predicted = torch.max(outputs.logits, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            epoch_train_loss += loss.item()

        # Validation phase
        model.eval()
        epoch_val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                labels = batch['label'].to(device)

                outputs = model(**inputs)
                loss = criterion(outputs.logits, labels)

                _, predicted = torch.max(outputs.logits, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
                epoch_val_loss += loss.item()

        # Calculate metrics
        train_loss = epoch_train_loss / len(train_loader)
        train_acc = train_correct / train_total
        val_loss = epoch_val_loss / len(val_loader)
        val_acc = val_correct / val_total

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}\n')

    return model, history

# =====================
# 3. Data Verification Steps
# =====================

# Check dataset statistics
def verify_dataset(dataset):
    print(f"Total samples: {len(dataset)}")
    print("Label distribution:")
    print(pd.Series(dataset.labels).value_counts().sort_index())
    print("\nSample texts:")
    for i in range(3):
        print(f"Text: {dataset.texts[i]} | Label: {dataset.labels[i]}")

# Load datasets
sst_train = SSTDataset('/content/train.txt')
sst_val = SSTDataset('/content/dev.txt')  # MUST BE DIFFERENT FILE
sst_test = SSTDataset('/content/test.txt')

# Verify datasets
print("Training Set:")
verify_dataset(sst_train)
print("\nValidation Set:")
verify_dataset(sst_val)

Training Set:
Total samples: 8544
Label distribution:
0    1092
1    2218
2    1624
3    2322
4    1288
Name: count, dtype: int64

Sample texts:
Text: The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal . | Label: 3
Text: The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth . | Label: 4
Text: Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece . | Label: 3

Validation Set:
Total samples: 1101
Label distribution:
0    139
1    289
2    229
3    279
4    165
Name: count, dtype: int64

Sample texts:
Text: It 's a lovely film with lovely per

In [19]:
# Initialize model and optimizer
sst_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(sst_model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from torch.utils.data import DataLoader

def collate_fn(batch):
    return {
        'input_ids': torch.stack([item['input_ids'] for item in batch]),
        'attention_mask': torch.stack([item['attention_mask'] for item in batch]),
        'label': torch.stack([item['label'] for item in batch])
    }

train_loader = DataLoader(sst_train, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(sst_val, batch_size=32, collate_fn=collate_fn)
sst_test = SSTDataset('/content/test.txt')
sst_test_loader = DataLoader(sst_test, batch_size=32, collate_fn=collate_fn)

# Start training
trained_model, training_history = train_model(
    model=sst_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=10
)
trained_model, training_history = train_model(
    model=sst_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=10
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Train Loss: 1.4017 | Train Acc: 0.4089
Val Loss: 1.2533 | Val Acc: 0.4868

Epoch 2/10
Train Loss: 1.1693 | Train Acc: 0.5535
Val Loss: 1.2288 | Val Acc: 0.5059

Epoch 3/10
Train Loss: 1.0603 | Train Acc: 0.6285
Val Loss: 1.2284 | Val Acc: 0.5268

Epoch 4/10
Train Loss: 0.9543 | Train Acc: 0.7017
Val Loss: 1.2797 | Val Acc: 0.5277

Epoch 5/10
Train Loss: 0.8591 | Train Acc: 0.7609
Val Loss: 1.3711 | Val Acc: 0.5159

Epoch 6/10
Train Loss: 0.7687 | Train Acc: 0.8178
Val Loss: 1.4390 | Val Acc: 0.4977

Epoch 7/10
Train Loss: 0.6948 | Train Acc: 0.8539
Val Loss: 1.5131 | Val Acc: 0.4859

Epoch 8/10
Train Loss: 0.6364 | Train Acc: 0.8876
Val Loss: 1.5599 | Val Acc: 0.5023

Epoch 9/10
Train Loss: 0.5846 | Train Acc: 0.9144
Val Loss: 1.6415 | Val Acc: 0.4950

Epoch 10/10
Train Loss: 0.5524 | Train Acc: 0.9291
Val Loss: 1.6782 | Val Acc: 0.4914

Epoch 1/10
Train Loss: 0.5287 | Train Acc: 0.9400
Val Loss: 1.7552 | Val Acc: 0.4950

Epoch 2/10
Train Loss: 0.5093 | Train Acc: 0.9462
Val

In [20]:

def evaluate_model(model, test_loader, device):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
            labels = batch['label'].to(device)

            outputs = model(**inputs)
            loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
            test_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = test_loss / len(test_loader)
    accuracy = correct / total

    print(f'\nTest Loss: {avg_loss:.4f} | Test Accuracy: {accuracy:.4f}\n')
    return avg_loss, accuracy

test_loss, test_acc = evaluate_model(trained_model, sst_test_loader, device)


Test Loss: 1.6281 | Test Accuracy: 0.5258

