In [1]:
from torchtext.datasets import AG_NEWS
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F


# Identifiers of the dataset and of the pretrained tokenizer checkpoint.
DATASET_NAME = 'ag_news'
TOKENIZER_CHECKPOINT = 'bert-base-uncased'

# Load the dataset; its 'train' and 'test' splits are consumed further down.
ag_news = load_dataset(DATASET_NAME)

# Load the matching tokenizer and print its configuration for reference.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
print(tokenizer)

def tokenize_function(example):
    """Tokenize the ``'text'`` entry of ``example`` into fixed-length inputs.

    Meant to be passed to ``Dataset.map``; the notebook calls it with
    ``batched=True``, so ``example['text']`` is a batch of strings there.

    Every sequence is truncated to at most 512 tokens and padded up to
    exactly 512 tokens, so all encoded rows share the same length.

    Args:
        example: Mapping that provides the raw text under the ``'text'`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    fixed_length = 512
    encoded = tokenizer(
        example['text'],
        max_length=fixed_length,   # hard cap and padding target
        truncation=True,           # cut anything longer than the cap
        padding='max_length',      # pad shorter inputs up to the cap
    )
    return encoded

# Tokenize every split batch-wise, then drop the raw 'text' column, which is
# redundant once the tokenized fields exist.
tokenized_datasets = ag_news.map(tokenize_function, batched=True).remove_columns(['text'])

# Make the dataset hand out PyTorch tensors when it is indexed.
tokenized_datasets.set_format('torch')

# Step 4: wrap the splits in DataLoaders for mini-batching.
BATCH_SIZE = 256

# Training batches are shuffled; test batches keep the dataset order.
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=tokenized_datasets['test'],
    batch_size=BATCH_SIZE,
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/valentin/.pyenv/versions/3.12.7/envs/nlpworkshop/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/valentin/.pyenv/versions/3.12.7/envs/nlpworkshop/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/valentin/.pyenv/versions/3.12.7/envs/nlpworkshop/lib/python3.12/site-p

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [2]:
class Model(nn.Module):
    """Minimal classifier that sums per-token embeddings.

    Each token id is looked up in a 4-dimensional embedding table and the
    resulting vectors are summed over dimension 1. The 4-element sum is
    returned unchanged, with no linear layer or softmax applied.
    """

    def __init__(self):
        super().__init__()
        # One 4-dimensional vector per entry of the tokenizer's vocabulary.
        self.embedding = nn.Embedding(
            num_embeddings=tokenizer.vocab_size,
            embedding_dim=4,
        )
        # Registered as a submodule (so it appears in parameters() and the
        # state_dict) but currently not applied in forward().
        self.linear = nn.Linear(in_features=4, out_features=4)

    def forward(self, input_ids):
        """Return the sum of the token embeddings along dimension 1.

        Args:
            input_ids: Integer tensor of token ids; assumed to be shaped
                (batch, sequence_length) -- TODO confirm against callers.

        Returns:
            Tensor with dimension 1 of the embedded input summed away.
        """
        token_vectors = self.embedding(input_ids)
        # Dimension 1 is reduced and not kept.
        return token_vectors.sum(dim=1)

model = Model()

In [7]:
# Training configuration and running metric accumulators.
epochs = 100
running_loss = 0
running_accuracy = 0

# Move the model to the GPU when one is available, otherwise stay on the CPU.
# (The previous hard-coded 'cuda' device failed on machines without a GPU.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss.
# NOTE(review): recent transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW -- confirm against the installed version before
# upgrading; the optimizer is intentionally left unchanged here.
optimizer = AdamW(model.parameters(), lr=3e-4)
# Cross-entropy is applied to the model outputs in the training loop.
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Reset the accumulators at the start of every epoch. Previously they were
    # seeded with the previous epoch's *averages*, which leaked into the next
    # epoch's sums and skewed every logged loss/accuracy after epoch 1.
    running_loss = 0.0
    running_accuracy = 0.0

    # Make sure the model is in training mode even if this cell is re-run
    # after the evaluation code has switched it to eval mode.
    model.train()

    for batch_idx, batch in enumerate(train_dataloader):
        # Only the token ids and labels are consumed by the model/loss; the
        # attention mask is not used, so it is no longer copied to the device.
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # Zero the gradients before the backward pass
        optimizer.zero_grad()

        # Forward pass: compute the model's predictions
        outputs = model(input_ids)

        # Predicted class = index of the largest output per example
        predicted_classes = torch.argmax(outputs, dim=1)

        # Fraction of correct predictions in this batch
        correct_predictions = (predicted_classes == labels).sum().item()
        accuracy = correct_predictions / labels.size(0)
        running_accuracy += accuracy

        # Loss for this batch
        loss = criterion(outputs, labels)

        # Backward pass: compute the gradients
        loss.backward()

        # Step the optimizer (update the model's weights)
        optimizer.step()

        running_loss += loss.item()

        # Print the running mean loss of the current epoch every 100 batches
        if (batch_idx + 1) % 100 == 0:
            print(f"Batch {batch_idx + 1}, Loss: {running_loss / (batch_idx + 1)}")

    # Per-epoch means over the number of batches.
    epoch_loss = running_loss / len(train_dataloader)
    epoch_accuracy = running_accuracy / len(train_dataloader)
    print(f"Epoch {epoch + 1} finished with average loss: {epoch_loss}, and an average accuracy of {epoch_accuracy}")

# Accumulators for the mean per-batch accuracy and loss on the test split.
validation_accuracy = 0
validation_loss = 0

model.eval()
# Evaluation needs no gradients: disabling autograd avoids building a graph
# for every batch.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids)

        # Fraction of correct predictions in this batch
        predicted_classes = torch.argmax(outputs, dim=1)
        correct_predictions = (predicted_classes == labels).sum().item()
        accuracy = correct_predictions / labels.size(0)
        validation_accuracy += accuracy

        loss = criterion(outputs, labels)
        # Accumulate a plain Python float. Adding the tensor itself kept every
        # batch's graph alive and made the final print show a tensor repr.
        validation_loss += loss.item()

# Means over the number of test batches.
validation_accuracy /= len(test_dataloader)
validation_loss /= len(test_dataloader)
print("validation accuracy", validation_accuracy)
print("validation loss", validation_loss)

# Step 8: Save the trained model's weights (state_dict only)
torch.save(model.state_dict(), './new_training_run')

print("Training complete and model saved!")
        

Batch 100, Loss: 0.23083771459758282
Batch 200, Loss: 0.22770953945815564
Batch 300, Loss: 0.22654648818075657
Batch 400, Loss: 0.2273319841362536
Epoch 1 finished with average loss: 0.22799715388621858, and an average accuracy of 0.9364339019189766
Batch 100, Loss: 0.24218751504906078
Batch 200, Loss: 0.23278089880887698
Batch 300, Loss: 0.2299418532347946
Batch 400, Loss: 0.22574243416981912
Epoch 2 finished with average loss: 0.22501130437013875, and an average accuracy of 0.9392051495776524
Batch 100, Loss: 0.22957064144662592
Batch 200, Loss: 0.22498534176673282
Batch 300, Loss: 0.22713624179479672
Batch 400, Loss: 0.22403307803062492
Epoch 3 finished with average loss: 0.22133283749265453, and an average accuracy of 0.9403909846828238
Batch 100, Loss: 0.22149594154805372
Batch 200, Loss: 0.21778645621910905
Batch 300, Loss: 0.2168228138630603
Batch 400, Loss: 0.2160601931918423
Epoch 4 finished with average loss: 0.2176468176084931, and an average accuracy of 0.9407932998962676
B