In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import re
from torch.nn.utils.rnn import pad_sequence

In [2]:
#curl -L "https://www.dropbox.com/scl/fi/qrc1mwvtqv6lculyblou2/test.csv?rlkey=uzqhburw0mikq7xe08tpl9nvp&st=epeyiu23&raw=1" --output test.csv
#curl -L "https://www.dropbox.com/scl/fi/pglp39zt6nokf2lsqjeqj/train.csv?rlkey=oufor4gckdmrh9ntw3cqswhnq&st=cb80evb3&dl=0" --output train.csv

In [3]:

class AG_NEWS_Dataset(Dataset):
    def __init__(self, data, vocab, transform=None):
        self.data = data
        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, text = self.data[idx]
        if self.transform:
            text = self.transform(text)
        numerical_text = [self.vocab.get(word, self.vocab['<unk>']) for word in text] #Corrected line
        return torch.tensor(label - 1, dtype=torch.int64), torch.tensor(numerical_text, dtype=torch.int64)

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text.split()

def build_vocab(data, min_freq=30):
    counter = Counter()
    for _, text in data:
        counter.update(clean_text(text))
    vocab = {word: idx + 2 for idx, (word, count) in enumerate(counter.items()) if count >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

def download_and_extract(url, extract_path):
    response = requests.get(url, stream=True)
    file = tarfile.open(fileobj=io.BytesIO(response.content), mode='r:gz')
    file.extractall(path=extract_path)

def load_ag_news(root='.'):

    if not os.path.exists(root):
        os.makedirs(root)


    train_data = []
    test_data = []

    with open(os.path.join(root, 'train.csv'), 'r', encoding='utf-8') as f:
        for line in f:
            label, title, description = line.strip().split('","', 2)
            label = int(label.replace('"', ''))
            title = title.replace('"', '')
            description = description.replace('"', '')
            train_data.append((label, title + " " + description))

    with open(os.path.join('test.csv'), 'r', encoding='utf-8') as f:
        for line in f:
            label, title, description = line.strip().split('","', 2)
            label = int(label.replace('"', ''))
            title = title.replace('"', '')
            description = description.replace('"', '')
            test_data.append((label, title + " " + description))

    return train_data, test_data

def collate_batch(batch, pad_idx=0):
    labels, texts = zip(*batch)
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=pad_idx)
    labels = torch.stack(labels)
    return labels, padded_texts

def get_dataloaders(batch_size=64, min_freq=30, root='.'):
    train_data, test_data = load_ag_news(root)
    vocab = build_vocab(train_data, min_freq)

    train_dataset = AG_NEWS_Dataset(train_data, vocab, transform=clean_text)
    test_dataset = AG_NEWS_Dataset(test_data, vocab, transform=clean_text)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collate_batch(batch, pad_idx=vocab['<pad>']))
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda batch: collate_batch(batch, pad_idx=vocab['<pad>']))

    return train_dataloader, test_dataloader, vocab, 4


train_loader, test_loader, vocabulary, num_classes = get_dataloaders()

for labels, texts in train_loader:
    print("Batch Labels:", labels.shape, labels)
    print("Batch Texts:", texts.shape, texts)
    print("Vocab Size:", len(vocabulary))
    print("Number of Classes:", num_classes)
    break

Batch Labels: torch.Size([64]) tensor([2, 3, 2, 3, 1, 2, 0, 2, 2, 1, 2, 3, 0, 2, 0, 1, 1, 3, 1, 2, 0, 0, 1, 0,
        1, 3, 3, 2, 0, 2, 1, 0, 1, 3, 2, 0, 2, 1, 2, 3, 3, 2, 0, 1, 1, 0, 0, 0,
        2, 2, 3, 3, 2, 0, 2, 1, 3, 2, 3, 2, 1, 0, 3, 3])
Batch Texts: torch.Size([64, 54]) tensor([[12107,   495,     1,  ...,     0,     0,     0],
        [ 4673,  8596, 25356,  ...,     0,     0,     0],
        [ 8055,  1079, 19492,  ...,     0,     0,     0],
        ...,
        [ 1785,  8076,  1569,  ...,     0,     0,     0],
        [    1,  2471,  9835,  ...,     0,     0,     0],
        [ 4250,  8383,  4614,  ...,     0,     0,     0]])
Vocab Size: 10845
Number of Classes: 4


In [4]:


class TorchVanillaRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers, pad_idx):
        super(TorchVanillaRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)  # Corrected line
        return self.fc(hidden[-1])  # Use the last hidden state for classification

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

vocab_size = len(vocabulary) # Vocabulary size from BERT tokenizer
embed_size = 256   # Embedding size
hidden_size = 512  # RNN hidden state size
output_size = num_classes    # Number of classes (AG News has 4 categories)
num_layers = 1    # Multi-layer RNN
num_epochs = 10
PAD_IDX=0
# Instantiate the model
model = TorchVanillaRNN(
        vocab_size=vocab_size, 
        embed_size=embed_size,
        hidden_size=hidden_size, 
        output_size=output_size, 
        num_layers=num_layers, 
        pad_idx = PAD_IDX
    ).to(device)

# Trainable embeddings
model.embedding.weight.data.copy_(torch.randn(vocab_size, embed_size))  # Simple init
model.embedding.weight.requires_grad = True  # Ensure embeddings are trainable


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_losses = []
test_losses = []

for epoch in range(num_epochs):
    print("Running epoch:", epoch)
    model.train()
    running_loss = 0.0
    batch_counter = 0  # Add a batch counter to assess speed
    for batch in train_loader:
        if batch_counter % 1000 == 0:
            print("Running batch:", batch_counter)
        batch_counter = batch_counter + 1
        input_ids = batch[1].to(device)  # Shape: (batch_size, seq_len)
        labels = batch[0].to(device)        # Shape: (batch_size,)
        input_ids[input_ids >= vocab_size] = 1
        optimizer.zero_grad()
        outputs  = model(input_ids)  # Convert to float for matrix multiplication
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Add this line
        optimizer.step()
        
        running_loss += loss.item() * input_ids.size(0)
        # Break out of training for brevity in the class, you can remove this
    
    epoch_train_loss = running_loss / (batch_counter * 64)
    train_losses.append(epoch_train_loss)
    
    # Evaluate on test set
    model.eval()
    running_loss_test = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        batch_counter = 0 
        for batch in test_loader:
            batch_counter = batch_counter + 1
            input_ids = batch[1].to(device)
            input_ids[input_ids >= vocab_size] = 1
            labels = batch[0].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            running_loss_test += loss.item() * input_ids.size(0)
            _, predicted = torch.max(outputs, 1)  # Get the predicted class
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
    epoch_test_loss = running_loss_test / (batch_counter * 64)
    test_losses.append(epoch_test_loss)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Test Loss: {epoch_test_loss:.4f}, Test Accuracy: {epoch_accuracy:.4f}")

# Plot the training and test losses over epochs
plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs+1), train_losses, label="Train Loss")
plt.plot(range(1, num_epochs+1), test_losses, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Test Loss Over Epochs")
plt.legend()
plt.show()

Using device: cuda
Running epoch: 0
Running batch: 0
Running batch: 1000
Epoch 1/10, Train Loss: 1.4127, Test Loss: 1.4013, Test Accuracy: 0.2545
Running epoch: 1
Running batch: 0
Running batch: 1000
Epoch 2/10, Train Loss: 1.4101, Test Loss: 1.3950, Test Accuracy: 0.2491
Running epoch: 2
Running batch: 0
Running batch: 1000
Epoch 3/10, Train Loss: 1.4113, Test Loss: 1.3979, Test Accuracy: 0.2507
Running epoch: 3
Running batch: 0
Running batch: 1000
Epoch 4/10, Train Loss: 1.4123, Test Loss: 1.3885, Test Accuracy: 0.2533
Running epoch: 4
Running batch: 0
Running batch: 1000
Epoch 5/10, Train Loss: 1.4128, Test Loss: 1.3988, Test Accuracy: 0.2524
Running epoch: 5
Running batch: 0
Running batch: 1000
Epoch 6/10, Train Loss: 1.4105, Test Loss: 1.3993, Test Accuracy: 0.2571
Running epoch: 6
Running batch: 0
Running batch: 1000
Epoch 7/10, Train Loss: 1.4100, Test Loss: 1.3850, Test Accuracy: 0.2538
Running epoch: 7
Running batch: 0
Running batch: 1000



KeyboardInterrupt



In [45]:
class TorchMultiLayerLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size, num_layers=1, pad_idx=0):
        """
        Implements a multi-layer LSTM for AG News classification.

        Args:
            vocab_size (int): Size of the vocabulary.
            embedding_size (int): Size of the word embeddings.
            hidden_size (int): Number of hidden units per layer.
            output_size (int): Number of output classes (4 for AG News).
            num_layers (int): Number of stacked LSTM layers.
            dropout (float): Dropout probability applied between layers.
        """
        super(TorchMultiLayerLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        # Use PyTorch's built-in LSTM
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

        # Initialize the output layer
        self.reset_parameters()

    def reset_parameters(self):
        """Applies Xavier uniform initialization to the final output layer."""
        nn.init.xavier_uniform_(self.fc.weight)
        if self.fc.bias is not None:
            nn.init.zeros_(self.fc.bias)

    def forward(self, x, h0=None, c0=None):
        """
        Forward pass of the multi-layer LSTM.

        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_len).
            h0 (Tensor, optional): Initial hidden state.
            c0 (Tensor, optional): Initial cell state.

        Returns:
            output (Tensor): Final output from the last layer (batch_size, output_size).
            (h_n, c_n) (tuple): Final hidden and cell states.
        """
        # Get batch size and sequence length
        batch_size = x.size(0)
        
        # Embed the input
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_size)

        # Initialize hidden states if not provided
        if h0 is None:
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
        if c0 is None:
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)

        # Run through LSTM
        output, (h_n, c_n) = self.lstm(embedded, (h0, c0))
        
        # Use the last hidden state for classification
        final_hidden = h_n[-1]  # (batch_size, hidden_size)
        
        # Final classification layer
        logits = self.fc(final_hidden)  # (batch_size, output_size)
        
        return logits, (h_n, c_n)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

vocab_size = tokenizer.vocab_size  # Vocabulary size from BERT tokenizer
embed_size = 256   # Embedding size
hidden_size = 256  # RNN hidden state size
output_size = 4    # Number of classes (AG News has 4 categories)
num_layers = 2    # Multi-layer LSTM
dropout = 0.3
num_epochs = 10
# Instantiate the model
model = TorchMultiLayerLSTM(
        vocab_size=vocab_size, 
        embed_size=embed_size,
        hidden_size=hidden_size, 
        output_size=output_size, 
        num_layers=num_layers, 
        dropout=dropout
    ).to(device)

# Trainable embeddings
model.embedding.weight.data.copy_(torch.randn(vocab_size, embed_size))  # Simple init
model.embedding.weight.requires_grad = True  # Ensure embeddings are trainable


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_losses = []
test_losses = []

for epoch in range(num_epochs):
    print("Running epoch:", epoch)
    model.train()
    running_loss = 0.0
    batch_counter = 0  # Add a batch counter to assess speed
    for batch in train_loader:
        if batch_counter % 1000 == 0:
            print("Running batch:", batch_counter)
        batch_counter = batch_counter + 1
        input_ids = batch['input_ids'].to(device)  # Shape: (batch_size, seq_len)
        labels = batch['labels'].to(device)        # Shape: (batch_size,)
        
        optimizer.zero_grad()
        outputs  = model(input_ids)  # Convert to float for matrix multiplication
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Add this line
        optimizer.step()
        
        running_loss += loss.item() * input_ids.size(0)
        # Break out of training for brevity in the class, you can remove this
    
    epoch_train_loss = running_loss / len(train_dataset)
    train_losses.append(epoch_train_loss)
    
    # Evaluate on test set
    model.eval()
    running_loss_test = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            running_loss_test += loss.item() * input_ids.size(0)
            _, predicted = torch.max(outputs, 1)  # Get the predicted class
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)
    epoch_test_loss = running_loss_test / len(test_dataset)
    test_losses.append(epoch_test_loss)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Test Loss: {epoch_test_loss:.4f}, Test Accuracy: {epoch_accuracy:.4f}")

# Plot the training and test losses over epochs
plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs+1), train_losses, label="Train Loss")
plt.plot(range(1, num_epochs+1), test_losses, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Test Loss Over Epochs")
plt.legend()
plt.show()

In [11]:


class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, pad_idx, num_heads=8, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout)
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.embedding_dim = embedding_dim  # Add this line to store embedding_dim

    def forward(self, text):
        src_pad_mask = (text == 0).transpose(0, 1)  # Generate padding mask
        embedded = self.embedding(text) * torch.sqrt(torch.tensor(self.embedding_dim, dtype=torch.float32))  # Use self.embedding_dim here
        embedded = self.pos_encoder(embedded)
        output = self.transformer_encoder(embedded, src_key_padding_mask=src_pad_mask)
        output = output.mean(dim=1)  # Average over sequence length
        return self.fc(output)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def train(model, iterator, optimizer, criterion, device):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    batch_counter = 0
    for labels, texts in iterator:
        print("Running batch:", batch_counter)
        labels = labels.to(device)
        texts = texts.to(device)
        texts[texts >= VOCAB_SIZE] = 1
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        acc = calculate_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for labels, texts in iterator:
            labels = labels.to(device)
            texts = texts.to(device)

            predictions = model(texts)
            loss = criterion(predictions, labels)
            acc = calculate_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def calculate_accuracy(predictions, labels):
    _, predicted_classes = torch.max(predictions, 1)
    correct_predictions = (predicted_classes == labels).sum().float()
    accuracy = correct_predictions / labels.size(0)  # Corrected line
    return accuracy

if __name__ == '__main__':
    BATCH_SIZE = 64
    MIN_FREQ = 5
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 256
    OUTPUT_DIM = 4
    NUM_LAYERS = 2
    NUM_EPOCHS = 10

    train_dl, test_dl, vocabulary, num_classes = get_dataloaders(batch_size=BATCH_SIZE, min_freq=MIN_FREQ)
    VOCAB_SIZE = len(vocabulary)
    PAD_IDX = vocabulary['<pad>']

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, PAD_IDX).to(device)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(device)

    for epoch in range(NUM_EPOCHS):
        train_loss, train_acc = train(model, train_dl, optimizer, criterion, device)
        valid_loss, valid_acc = evaluate(model, test_dl, criterion, device)

        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.4f}, Val. Acc: {valid_acc*100:.2f}%')

    # Plot the training and test losses over epochs
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, NUM_EPOCHS+1), [train_loss], label="Train Loss")
    plt.plot(range(1, NUM_EPOCHS+1), [valid_loss], label="Test Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Test Loss Over Epochs")
    plt.legend()
    plt.show()

../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [194,0,0], 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
