## 📚 Using Built-in Text Datasets in PyTorch (torchtext)
This notebook shows how to load the AG News dataset with `torchtext.datasets`, build a vocabulary from it, and produce padded batches with a `DataLoader`.

In [None]:
from collections import Counter
from itertools import islice

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import Vocab
from torchtext.vocab import build_vocab_from_iterator

### 1. Load the AG News Dataset

In [None]:
# Fetch the training split (downloaded on first use)
train_iter = AG_NEWS(split='train')

# Preview the first three (label, text) samples; zip() with range(3) stops
# after exactly three items have been pulled from the dataset iterator.
for _, (label, text) in zip(range(3), train_iter):
    print(f"Label: {label}, Text: {text}")

### 2. Tokenize and Build Vocabulary

In [None]:
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

# Build vocabulary.
# The legacy `Vocab(counter, specials=...)` constructor and
# `Vocab.set_default_index` belong to different torchtext API generations and
# cannot be combined. `build_vocab_from_iterator` is the factory that pairs
# with `set_default_index`: it counts tokens itself, orders them by frequency,
# and places the special tokens first (so '<unk>' is 0 and '<pad>' is 1).
vocab = build_vocab_from_iterator(
    (tokenizer(line) for _, line in train_iter),
    specials=['<unk>', '<pad>'],
)
# Out-of-vocabulary tokens map to '<unk>' instead of raising an error.
vocab.set_default_index(vocab['<unk>'])

# Check vocab size
print("Vocabulary size:", len(vocab))

### 3. Encode and Pad Batches

In [None]:
def encode(text):
    """Tokenize ``text`` and return its vocabulary indices as a long tensor.

    Uses the module-level ``tokenizer`` to split the text and the module-level
    ``vocab`` to look up each token's index.
    """
    tokens = tokenizer(text)
    token_ids = [vocab[tok] for tok in tokens]
    return torch.tensor(token_ids, dtype=torch.long)

def collate_batch(batch):
    """Collate ``(label, text)`` samples into a padded batch.

    Args:
        batch: Iterable of ``(label, text)`` pairs. Labels are expected to be
            1-based class ids (1-4 for AG News).

    Returns:
        A ``(text_padded, labels)`` tuple. ``text_padded`` holds the encoded
        texts stacked with ``batch_first=True`` and padded with the ``<pad>``
        index up to the longest sequence in the batch. ``labels`` is a 1-D
        ``torch.long`` tensor of zero-based class ids.
    """
    label_list, text_list = [], []
    for label, text in batch:
        # Keep plain numbers here and build a single tensor at the end rather
        # than wrapping every label in its own 0-d tensor.
        label_list.append(label - 1)  # Labels are 1-4 -> shift to 0-3
        text_list.append(encode(text))
    pad_index = vocab['<pad>']
    text_padded = pad_sequence(text_list, batch_first=True, padding_value=pad_index)
    return text_padded, torch.tensor(label_list, dtype=torch.long)

# Create DataLoader
train_iter = AG_NEWS(split='train')
# Only the first 1024 samples are needed for this demo. islice stops reading
# after 1024 items instead of materializing the whole training split first.
train_subset = list(islice(train_iter, 1024))
train_loader = DataLoader(train_subset, batch_size=16, shuffle=True, collate_fn=collate_batch)

# View one batch
for batch_text, batch_labels in train_loader:
    print("Batch text shape:", batch_text.shape)
    print("Batch labels shape:", batch_labels.shape)
    break