# torchtext

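Here is an example of using torchtext to process the IMDB dataset: tokenizing the reviews, building a vocabulary, and batching padded sequences with a `DataLoader`.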

In [13]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

# Materialise both IMDB splits as in-memory lists of (label, text) pairs so
# they can be measured with len(), indexed, and iterated more than once.
train_data = [*IMDB(split='train')]
test_data = [*IMDB(split='test')]

# Tokenizer applied to every review (torchtext's built-in 'basic_english' rules)
tokenizer = get_tokenizer(tokenizer='basic_english')

# Construct vocab
def yield_tokens(data):
    """Lazily yield one token list per (label, text) sample in ``data``.

    The label is ignored; the text is lower-cased before it is handed to the
    module-level ``tokenizer``.
    """
    for _, review in data:
        lowered = review.lower()
        yield tokenizer(lowered)

# Build the vocabulary from the training reviews only. '<unk>' and '<pad>'
# are registered as special symbols, and tokens seen fewer than 3 times are
# left out of the vocabulary.
vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_data),
    min_freq=3,
    specials=['<unk>', '<pad>'],
)
# Tokens that are not in the vocabulary are looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])
# Usage: vocab(["this", "is", "me"]) returns the list of indices for those tokens.

# Define data processing pipeline
def text_pipeline(x):
    """Convert a raw review string into a list of vocabulary indices."""
    return vocab(tokenizer(x.lower()))


def label_pipeline(x):
    """Map a raw IMDB label to a binary class id (1 = positive, 0 = negative).

    The raw label encoding depends on the torchtext release: older releases
    yield the strings 'neg' / 'pos', while newer ones (including the 0.14
    line) yield the integers 1 (negative) and 2 (positive). Comparing only
    against 'pos' would silently map every integer label to class 0, so both
    encodings are accepted here.
    """
    return 1 if x in ('pos', 2) else 0

# Define batch processing function
def collate_batch(batch):
    """Collate raw (label, text) samples into a pair of batch tensors.

    Args:
        batch: sequence of (label, text) pairs as stored in the dataset.

    Returns:
        (labels, texts): ``labels`` is a 1-D int64 tensor of class ids;
        ``texts`` is a 2-D int64 tensor (batch first) in which shorter
        sequences are padded at the end with the '<pad>' index.
    """
    # Encode every sample in a single pass, label first, then text.
    encoded = [
        (label_pipeline(raw_label), torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
        for raw_label, raw_text in batch
    ]
    class_ids = [pair[0] for pair in encoded]
    sequences = [pair[1] for pair in encoded]

    # Pad every sequence up to the longest one in this batch
    padded = pad_sequence(sequences, batch_first=True, padding_value=vocab['<pad>'])
    return torch.tensor(class_ids, dtype=torch.int64), padded

# Hold out 5000 training reviews for validation; the seeded generator makes
# the split identical on every run.
train_split, val_split = random_split(
    dataset=train_data,
    lengths=[len(train_data) - 5000, 5000],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoader
batch_size = 64
# Only the training loader shuffles. Its shuffle order is drawn from a
# dedicated seeded generator so the batches (and the sample output below) are
# reproducible across runs, just like the seeded train/validation split.
train_loader = DataLoader(train_split, batch_size=batch_size,
                          shuffle=True, collate_fn=collate_batch,
                          generator=torch.Generator().manual_seed(42))
# Validation and test data are read in their stored order.
val_loader = DataLoader(val_split, batch_size=batch_size,
                        shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_data, batch_size=batch_size,
                         shuffle=False, collate_fn=collate_batch)

# Example: report the vocabulary size, then pull one collated batch from the
# training loader and show the shapes of its label and text tensors. The text
# tensor's second dimension is the longest review in that batch, so it varies
# from batch to batch.
print(f"The size of vocab: {len(vocab)}")
sample_batch = next(iter(train_loader))  # (labels, texts) as returned by collate_batch
print(f"Label shape: {sample_batch[0].shape}")
print(f"Text shape: {sample_batch[1].shape}")

The size of vocab: 40252
Label shape: torch.Size([64])
Text shape: torch.Size([64, 1256])


In [6]:
import torch, torchtext, torchdata
# Print the installed library versions; the trailing comments record the
# versions this notebook's outputs were produced with.
print(f"PyTorch: {torch.__version__}")       # 1.13.1
print(f"TorchText: {torchtext.__version__}") # 0.14.1
print(f"TorchData: {torchdata.__version__}") # 0.5.1

PyTorch: 1.13.1
TorchText: 0.14.1
TorchData: 0.5.1
