In [2]:
import os
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
# Root directory of the IMDb review data. The loaders in this notebook join it
# with a split name (e.g. 'train') and a 'pos' / 'neg' label folder.
DATA_DIR = "./data/aclImdb"
# Module-level torchtext 'basic_english' tokenizer; yield_tokens() uses it to
# turn raw review text into token lists when the vocabulary is built.
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    """Lazily tokenize the text half of each ``(label, text)`` pair.

    Args:
        data_iter: Iterable of 2-tuples whose second element is a raw text
            string; the first element (the label) is ignored.

    Yields:
        Whatever the module-level ``tokenizer`` returns for each text, in
        the same order as the input pairs.
    """
    yield from (tokenizer(review) for _label, review in data_iter)
# Building vocabulary
def build_vocab(data_dir, split):
    """Build a torchtext vocabulary from every review of one dataset split.

    Args:
        data_dir: Root directory of the dataset (passed through to
            ``iter_text``).
        split: Name of the split whose texts are tokenized, e.g. ``'train'``.

    Returns:
        The vocabulary produced by ``build_vocab_from_iterator`` with
        ``"<unk>"`` registered as its only special token and the index of
        ``"<unk>"`` set as the vocabulary's default index.
    """
    unk_token = "<unk>"
    # Chain the lazy file reader into the lazy tokenizer so texts are
    # streamed through the vocabulary builder one at a time.
    token_stream = yield_tokens(iter_text(data_dir, split))
    vocabulary = build_vocab_from_iterator(token_stream, specials=[unk_token])
    vocabulary.set_default_index(vocabulary[unk_token])
    return vocabulary
def iter_text(data_dir, split):
    """Lazily yield ``(label, text)`` pairs for every review file in a split.

    Expects the layout ``<data_dir>/<split>/{pos,neg}/<file>``. All ``pos``
    files are yielded before any ``neg`` file. Within a label, files are
    visited in sorted filename order so that repeated runs (and different
    filesystems) produce the same sequence.

    Args:
        data_dir: Root directory of the dataset.
        split: Sub-directory of ``data_dir`` to read, e.g. ``'train'``.

    Yields:
        Tuples ``(label, text)`` where ``label`` is ``'pos'`` or ``'neg'``
        and ``text`` is the full content of one file decoded as UTF-8.

    Raises:
        OSError: If a label directory or a file cannot be read. Because this
            is a generator, the error surfaces during iteration.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    split_dir = os.path.join(data_dir, split)
    for label in ('pos', 'neg'):
        label_dir = os.path.join(split_dir, label)
        # os.listdir() returns entries in arbitrary order; sort for
        # reproducibility.
        for fname in sorted(os.listdir(label_dir)):
            with open(os.path.join(label_dir, fname), 'r', encoding='utf-8') as f:
                text = f.read()
            # Yield after the file is closed so no handle stays open while
            # the consumer works on the text (or abandons the generator).
            yield label, text
# Create vocabulary
# Built from the 'train' split only; build_vocab() reads and tokenizes every
# file of that split, and sets the "<unk>" index as the vocabulary's default.
vocab = build_vocab(DATA_DIR, 'train')

In [3]:
class IMDbDataset(Dataset):
    """In-memory dataset of numericalized IMDb reviews with binary labels.

    Every file under ``<data_dir>/<split>/pos`` and ``<data_dir>/<split>/neg``
    is read, tokenized and converted to token ids when the dataset is
    constructed. ``pos`` reviews come first, then ``neg``; within a label the
    files are taken in sorted filename order so that an index always refers
    to the same review across runs and filesystems.
    """

    def __init__(self, data_dir, split, vocab):
        """Load and numericalize all reviews of one split.

        Args:
            data_dir: Root directory of the dataset.
            split: Sub-directory of ``data_dir`` to load, e.g. ``'train'``.
            vocab: Callable mapping a list of tokens to a list of integer
                token ids.

        Raises:
            OSError: If a label directory or a review file cannot be read.
        """
        self.vocab = vocab
        self.data = []    # one 1-D int64 tensor of token ids per review
        self.labels = []  # parallel list of ints: 1 for 'pos', 0 for 'neg'
        self.tokenizer = get_tokenizer('basic_english')
        # Read data
        split_dir = os.path.join(data_dir, split)
        for label in ['pos', 'neg']:
            label_dir = os.path.join(split_dir, label)
            target = 1 if label == 'pos' else 0
            # os.listdir() returns entries in arbitrary order; sorting keeps
            # the index -> sample mapping reproducible (needed for seeded
            # shuffles and splits to be repeatable).
            for fname in sorted(os.listdir(label_dir)):
                with open(os.path.join(label_dir, fname), 'r', encoding='utf-8') as f:
                    text = f.read()
                token_ids = self.vocab(self.tokenizer(text))
                self.data.append(torch.tensor(token_ids, dtype=torch.long))
                self.labels.append(target)

    def __getitem__(self, idx):
        """Return the ``(token_id_tensor, label)`` pair stored at ``idx``."""
        return self.data[idx], self.labels[idx]

    def __len__(self):
        """Return the number of reviews held by the dataset."""
        return len(self.data)
    
# Collate function to pad sequences
def collate_batch(batch, padding_value=3.0):
    """Collate ``(text, label)`` samples into padded batch tensors.

    Args:
        batch: Iterable of ``(text, label)`` pairs, where ``text`` is a 1-D
            sequence of token ids (tensor or list of ints) and ``label`` is
            an int.
        padding_value: Fill value for positions past the end of shorter
            sequences. Defaults to ``3.0``, the value this notebook has
            always used. NOTE(review): the vocabulary built in this notebook
            reserves only ``"<unk>"`` as a special token, so id 3 is an
            ordinary word; consumers should mask with ``lengths`` rather
            than treat id 3 as padding.

    Returns:
        A tuple ``(labels, padded_texts, lengths)`` of int64 tensors:
        ``labels`` has one entry per sample; ``padded_texts`` is the output
        of ``pad_sequence`` with its default ``batch_first=False``, i.e.
        shaped ``(max_len, batch)``; ``lengths`` holds each sample's
        unpadded length.
    """
    labels, texts, lengths = [], [], []
    for text, label in batch:
        labels.append(label)
        # as_tensor reuses an existing tensor where torch.tensor() would
        # copy-construct it and emit a UserWarning for every sample; plain
        # lists are still converted.
        token_ids = torch.as_tensor(text, dtype=torch.int64)
        texts.append(token_ids)
        lengths.append(token_ids.size(0))
    return (
        torch.tensor(labels, dtype=torch.int64),
        pad_sequence(texts, padding_value=padding_value),
        torch.tensor(lengths, dtype=torch.int64),
    )
# Create dataset and data loader
# IMDbDataset reads, tokenizes and numericalizes the whole 'train' split in
# its constructor, so this line does all of the file I/O up front.
train_dataset = IMDbDataset(DATA_DIR, 'train', vocab)
# Shuffled batches of 8 reviews; collate_batch pads each batch to a common
# length and also returns the unpadded lengths.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)