In [41]:
import torch
import torchtext
import torch.nn as nn
from torchtext.datasets import IMDB
torchtext.disable_torchtext_deprecation_warning()
# from torchtext.datasets import IMDB
train_dataset = IMDB(split = 'train')
test_dataset = IMDB(split = 'test')

In [2]:
# split the training dataset into seperate training and validation partitions
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_dataset, valid_dataset = random_split(list(train_dataset), [20000, 5000])

In [3]:
# identify the unique words in the training dataset
#  map these unique words to a unique integer and encode the review text into encoded integers
import re
from collections import Counter, OrderedDict


# split the text into words
def tokenizer(text):
  text = re.sub('<[^>]*>', '', text)
  emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
  tokenized = text.split()
  return tokenized

In [6]:
token_counts = Counter()
for label, line in train_dataset:
  tokens = tokenizer(line)
  token_counts.update(tokens)
print('Vocab-size: ', len(token_counts))

Vocab-size:  69019


In [37]:
# map these unique words in integer
from torchtext.vocab import vocab
sort_tuple = sorted(token_counts.items(), key = lambda x: x[1], reverse = True)
order_dict = OrderedDict(sort_tuple)
vocab= vocab(order_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)

In [40]:
# define the function for transformation 
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: 1.0 if x == 'pos' else 0.0

In [78]:
# wrap the encode and transformation function
def collate_batch(batch):
  label_list, text_list, lengths = [], [], []
  for _label, _text in batch:
    label_list.append(label_pipeline(_label))
    processed_text = torch.tensor(text_pipeline(_text), dtype = torch.int64)
    text_list.append(processed_text)
    lengths.append(processed_text.size(0))
  label_list = torch.tensor(label_list)
  lengths = torch.tensor(lengths)
  padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first = True)
  return padded_text_list, label_list, lengths

In [79]:
# take a small batch
from torch.utils.data import DataLoader

# collate_fn is self funtion to define batch
dataloader = DataLoader(train_dataset, batch_size = 4, shuffle = False, collate_fn = collate_batch)
# let check the first batch
text_batch1, label_list1, lengths1 = next(iter(dataloader))
text_batch1.shape

torch.Size([4, 291])

In [80]:
# let divide all three datasets into dataloader with a batch size of 32:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size = batch_size, shuffle= True, collate_fn = collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size = batch_size, shuffle= True, collate_fn= collate_batch)
test_dl = DataLoader(test_dataset, batch_size = batch_size, shuffle= True, collate_fn= collate_batch)