In [4]:
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter, OrderedDict
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, TensorDataset
import itertools
import portalocker



In [5]:
!pip install torchdata



In [12]:
!pip install portalocker



# 1. Load the IMDb Dataset

In [6]:
!pip install portalocker # Install the missing module

import portalocker # Import the module in your code

def tokenize(text):
    return text.lower().split()

# Load the dataset
train_iter, test_iter = IMDB()

counter = Counter()

for label, text in itertools.chain(train_iter, test_iter):
    tokenized_text = tokenize(text)
    counter.update(tokenized_text)

ordered_counter = OrderedDict(counter.most_common(1000))

# Create the vocabulary
vocab = vocab(ordered_counter, min_freq=10, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])



################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



# 2. Text to Sequence Conversion

In [7]:
def text_to_sequence(text, vocab):
    return [vocab[token] for token in tokenize(text)]

# 3. Prepare Data for Training and Testing

In [8]:
def prepare_data(data_iter, vocab, max_len=200):
    sequences = []
    labels = []
    for label, line in data_iter:
        seq = text_to_sequence(line, vocab)
        if len(seq) > max_len:
            seq = seq[:max_len]
        else:
            seq.extend([vocab["<pad>"]] * (max_len - len(seq)))  # Pad sequence to max_len
        sequences.append(seq)
        labels.append(1 if label == 'pos' else 0)  # Convert label to binary

    return torch.tensor(sequences, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

train_iter, test_iter = IMDB(split=('train', 'test'))
train_seq, train_labels = prepare_data(train_iter, vocab)
test_seq, test_labels = prepare_data(test_iter, vocab)

train_dataset = TensorDataset(train_seq, train_labels)
test_dataset = TensorDataset(test_seq, test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# 4. Define the BiLSTM Model

In [9]:
class BiLSTMNetwork(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class, num_layers):
        super(BiLSTMNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_class)  # Multiply by 2 for bidirectional

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hn, cn) = self.lstm(embedded)
        hn = torch.cat((hn[-2,:,:], hn[-1,:,:]), dim = 1)  # Concatenate the hidden states from both directions
        return self.fc(hn)

# Hyperparameters
embed_dim = 128
hidden_dim = 128
num_class = 2  # Positive or Negative
num_layers = 2
epochs = 5
vocab_size = len(vocab)

# Initialize model, loss, and optimizer
model = BiLSTMNetwork(vocab_size, embed_dim, hidden_dim, num_class, num_layers)

# Check if a GPU is available, and if so, use it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the GPU
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 5. Training Loop

In [13]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

# Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in test_loader:
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        output = model(text)
        _, predicted = torch.max(output, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')

Epoch 1, Loss: 1.4335125255976536e-07
Epoch 2, Loss: 1.2175791042976963e-07
Epoch 3, Loss: 1.193331407295966e-07
Epoch 4, Loss: 1.1922357376767098e-07
Epoch 5, Loss: 1.0290280110140153e-07
Accuracy: 100.0%
