<a href="https://colab.research.google.com/github/paryagsahni1845/deeplearning/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
!pip install datasets --quiet

import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


In [75]:
# Seed PyTorch's global RNG before any model or DataLoader is created so that
# weight initialisation and batch shuffling are repeatable on a fresh kernel.
# (Some GPU kernels may still introduce small run-to-run differences.)
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [76]:
# Load the IMDB dataset through `datasets.load_dataset` and print the summary
# of the splits it returns.
dataset = load_dataset(path="imdb")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [77]:
def tokenize(text):
    """Split a raw review into lowercase alphanumeric word tokens.

    The text is lowercased, HTML line-break tags are replaced by a space,
    every character other than ``a-z``, ``0-9`` or whitespace is deleted, and
    the result is split on whitespace.

    Args:
        text: Raw review string.

    Returns:
        list[str]: Tokens in order of appearance; an empty list when no
        alphanumeric characters remain.
    """
    text = text.lower()
    # Match every spelling of the line-break tag (<br>, <br/>, <br />). A tag
    # that is not replaced here loses its angle brackets in the next step and
    # leaves "br" glued to the neighbouring words.
    text = re.sub(r"<br\s*/?>", " ", text)
    # Punctuation is deleted, not replaced by a space: "don't" -> "dont".
    text = re.sub(r"[^a-z0-9\s]", "", text)
    return text.split()

# Build the vocabulary from the training split only, keeping the
# MAX_VOCAB_SIZE most frequent tokens.
MAX_VOCAB_SIZE = 30000

# Count tokens one review at a time rather than first collecting every token
# of the corpus in a single list; the resulting counts are the same.
vocab_count = Counter()
for sample in dataset['train']:
    vocab_count.update(tokenize(sample['text']))

# Id 0 is reserved for the padding token, so word ids start at 1 with the
# most frequent word first.
vocab = {'<PAD>': 0}
vocab.update(
    (word, idx)
    for idx, (word, _) in enumerate(vocab_count.most_common(MAX_VOCAB_SIZE), start=1)
)
vocab_size = len(vocab)


In [80]:
from torch.utils.data import Dataset

class IMDBDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Each review is tokenised once at construction time; token ids are looked
    up in ``vocab`` whenever an item is accessed.

    Args:
        data: Iterable of records exposing ``'text'`` and ``'label'`` keys.
            It is consumed in a single pass, so a one-shot iterator works.
        vocab: Mapping from token string to integer id. Tokens missing from
            it are encoded as 0, the same id used for padding.
        max_len: Optional cap on the number of tokens kept per review (the
            leading tokens are kept). ``None`` keeps every token.
    """

    def __init__(self, data, vocab, max_len=None):
        self.vocab = vocab
        self.max_len = max_len
        self.texts = []
        self.labels = []
        # Read text and label together so every record is visited only once.
        for sample in data:
            self.texts.append(tokenize(sample['text']))
            self.labels.append(sample['label'])

    def __len__(self):
        """Return the number of reviews."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for review ``idx`` as ``torch.long`` tensors."""
        tokens = self.texts[idx]
        if self.max_len is not None:
            tokens = tokens[:self.max_len]
        seq = [self.vocab.get(token, 0) for token in tokens]
        if not seq:
            # A review with no tokens would give a zero-length sequence, which
            # pack_padded_sequence refuses; emit a single padding id instead.
            seq = [0]
        label = self.labels[idx]
        return torch.tensor(seq, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_fn(batch):
    """Merge ``(sequence, label)`` pairs into padded batch tensors.

    Args:
        batch: List of ``(seq, label)`` tuples; ``seq`` is a 1-D tensor of
            token ids and ``label`` a scalar tensor.

    Returns:
        tuple: ``(padded_seqs, labels, lengths)`` where ``padded_seqs`` holds
        the sequences right-padded with 0 in batch-first layout, ``labels``
        is the stacked label tensor, and ``lengths`` gives each sequence's
        length before padding.
    """
    seqs, targets = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lengths = torch.tensor([len(item) for item in seqs])
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return padded, torch.stack(targets), seq_lengths

In [81]:
# Wrap the two labelled splits; IMDBDataset tokenises every review up front.
train_dataset, test_dataset = (
    IMDBDataset(dataset[split], vocab) for split in ('train', 'test')
)

# Both loaders use the same batch size and collate function; only the
# training loader shuffles.
loader_kwargs = dict(batch_size=64, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)


In [82]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embed_dim=200, hidden_dim=256, num_classes=2):
        super().__init__()
        # Id 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x, lengths):
        """Return one row of class logits per input sequence.

        Args:
            x: Batch-first tensor of padded token ids.
            lengths: Tensor with the unpadded length of each sequence.
        """
        embedded = self.embedding(x)
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not computed over padding positions.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        final_hidden = self.lstm(packed_input)[1][0]
        # The last entry of the hidden-state stack feeds the classifier head.
        return self.fc(final_hidden[-1])

# Build the classifier with its default layer sizes and move its parameters
# to the selected device.
model = LSTMClassifier(vocab_size).to(device)


In [83]:
# Number of full passes over the training set.
NUM_EPOCHS = 5

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for xb, yb, lengths in train_loader:
        # Only inputs and labels go to the device. `lengths` stays on the CPU
        # because the model hands it to pack_padded_sequence as a CPU tensor;
        # sending it to the GPU would only force a copy straight back.
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        outputs = model(xb, lengths)
        loss = criterion(outputs, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 0.6191
Epoch 2, Loss: 0.4983
Epoch 3, Loss: 0.2753
Epoch 4, Loss: 0.1614
Epoch 5, Loss: 0.0787


In [84]:
# Measure classification accuracy on the held-out test split.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for xb, yb, lengths in test_loader:
        # `lengths` stays on the CPU: the model passes it to
        # pack_padded_sequence as a CPU tensor, so moving it to the device
        # would only be undone again.
        xb, yb = xb.to(device), yb.to(device)
        outputs = model(xb, lengths)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += yb.size(0)
        correct += (predicted == yb).sum().item()

print("Test Accuracy:", correct / total)


Test Accuracy: 0.87772
