In [1]:
from datasets import load_dataset
dataset = load_dataset('imdb')

In [2]:
from transformers import AutoTokenizer

# Load a tokenizer (we can use a pre-trained tokenizer from BERT, GPT, etc.)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset (this step converts text into word indices)
def tokenize_data(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=256)

# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize_data, batched=True)

# Remove unnecessary columns and rename the 'label' column
tokenized_dataset = tokenized_dataset.remove_columns(['text']).rename_column('label', 'labels')
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])



In [3]:
from torch.utils.data import DataLoader

# Create DataLoader for training and testing
train_loader = DataLoader(tokenized_dataset['train'], batch_size=32, shuffle=True)
test_loader = DataLoader(tokenized_dataset['test'], batch_size=32, shuffle=False)

In [4]:
import torch
import torch.optim as optim 
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class HybridTransformerCNN(nn.Module):
    def __init__(self, vocab_size, embed_size=512, num_classes=2, num_heads=8, num_layers=2):
        super(HybridTransformerCNN, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Transformer encoder layer
        transformer_layer = TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer = TransformerEncoder(transformer_layer, num_layers=num_layers)

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 256, (3, embed_size), padding=(1, 0))  # First conv layer
        self.batch_norm1 = nn.BatchNorm2d(256)  # Batch normalization after conv1
        self.pool1 = nn.MaxPool2d((2, 1))  # Max pooling layer

        self.conv2 = nn.Conv2d(256, 512, (3, 1), padding=(1, 0))  # Second conv layer
        self.batch_norm2 = nn.BatchNorm2d(512)  # Batch normalization after conv2
        self.pool2 = nn.MaxPool2d((2, 1))  # Another pooling layer

        # Fully connected layers
        self.fc1 = nn.Linear(32768, 512)  # Adjust size based on pooling output
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        # Embedding
        x = self.embedding(x)  # (batch_size, seq_length, embed_size)

        # Transformer encoder
        x = self.transformer(x)  # Transformer expects input of shape (seq_length, batch_size, embed_size)
        x = x.unsqueeze(1)  # Add channel dimension for CNN (batch_size, 1, seq_length, embed_size)

        # Convolutional layers + normalization + pooling
        x = torch.relu(self.batch_norm1(self.conv1(x)))  # Convolution + batch norm
        x = self.pool1(x)  # Pooling after first conv

        x = torch.relu(self.batch_norm2(self.conv2(x)))  # Second convolution + batch norm
        x = self.pool2(x)  # Pooling after second conv

        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)

        # Fully connected layers
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)  # No activation here (logits)

        return x


In [10]:
# Hyperparameters
vocab_size = tokenizer.vocab_size
embed_size = 128
num_heads = 16
num_layers = 8
num_classes = 2

device = torch.device("mps")

model = HybridTransformerCNN(vocab_size, embed_size, num_classes, num_heads, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=25e-5, momentum=0.9)

# Training loop
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Move data and target to the GPU
        data = batch['input_ids'].to(device)
        target = batch['labels'].to(device)

        # Forward pass
        output = model(data)
        loss = criterion(output, target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Step {batch_idx}, Loss: {loss.item():.4f}')


Epoch 1/10, Step 0, Loss: 0.7086
Epoch 1/10, Step 100, Loss: 0.6791
Epoch 1/10, Step 200, Loss: 0.6858
Epoch 1/10, Step 300, Loss: 0.6957
Epoch 1/10, Step 400, Loss: 0.6951
Epoch 1/10, Step 500, Loss: 0.7050
Epoch 1/10, Step 600, Loss: 0.6920


KeyboardInterrupt: 

In [6]:
# Evaluate the model
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        # Move data and target to the GPU
        data = batch['input_ids'].to(device)
        target = batch['labels'].to(device)

        # Forward pass
        outputs = model(data)

        # Get the predicted class
        _, predicted = torch.max(outputs.data, 1)

        total += target.size(0)
        correct += (predicted == target).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')

Test Accuracy: 81.20%
