In [2]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
import torch.nn.utils.prune as prune

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the IMDb dataset
dataset = load_dataset("imdb")
test_dataset = dataset['test']  # Use only the test dataset

# Load TinyBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = BertForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels=2)
model.to(device)

# Define the preprocessing function
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

# Preprocess the test dataset
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Take a smaller sample of the test dataset (e.g., 500 samples)
sample_size = 500
encoded_test_sample = encoded_test_dataset.select(range(sample_size))

# Apply L1 Pruning Function
def apply_pruning(model, amount=0.2):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.ln_structured(module, name="weight", amount=amount, n=2, dim=0)

# Define the Particle class for PSO
class Particle:
    def __init__(self, model):
        self.position = [param.data.clone() for param in model.parameters()]
        self.velocity = [torch.randn_like(param) * 0.01 for param in model.parameters()]
        self.best_position = [param.data.clone() for param in model.parameters()]
        self.best_score = float('-inf')

# Function to evaluate model accuracy using batching
def evaluate_model(model, dataset, batch_size=16):
    model.eval()
    total_correct = 0
    total_samples = 0

    # Create a DataLoader for batching
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    with torch.no_grad():
        for batch in dataloader:
            inputs = tokenizer(batch['text'], return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
            labels = torch.tensor(batch['label']).to(device)

            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    return accuracy

# PSO Implementation
def particle_swarm_optimization(model, num_particles=10, num_iterations=5):
    particles = [Particle(model) for _ in range(num_particles)]
    g_best_position = None
    g_best_score = float('-inf')

    for iteration in range(num_iterations):
        for particle in particles:
            # Update model parameters with the particle's position
            for param, pos in zip(model.parameters(), particle.position):
                param.data.copy_(pos)

            # Evaluate the model
            score = evaluate_model(model, encoded_test_sample)  # Use sample for evaluation

            # Update particle's best position and score
            if score > particle.best_score:
                particle.best_score = score
                particle.best_position = [param.data.clone() for param in model.parameters()]

            # Update global best
            if score > g_best_score:
                g_best_score = score
                g_best_position = [param.data.clone() for param in model.parameters()]

        # Update particle velocities and positions
        for particle in particles:
            w = 0.5  # inertia weight
            c1 = 1.5  # cognitive coefficient
            c2 = 1.5  # social coefficient

            for i in range(len(particle.position)):
                r1 = np.random.rand()
                r2 = np.random.rand()

                # Update velocity
                particle.velocity[i] = (
                    w * particle.velocity[i] +
                    c1 * r1 * (particle.best_position[i] - particle.position[i]) +
                    c2 * r2 * (g_best_position[i] - particle.position[i])
                )
                # Update position
                particle.position[i] += particle.velocity[i]

    # Update model with global best position
    for param, g_best in zip(model.parameters(), g_best_position):
        param.data.copy_(g_best)

    return g_best_score

# Apply L1 pruning before PSO
apply_pruning(model)

# Run PSO to optimize the model
best_accuracy = particle_swarm_optimization(model, num_particles=10, num_iterations=5)

# Evaluate the model on the smaller IMDb test sample
final_accuracy = evaluate_model(model, encoded_test_sample)

# Print the best accuracy after optimization
print(f"Best accuracy of the optimized model after PSO: {best_accuracy:.4f}")
print(f"Final accuracy on the IMDb test sample: {final_accuracy:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  labels = torch.tensor(batch['label']).to(device)
  labels = torch.tensor(batch['label']).to(device)


Best accuracy of the optimized model after PSO: 1.0000
Final accuracy on the IMDb test sample: 1.0000


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
# ... (your existing code for training and evaluation)

# Save the model and tokenizer
save_directory = "./optimized_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


Model and tokenizer saved to ./optimized_model


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

# Load dataset
dataset = load_dataset('imdb')

# Load the saved TinyBERT model and tokenizer
save_directory = "./optimized_model"  # Use the path where you saved the model and tokenizer
model = BertForSequenceClassification.from_pretrained(save_directory, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(save_directory)

# Tokenize and preprocess dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split into train and test sets
train_dataset = encoded_dataset['train']
test_dataset = encoded_dataset['test']

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# Split TinyBERT into two parts
class ModelPart1(nn.Module):
    def __init__(self, original_model):
        super(ModelPart1, self).__init__()
        self.embeddings = original_model.bert.embeddings
        self.encoder_part1 = nn.ModuleList(original_model.bert.encoder.layer[:2])

    def forward(self, input_ids, attention_mask):
        if attention_mask.dtype != torch.float32:
            attention_mask = attention_mask.float()
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_length]
        x = self.embeddings(input_ids)
        for layer in self.encoder_part1:
            x = layer(x, attention_mask=attention_mask)[0]
        return x, attention_mask

class ModelPart2(nn.Module):
    def __init__(self, original_model):
        super(ModelPart2, self).__init__()
        self.encoder_part2 = nn.ModuleList(original_model.bert.encoder.layer[2:])
        self.classifier = original_model.classifier

    def forward(self, x, attention_mask):
        for layer in self.encoder_part2:
            x = layer(x, attention_mask=attention_mask)[0]
        return self.classifier(x[:, 0, :])

# Instantiate the split models
model_part1 = ModelPart1(model)
model_part2 = ModelPart2(model)

# Move models to the device (assuming single device for now)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_part1.to(device)
model_part2.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(list(model_part1.parameters()) + list(model_part2.parameters()), lr=5e-5)

# Training function with pipeline parallelism
def train_pipeline(model_part1, model_part2, dataloader, criterion, optimizer, device, microbatch_size=4):
    model_part1.train()
    model_part2.train()

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Store intermediate outputs and labels for pipeline parallelism
        microbatch_outputs = [None] * (input_ids.size(0) // microbatch_size)
        microbatch_losses = [None] * (input_ids.size(0) // microbatch_size)

        # Forward pass through model part 1 (with overlap)
        for i in range(0, input_ids.size(0), microbatch_size):
            input_ids_micro = input_ids[i:i + microbatch_size]
            attention_mask_micro = attention_mask[i:i + microbatch_size]
            labels_micro = labels[i:i + microbatch_size]

            # Forward pass through the first part of the model
            x_part1, attention_mask_part1 = model_part1(input_ids_micro, attention_mask_micro)

            # Store the result for part 2 to process later (pipelining)
            microbatch_outputs[i // microbatch_size] = (x_part1, attention_mask_part1, labels_micro)

            # If possible, process the previous microbatch through part 2
            if i // microbatch_size > 0:
                previous_output = microbatch_outputs[(i // microbatch_size) - 1]
                x_part2 = previous_output[0]
                attention_mask_part2 = previous_output[1]
                labels_part2 = previous_output[2]

                # Forward pass through the second part of the model
                outputs = model_part2(x_part2, attention_mask_part2)

                loss = criterion(outputs, labels_part2)
                microbatch_losses[(i // microbatch_size) - 1] = loss

                # Backpropagate and update weights after processing the last microbatch of this stage
                loss.backward()

        # Process the last microbatch through part 2
        last_output = microbatch_outputs[-1]
        outputs = model_part2(last_output[0], last_output[1])
        loss = criterion(outputs, last_output[2])
        loss.backward()

        optimizer.step()

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_pipeline(model_part1, model_part2, train_dataloader, criterion, optimizer, device)

# Evaluation function with accuracy calculation
def evaluate_pipeline(model_part1, model_part2, dataloader, device):
    model_part1.eval()
    model_part2.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass through part 1
            x_part1, attention_mask_part1 = model_part1(input_ids, attention_mask)

            # Forward pass through part 2
            outputs = model_part2(x_part1, attention_mask_part1)

            # Get predictions
            _, predicted = torch.max(outputs, dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

# Evaluate the model
accuracy = evaluate_pipeline(model_part1, model_part2, test_dataloader, device)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Some weights of the model checkpoint at ./optimized_model were not used when initializing BertForSequenceClassification: ['bert.encoder.layer.0.attention.output.dense.weight_mask', 'bert.encoder.layer.0.attention.output.dense.weight_orig', 'bert.encoder.layer.0.attention.self.key.weight_mask', 'bert.encoder.layer.0.attention.self.key.weight_orig', 'bert.encoder.layer.0.attention.self.query.weight_mask', 'bert.encoder.layer.0.attention.self.query.weight_orig', 'bert.encoder.layer.0.attention.self.value.weight_mask', 'bert.encoder.layer.0.attention.self.value.weight_orig', 'bert.encoder.layer.0.intermediate.dense.weight_mask', 'bert.encoder.layer.0.intermediate.dense.weight_orig', 'bert.encoder.layer.0.output.dense.weight_mask', 'bert.encoder.layer.0.output.dense.weight_orig', 'bert.encoder.layer.1.attention.output.dense.weight_mask', 'bert.encoder.layer.1.attention.output.dense.weight_orig', 'bert.encoder.layer.1.attention.self.key.weight_mask', 'bert.encoder.layer.1.attention.self.key.

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Accuracy: 78.88%
