In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Dummy dataset (same as before)
class DummyDataset(torch.utils.data.Dataset):
    def __init__(self, num_samples=1000, input_size=20):
        self.num_samples = num_samples
        self.input_size = input_size
        self.data = torch.randn(num_samples, input_size)
        self.labels = torch.randint(0, 2, (num_samples,))

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Split neural network into two halves (simple example)
class ModelPart1(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(ModelPart1, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)

    def forward(self, x):
        return self.fc1(x)

class ModelPart2(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(ModelPart2, self).__init__()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        return self.fc2(x)

# Main pipeline training function
def train_pipeline(model_part1, model_part2, dataloader, criterion, optimizer, device, microbatch_size=16):
    model_part1.train()
    model_part2.train()

    for batch_data, batch_labels in dataloader:
        # Split the batch into microbatches
        batch_size = batch_data.size(0)
        microbatch_outputs = []  # Initialize as an empty list
        losses = []

        # Forward pass (overlap microbatches)
        for i in range(0, batch_size, microbatch_size):
            # Extract microbatch
            microbatch_data = batch_data[i:i + microbatch_size].to(device)
            microbatch_labels = batch_labels[i:i + microbatch_size].to(device)

            # Forward pass through part 1
            microbatch_output_part1 = model_part1(microbatch_data)

            # Store the result for part 2 to process later (pipelining)
            microbatch_outputs.append((microbatch_output_part1, microbatch_labels))  # Use append instead

            # If possible, process the previous microbatch through part 2
            if len(microbatch_outputs) > 1:
                prev_output, prev_labels = microbatch_outputs[-2]  # Get the second last microbatch
                microbatch_output = model_part2(prev_output)

                # Compute loss and store for backprop
                loss = criterion(microbatch_output, prev_labels)
                losses.append(loss)

        # Process the last microbatch through part 2
        last_output, last_labels = microbatch_outputs[-1]
        last_microbatch_output = model_part2(last_output)
        loss = criterion(last_microbatch_output, last_labels)
        losses.append(loss)

        # Backward pass (all microbatches)
        optimizer.zero_grad()
        total_loss = sum(losses) / len(losses)  # Averaging over all losses
        total_loss.backward()
        optimizer.step()

        print(f"Loss: {total_loss.item()}")

# Device (single device, can extend to multi-device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Parameters
input_size = 20
hidden_size = 50
output_size = 2
batch_size = 64
learning_rate = 0.01
epochs = 10

# Initialize model parts
model_part1 = ModelPart1(input_size, hidden_size).to(device)
model_part2 = ModelPart2(hidden_size, output_size).to(device)

# Dummy dataset and dataloader
dataset = DummyDataset()
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(model_part1.parameters()) + list(model_part2.parameters()), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_pipeline(model_part1, model_part2, dataloader, criterion, optimizer, device)


Epoch 1/10
Loss: 0.7512070536613464
Loss: 0.7222421765327454
Loss: 0.7068084478378296
Loss: 0.7036945819854736
Loss: 0.7314670085906982
Loss: 0.7320850491523743
Loss: 0.6530672907829285
Loss: 0.7487683296203613
Loss: 0.6875000596046448
Loss: 0.7501189708709717
Loss: 0.6881973743438721
Loss: 0.6828348636627197
Loss: 0.6625186204910278
Loss: 0.6916803121566772
Loss: 0.7161464095115662
Loss: 0.7617695927619934
Epoch 2/10
Loss: 0.6731727123260498
Loss: 0.7004449963569641
Loss: 0.6652618646621704
Loss: 0.6852754354476929
Loss: 0.7097380757331848
Loss: 0.6889181733131409
Loss: 0.702801525592804
Loss: 0.721937894821167
Loss: 0.6930797100067139
Loss: 0.7155757546424866
Loss: 0.7250710129737854
Loss: 0.6887955665588379
Loss: 0.7080824971199036
Loss: 0.7083451747894287
Loss: 0.7018637657165527
Loss: 0.7226720452308655
Epoch 3/10
Loss: 0.691604495048523
Loss: 0.7033839821815491
Loss: 0.6787015199661255
Loss: 0.7099925875663757
Loss: 0.7060236930847168
Loss: 0.6907598376274109
Loss: 0.733641386032

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

# Load dataset
dataset = load_dataset('imdb')

# Load TinyBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Tokenize and preprocess dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split into train and test sets
train_dataset = encoded_dataset['train']
test_dataset = encoded_dataset['test']

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

# Split TinyBERT into two parts
class ModelPart1(nn.Module):
    def __init__(self, original_model):
        super(ModelPart1, self).__init__()
        self.embeddings = original_model.bert.embeddings
        self.encoder_part1 = nn.ModuleList(original_model.bert.encoder.layer[:2])

    def forward(self, input_ids, attention_mask):
        if attention_mask.dtype != torch.float32:
            attention_mask = attention_mask.float()
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_length]
        x = self.embeddings(input_ids)
        for layer in self.encoder_part1:
            x = layer(x, attention_mask=attention_mask)[0]
        return x, attention_mask

class ModelPart2(nn.Module):
    def __init__(self, original_model):
        super(ModelPart2, self).__init__()
        self.encoder_part2 = nn.ModuleList(original_model.bert.encoder.layer[2:])
        self.classifier = original_model.classifier

    def forward(self, x, attention_mask):
        for layer in self.encoder_part2:
            x = layer(x, attention_mask=attention_mask)[0]
        return self.classifier(x[:, 0, :])

# Instantiate the split models
model_part1 = ModelPart1(model)
model_part2 = ModelPart2(model)

# Move models to the device (assuming single device for now)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_part1.to(device)
model_part2.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(list(model_part1.parameters()) + list(model_part2.parameters()), lr=5e-5)

# Training function with pipeline parallelism
def train_pipeline(model_part1, model_part2, dataloader, criterion, optimizer, device, microbatch_size=4):
    model_part1.train()
    model_part2.train()

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Process microbatches
        for i in range(0, input_ids.size(0), microbatch_size):
            input_ids_micro = input_ids[i:i + microbatch_size]
            attention_mask_micro = attention_mask[i:i + microbatch_size]
            labels_micro = labels[i:i + microbatch_size]

            # Forward pass through the first part of the model
            x_part1, attention_mask_part1 = model_part1(input_ids_micro, attention_mask_micro)

            # Forward pass through the second part of the model
            outputs = model_part2(x_part1, attention_mask_part1)

            loss = criterion(outputs, labels_micro)
            loss.backward()

        optimizer.step()

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_pipeline(model_part1, model_part2, train_dataloader, criterion, optimizer, device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [4]:
from sklearn.metrics import accuracy_score
import torch

# Define evaluation function
def evaluate_pipeline(model_part1, model_part2, dataloader, device):
    model_part1.eval()
    model_part2.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass through model parts
            x_part1, attention_mask_part1 = model_part1(input_ids, attention_mask)
            outputs = model_part2(x_part1, attention_mask_part1)

            # Get predictions
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy

# Create test dataloader
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# Evaluate the model
accuracy = evaluate_pipeline(model_part1, model_part2, test_dataloader, device)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 84.08%


Overlapping microbatch

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

# Load dataset
dataset = load_dataset('imdb')

# Load TinyBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Tokenize and preprocess dataset
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split into train and test sets
train_dataset = encoded_dataset['train']
test_dataset = encoded_dataset['test']

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# Split TinyBERT into two parts
class ModelPart1(nn.Module):
    def __init__(self, original_model):
        super(ModelPart1, self).__init__()
        self.embeddings = original_model.bert.embeddings
        self.encoder_part1 = nn.ModuleList(original_model.bert.encoder.layer[:2])

    def forward(self, input_ids, attention_mask):
        if attention_mask.dtype != torch.float32:
            attention_mask = attention_mask.float()
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_length]
        x = self.embeddings(input_ids)
        for layer in self.encoder_part1:
            x = layer(x, attention_mask=attention_mask)[0]
        return x, attention_mask

class ModelPart2(nn.Module):
    def __init__(self, original_model):
        super(ModelPart2, self).__init__()
        self.encoder_part2 = nn.ModuleList(original_model.bert.encoder.layer[2:])
        self.classifier = original_model.classifier

    def forward(self, x, attention_mask):
        for layer in self.encoder_part2:
            x = layer(x, attention_mask=attention_mask)[0]
        return self.classifier(x[:, 0, :])

# Instantiate the split models
model_part1 = ModelPart1(model)
model_part2 = ModelPart2(model)

# Move models to the device (assuming single device for now)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_part1.to(device)
model_part2.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(list(model_part1.parameters()) + list(model_part2.parameters()), lr=5e-5)

# Training function with pipeline parallelism
def train_pipeline(model_part1, model_part2, dataloader, criterion, optimizer, device, microbatch_size=4):
    model_part1.train()
    model_part2.train()

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Store intermediate outputs and labels for pipeline parallelism
        microbatch_outputs = [None] * (input_ids.size(0) // microbatch_size)
        microbatch_losses = [None] * (input_ids.size(0) // microbatch_size)

        # Forward pass through model part 1 (with overlap)
        for i in range(0, input_ids.size(0), microbatch_size):
            input_ids_micro = input_ids[i:i + microbatch_size]
            attention_mask_micro = attention_mask[i:i + microbatch_size]
            labels_micro = labels[i:i + microbatch_size]

            # Forward pass through the first part of the model
            x_part1, attention_mask_part1 = model_part1(input_ids_micro, attention_mask_micro)

            # Store the result for part 2 to process later (pipelining)
            microbatch_outputs[i // microbatch_size] = (x_part1, attention_mask_part1, labels_micro)

            # If possible, process the previous microbatch through part 2
            if i // microbatch_size > 0:
                previous_output = microbatch_outputs[(i // microbatch_size) - 1]
                x_part2 = previous_output[0]
                attention_mask_part2 = previous_output[1]
                labels_part2 = previous_output[2]

                # Forward pass through the second part of the model
                outputs = model_part2(x_part2, attention_mask_part2)

                loss = criterion(outputs, labels_part2)
                microbatch_losses[(i // microbatch_size) - 1] = loss

                # Backpropagate and update weights after processing the last microbatch of this stage
                loss.backward()

        # Process the last microbatch through part 2
        last_output = microbatch_outputs[-1]
        outputs = model_part2(last_output[0], last_output[1])
        loss = criterion(outputs, last_output[2])
        loss.backward()

        optimizer.step()

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_pipeline(model_part1, model_part2, train_dataloader, criterion, optimizer, device)

# Evaluation function with accuracy calculation
def evaluate_pipeline(model_part1, model_part2, dataloader, device):
    model_part1.eval()
    model_part2.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass through part 1
            x_part1, attention_mask_part1 = model_part1(input_ids, attention_mask)

            # Forward pass through part 2
            outputs = model_part2(x_part1, attention_mask_part1)

            # Get predictions
            _, predicted = torch.max(outputs, dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

# Evaluate the model
accuracy = evaluate_pipeline(model_part1, model_part2, test_dataloader, device)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Accuracy: 83.38%
