<a href="https://colab.research.google.com/github/peremartra/LLMOptCost/blob/main/11/13_KnowledgeDistillationMeta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference paper: [arXiv:2407.14679](https://arxiv.org/abs/2407.14679)

In [1]:
!pip install -q datasets

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup
import random
import numpy as np

In [3]:
# Function to set the random seed for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch (CPU generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmark mode switched off.

    Args:
        seed: Seed value shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False      # Disables benchmark mode to ensure reproducibility
    cudnn.deterministic = True   # Ensures deterministic behavior for CuDNN

# Seed all RNGs up front so model loading, batch shuffling and training start from a fixed state
set_seed(42)  # You can use any seed number you'd like

In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Define teacher and student models (Hugging Face Hub checkpoint ids)
teacher_model_name = "bigscience/bloomz-560m"
#student_model_name = "oopere/bloomz-560m-pruned"

# Try double distillation: start from a student checkpoint whose name suggests it was
# already distilled once (presumably on AG News — confirm against the model card)
student_model_name = "oopere/bloomz-560m-pruned-kdi-agnews"


In [6]:
# Load both causal-LM checkpoints, then place each one on the selected device
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model_name)
teacher_model = teacher_model.to(device)
student_model = AutoModelForCausalLM.from_pretrained(student_model_name)
student_model = student_model.to(device)


config.json:   0%|          | 0.00/810 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [7]:
# The teacher's tokenizer encodes every prompt; the same token ids are fed to both models
# NOTE(review): assumes the student checkpoint shares the teacher's vocabulary — confirm
tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)

In [8]:
# Datasets
# First 1% of the AG News training split; in the cells shown it is only printed, not used for training
dataset = load_dataset('ag_news', split='train[:1%]')
# Prompt/Topic dataset; this is the one handed to the DataLoader for distillation
dataset2 = load_dataset('oopere/knowledge_transfer_1500_base', split='train')


In [9]:
# Show the columns and row count of the AG News slice
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 1200
})


In [10]:
# Show the columns and row count of the prompt dataset used for distillation
print(dataset2)

Dataset({
    features: ['Prompt', 'Topic'],
    num_rows: 1267
})


In [11]:
# Define a collate function to handle the dynamic batching of different length sequences
def collate_fn(batch):
    """Tokenize a list of dataset rows into one padded tensor of token ids.

    Args:
        batch: Sequence of rows, each exposing its text under the 'Prompt' key.

    Returns:
        The tokenizer's ``input_ids`` tensor (padding enabled, truncation at
        512 tokens), moved to ``device``. Only the ids are returned; the rest
        of the tokenizer output, including any attention mask, is not passed on.
    """
    prompts = []
    for row in batch:
        prompts.append(row['Prompt'])

    encoded = tokenizer(
        prompts,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    token_ids = encoded.input_ids
    return token_ids.to(device)

In [12]:
batch_size = 8  # Adjust this number based on your memory usage; you can likely go higher
# Shuffle the prompt dataset; collate_fn tokenizes and pads each batch as it is drawn
dataloader = DataLoader(dataset2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [13]:
# Define number of epochs (full passes over the dataloader); the cosine LR
# scheduler below also uses this value to size its T_max horizon
num_epochs = 20  # Adjust based on available resources

In [14]:
# Prepare optimizer and scheduler
optimizer = optim.AdamW(student_model.parameters(), lr=5e-5)

# Total number of optimizer steps for the whole run. Derived from num_epochs
# (instead of a hard-coded 3 epochs) so it cannot drift from the real training
# length, and shared by whichever scheduler is active.
num_training_steps = len(dataloader) * num_epochs
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=num_training_steps)

# Cosine decay from the initial LR down to eta_min across all training steps
scheduler = CosineAnnealingLR(optimizer, T_max=num_training_steps, eta_min=1e-6)  # Adjust T_max and eta_min as needed

# Loss function: KL divergence; expects log-probabilities as input and probabilities as target
criterion = nn.KLDivLoss(reduction='batchmean')

In [15]:
# Enable mixed precision
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [16]:
def distill(teacher_model, student_model, dataloader, optimizer, criterion, scheduler, scaler, temperature=1.5, num_epochs=3):
    """Train ``student_model`` to match ``teacher_model``'s softened output distribution.

    For every batch both models are run on the same inputs, ``criterion`` is
    applied to the student's temperature-scaled log-probabilities and the
    teacher's temperature-scaled probabilities, and the student is updated
    through ``scaler`` and ``optimizer``. ``scheduler`` is stepped once per
    batch. The epoch header and the mean per-batch loss are printed for each
    epoch.

    Args:
        teacher_model: Model whose call result exposes ``.logits``; switched to
            eval mode and run without gradient tracking.
        student_model: Model with the same call interface; switched to train mode.
        dataloader: Iterable of batches. Each batch is a tensor that is passed
            positionally to both models.
        optimizer: Optimizer that updates the student.
        criterion: Loss called as ``criterion(log_probs, probs)``.
        scheduler: Learning-rate scheduler, stepped after every batch.
        scaler: Gradient scaler used for the backward pass and optimizer step.
        temperature: Divisor applied to both models' logits before the softmax.
        num_epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    teacher_model.eval()
    student_model.train()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        total_loss = 0.0
        num_batches = 0

        for batch in dataloader:
            # Mixed precision is only used for CUDA batches. torch.autocast replaces the
            # deprecated torch.cuda.amp.autocast(); on any other device it is created disabled.
            use_amp = batch.is_cuda
            amp_device = "cuda" if use_amp else "cpu"

            # Teacher forward pass: inference only, so no gradients are recorded
            with torch.no_grad(), torch.autocast(device_type=amp_device, enabled=use_amp):
                teacher_logits = teacher_model(batch).logits

            # Student forward pass and distillation loss
            with torch.autocast(device_type=amp_device, enabled=use_amp):
                student_logits = student_model(batch).logits

                # criterion receives (student log-probs, teacher probs) over the last dimension
                loss = criterion(torch.log_softmax(student_logits / temperature, dim=-1),
                                 torch.softmax(teacher_logits / temperature, dim=-1))

            # Backpropagation through the gradient scaler
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # The scheduler advances once per batch, not once per epoch
            scheduler.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Without this guard the average below would fail with ZeroDivisionError
            raise ValueError("dataloader yielded no batches; nothing to distill")

        # Print average loss for each epoch
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / num_batches}")


In [17]:
# Run distillation for num_epochs epochs; temperature is left at distill's default (1.5)
distill(teacher_model, student_model, dataloader, optimizer, criterion, scheduler, scaler, num_epochs=num_epochs)


  with torch.cuda.amp.autocast():  # Use mixed precision for teacher model inference


Epoch 1/20


Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  with torch.cuda.amp.autocast():  # Mixed precision for student model training


Epoch 1/20, Loss: 1.4607786854857918
Epoch 2/20
Epoch 2/20, Loss: 1.145354353032022
Epoch 3/20
Epoch 3/20, Loss: 0.8448880114270456
Epoch 4/20
Epoch 4/20, Loss: 0.7170617303008553
Epoch 5/20
Epoch 5/20, Loss: 0.5707877590408865
Epoch 6/20
Epoch 6/20, Loss: 0.5156397539302238
Epoch 7/20
Epoch 7/20, Loss: 0.4300348710901332
Epoch 8/20
Epoch 8/20, Loss: 0.36305395883959046
Epoch 9/20
Epoch 9/20, Loss: 0.3082926976230909
Epoch 10/20
Epoch 10/20, Loss: 0.26044117054849303
Epoch 11/20
Epoch 11/20, Loss: 0.24046941738833422
Epoch 12/20
Epoch 12/20, Loss: 0.19663956990969256
Epoch 13/20
Epoch 13/20, Loss: 0.1652324212328443
Epoch 14/20
Epoch 14/20, Loss: 0.14055221457526368
Epoch 15/20
Epoch 15/20, Loss: 0.1273653031379547
Epoch 16/20
Epoch 16/20, Loss: 0.11879745390523905
Epoch 17/20
Epoch 17/20, Loss: 0.10385363874664097
Epoch 18/20
Epoch 18/20, Loss: 0.10300650735789875
Epoch 19/20
Epoch 19/20, Loss: 0.09571418922933393
Epoch 20/20
Epoch 20/20, Loss: 0.09669767486505539


In [18]:
# Name used below as the local save directory and as the Hub repo for the distilled student.
# NOTE(review): this overwrites the checkpoint id assigned earlier, so re-running the
# model-loading cell after this one would load from this name instead of the original student
student_model_name = "bloomz-560m-pruned-kdi-both"

In [19]:
# Save the fine-tuned student model and the tokenizer into the same local directory
student_model.save_pretrained(student_model_name)
tokenizer.save_pretrained(student_model_name)

('bloomz-560m-pruned-kdi-both/tokenizer_config.json',
 'bloomz-560m-pruned-kdi-both/special_tokens_map.json',
 'bloomz-560m-pruned-kdi-both/tokenizer.json')

In [20]:
# Upload the student model to the Hub under student_model_name as a non-private repo
# NOTE(review): use_temp_dir=False presumably reuses the local directory saved above — confirm
student_model.push_to_hub(student_model_name,
                  private=False,
                  use_temp_dir=False)


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/oopere/bloomz-560m-pruned-kdi-both/commit/05355a97895e97dd05bcc607d99fef08bb3d9306', commit_message='Upload tokenizer', commit_description='', oid='05355a97895e97dd05bcc607d99fef08bb3d9306', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the same Hub repo so the published model can be loaded with its tokenizer
tokenizer.push_to_hub(student_model_name,
                      private=False,
                      use_temp_dir=False)