In [4]:
# =============================================================================
# SECTION 1: SETUP AND DEPENDENCIES
# =============================================================================
# This section installs all the necessary libraries for the script. We use
# Hugging Face's transformers for models, datasets for data handling,
# accelerate and bitsandbytes for efficient model loading and training (especially
# 4-bit quantization), and trl for its utility in training transformers.

print("Installing necessary libraries...")
# Use -q to make the installation less verbose
!pip install -q transformers datasets peft accelerate bitsandbytes trl torch
!pip install -U bitsandbytes

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from datasets import load_dataset
import torch.nn.functional as F
import torch.nn as nn
import os

print("Libraries installed and imported successfully.")


# =============================================================================
# SECTION 2: CONFIGURATION
# =============================================================================
# All hyperparameters and configuration settings are centralized here for easy
# modification. This includes model names, distillation parameters, and
# training settings.

class DistillationConfig:
    # Model identifiers
    TEACHER_MODEL_ID = "microsoft/phi-2"
    STUDENT_MODEL_OUTPUT_DIR = "./distilled_student_model"

    # Dataset parameters
    DATASET_ID = "databricks/databricks-dolly-15k"
    DATASET_SUBSET_SIZE = 3000
    MAX_TOKEN_LENGTH = 512

    # Distillation parameters
    # Alpha controls the balance between the standard cross-entropy loss (loss_ce)
    # and the KL divergence distillation loss (loss_kl).
    # loss = (1 - alpha) * loss_ce + alpha * loss_kl
    ALPHA = 0.5

    # Temperature is used to soften the probability distributions of logits
    # from both teacher and student models. A higher temperature results in a
    # softer distribution, which can help in transferring knowledge about
    # similarities between classes.
    TEMPERATURE = 2.0

    # Training parameters
    NUM_TRAIN_EPOCHS = 1
    BATCH_SIZE = 4 # Keep low for Colab T4 GPU
    LEARNING_RATE = 5e-5
    OUTPUT_DIR = "./training_output"

config = DistillationConfig()


# =============================================================================
# SECTION 3: LOAD MODELS AND TOKENIZER
# =============================================================================
# Here, we load the teacher model with 4-bit quantization to reduce its memory
# footprint, making it feasible to run on a Colab T4 GPU. We then define the
# architecture for our student model from scratch using GPT2Config, ensuring
# it's significantly smaller than the teacher.

print("Loading tokenizer and teacher model...")

# Load the tokenizer from the teacher model. This ensures the student and
# teacher share the same vocabulary and token mappings.
tokenizer = AutoTokenizer.from_pretrained(config.TEACHER_MODEL_ID, trust_remote_code=True)
# Set a padding token if it doesn't exist. GPT-2 style models often don't have one by default.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define the BitsAndBytesConfig for 4-bit quantization. This is the modern
# approach, replacing the deprecated `load_in_4bit` argument.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the teacher model (microsoft/phi-2).
# We now pass the bnb_config to the `quantization_config` argument.
teacher_model = AutoModelForCausalLM.from_pretrained(
    config.TEACHER_MODEL_ID,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto" # Automatically maps model layers to available devices
)
# Set the teacher model to evaluation mode. This disables layers like dropout
# that are only active during training.
teacher_model.eval()
print("Teacher model loaded successfully in 4-bit.")

# Define the configuration for our student model. We make it much smaller
# than the teacher to create a lightweight, fast model.
# The `vocab_size` must match the tokenizer's vocabulary size.
# We use len(tokenizer) instead of tokenizer.vocab_size. For some models,
# vocab_size doesn't account for added special tokens, while len(tokenizer) does.
# This mismatch can lead to index-out-of-bounds errors during training.
student_config = GPT2Config(
    vocab_size=len(tokenizer),
    n_layer=6,            # Teacher (Phi-2) has 32 layers
    n_head=12,            # Teacher has 32 heads
    n_embd=768,           # Teacher has 2560 embedding dimension
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id, # <-- CRITICAL FIX: Ensure model config knows the pad token
)

# Create the student model from the defined configuration.
student_model = GPT2LMHeadModel(student_config)
# Move student model to the GPU
student_model = student_model.to(teacher_model.device)

print(f"Student model created with {student_model.num_parameters():,} parameters.")
print(f"Teacher model has {teacher_model.num_parameters():,} parameters (in 4-bit).")


# =============================================================================
# SECTION 4: PREPARE DATASET
# =============================================================================
# We load the Dolly dataset, select a small subset for quick demonstration,
# and format it into a consistent instruction-response template. This formatted
# text is then tokenized.

print("Preparing the dataset...")

# Load the dataset from Hugging Face Hub
dataset = load_dataset(config.DATASET_ID, split='train')

# For demonstration, we'll use a smaller subset of the data.
dataset = dataset.select(range(config.DATASET_SUBSET_SIZE))

# Define the tokenization function.
# This function takes a batch of examples, formats them into a prompt template,
# and then tokenizes the result.
def tokenize_function(examples):
    # The 'examples' object is a dictionary where keys are column names (e.g., 'instruction')
    # and values are lists of the data for that column.
    # We iterate through the examples and format them.
    formatted_texts = []
    for i in range(len(examples["instruction"])):
        text = f"Instruction:\n{examples['instruction'][i]}\n\nResponse:\n{examples['response'][i]}"
        formatted_texts.append(text)

    # Tokenize the formatted texts.
    # padding="max_length" ensures all sequences have the same length, which is
    # necessary for batching.
    # truncation=True ensures that sequences longer than max_length are cut.
    return tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=config.MAX_TOKEN_LENGTH,
    )

# Apply the tokenization to the dataset. We use batched=True for efficiency.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names # Remove old text columns
)

# The Trainer in Hugging Face for Causal LM automatically creates the 'labels'
# by shifting the 'input_ids'. So, we just need to provide 'input_ids'.
print("Dataset prepared and tokenized.")


# =============================================================================
# SECTION 5: CUSTOM DISTILLATION TRAINER
# =============================================================================
# This is the core of the knowledge distillation process. We create a custom
# Trainer class that overrides the `compute_loss` method. The new loss function
# combines the standard language modeling loss with a KL divergence loss that
# aligns the student's output distribution with the teacher's.

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        # Use reduction='none' to get per-token loss, which we will then average manually.
        # This gives us more control and avoids potential batching issues.
        self.loss_fct = nn.KLDivLoss(reduction="none")

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # The Trainer automatically prepares the 'labels' argument for us.
        # 1. Get student's outputs. The model returns the standard cross-entropy loss
        # when `labels` are provided.
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        student_logits = outputs_student.logits

        # 2. Get teacher's outputs
        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            teacher_logits = outputs_teacher.logits

        # 3. Align vocabulary sizes for KL divergence
        student_vocab_size = student_logits.size(-1)
        teacher_vocab_size = teacher_logits.size(-1)

        if student_vocab_size != teacher_vocab_size:
            padding_size = student_vocab_size - teacher_vocab_size
            teacher_logits = F.pad(teacher_logits, (0, padding_size), "constant", -1e9)

        # 4. Calculate the distillation loss (KL Divergence)
        # We only calculate the loss on non-padded tokens.
        # The attention_mask is 1 for real tokens and 0 for padding.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            # Reshape mask for broadcasting over the logits vocabulary dimension
            mask = attention_mask.unsqueeze(-1).expand_as(student_logits)
        else:
            mask = torch.ones_like(student_logits)

        # Soften probabilities for KL divergence
        soft_student_logits = F.log_softmax(student_logits / config.TEMPERATURE, dim=-1)
        soft_teacher_logits = F.softmax(teacher_logits / config.TEMPERATURE, dim=-1)

        # Calculate per-token KL loss
        kl_loss_per_token = self.loss_fct(soft_student_logits, soft_teacher_logits)

        # Apply the mask to ignore padding tokens and sum the loss
        masked_kl_loss = (kl_loss_per_token * mask).sum()
        num_active_tokens = mask.sum()

        # Normalize the loss by the number of active (non-padded) tokens
        loss_kl = masked_kl_loss / num_active_tokens

        # Scale the loss by temperature squared, a common practice in distillation
        loss_kl = loss_kl * (config.TEMPERATURE ** 2)

        # 5. Combine the losses
        loss = (1 - config.ALPHA) * student_loss + config.ALPHA * loss_kl

        return (loss, outputs_student) if return_outputs else loss


# =============================================================================
# SECTION 6: TRAINING
# =============================================================================
# With all components ready, we configure the training arguments and instantiate
# our custom `DistillationTrainer`. Then, we start the training process.

print("Starting training...")

# Define the training arguments.
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    learning_rate=config.LEARNING_RATE,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    report_to="none", # Disable reporting to services like wandb
    fp16=True, # Use mixed-precision training for speed and memory efficiency
)

# Instantiate the custom trainer
distiller = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Start training
distiller.train()

print("Training finished.")


# =============================================================================
# SECTION 7: EVALUATION AND INFERENCE
# =============================================================================
# After training, we save the student model and demonstrate its capabilities
# with a simple inference function on a few different prompts.

print("Saving the final student model...")

# Save the student model and tokenizer to the specified directory
# It's good practice to save the unwrapped model if using acceleration/distribution
unwrapped_model = distiller.model.module if hasattr(distiller.model, 'module') else distiller.model
unwrapped_model.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)

print(f"Model saved to {config.STUDENT_MODEL_OUTPUT_DIR}")


# --- Inference Function ---
def generate_response(prompt, model, tokenizer, max_length=150):
    """
    Generates a response from the model given a prompt.
    """
    # Format the prompt for the model
    formatted_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"

    # Tokenize the input and move to the correct device
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate output
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode and return the response text
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Only return the generated part
    return response_text.split("Response:\n")[1].strip()

# --- Test Prompts ---
print("\n--- Testing the distilled model ---")

# Load the saved model for inference to ensure we're testing the final artifact
final_model = AutoModelForCausalLM.from_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
final_model.to(teacher_model.device) # Move to GPU

# Test Case 1: Question Answering
prompt1 = "What is knowledge distillation in the context of deep learning?"
print(f"\nPrompt 1: {prompt1}")
response1 = generate_response(prompt1, final_model, tokenizer)
print(f"Generated Response:\n{response1}")

# Test Case 2: Poetry Generation
prompt2 = "Write a short, four-line poem about the stars."
print(f"\nPrompt 2: {prompt2}")
response2 = generate_response(prompt2, final_model, tokenizer)
print(f"Generated Response:\n{response2}")

# Test Case 3: Brainstorming
prompt3 = "Brainstorm three potential names for a new brand of coffee."
print(f"\nPrompt 3: {prompt3}")
response3 = generate_response(prompt3, final_model, tokenizer)
print(f"Generated Response:\n{response3}")

print("\n--- Script execution complete ---")




Installing necessary libraries...
Libraries installed and imported successfully.
Loading tokenizer and teacher model...


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# =============================================================================
# SECTION 1: LOAD SAVED MODEL AND TOKENIZER
# =============================================================================
# This script assumes that your main training script has already run and saved
# the distilled model to the "./distilled_student_model" directory.

# Define the path to your saved model
STUDENT_MODEL_OUTPUT_DIR = "./distilled_student_model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading the distilled model and tokenizer...")

try:
    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL_OUTPUT_DIR)
    model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL_OUTPUT_DIR)
    model.to(DEVICE)
    print("Model and tokenizer loaded successfully.")
except OSError:
    print(f"Error: Could not find a saved model at '{STUDENT_MODEL_OUTPUT_DIR}'.")
    print("Please make sure you have run the main training script successfully before running this cell.")
    exit()

# =============================================================================
# SECTION 2: INFERENCE FUNCTION AND PROMPTS
# =============================================================================

# --- Inference Function (copied from the training script for convenience) ---
def generate_response(prompt, model, tokenizer, max_length=150):
    """
    Generates a response from the model given a prompt.
    """
    # Format the prompt into the template the model was trained on
    formatted_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"

    # Tokenize the input and move it to the GPU/CPU
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate a response from the model
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode the generated tokens back into text
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated response part
    # We split by "Response:\n" and take the second part.
    try:
        return response_text.split("Response:\n")[1].strip()
    except IndexError:
        return response_text # Fallback if the template isn't perfectly followed

# --- List of new, diverse prompts to test the model ---
new_prompts = [
    "Explain the plot of the movie 'Inception' in three sentences.",
    "Write a Python function that takes a list of numbers and returns the sum.",
    "Continue the following story: The old lighthouse stood on the cliff's edge, its light having gone out for the first time in a century. Suddenly, a strange green glow emanated from the rocks below...",
    "What are the main differences between a cat and a dog?",
    "Provide a simple recipe for making pancakes."
]

# =============================================================================
# SECTION 3: RUN INFERENCE
# =============================================================================

print("\n--- Running additional inference tests ---")

# Loop through the new prompts and generate a response for each one
for i, prompt in enumerate(new_prompts):
    print(f"\n--- Prompt {i+1} ---")
    print(f"Instruction: {prompt}")
    response = generate_response(prompt, model, tokenizer)
    print(f"Generated Response:\n{response}")

print("\n--- Additional testing complete ---")


Loading the distilled model and tokenizer...
Model and tokenizer loaded successfully.

--- Running additional inference tests ---

--- Prompt 1 ---
Instruction: Explain the plot of the movie 'Inception' in three sentences.
Generated Response:
1.  The first United States is the first be the following in the following has the following, and the most, and the most of the first people, the first.  The best the United States of the world of the " the the other the following a more of the first the first the most of the following to the first the time of the following of the most the best the same in the in the following the difference of the same of the same of the, in the following, the first.




-The as the most is a in the following was the has the has the world of the

--- Prompt 2 ---
Instruction: Write a Python function that takes a list of numbers and returns the sum.
Generated Response:
The home of the best a list of the most in the following of the first you of the following can b

In [3]:
# =============================================================================
# SECTION 1: SETUP AND DEPENDENCIES
# =============================================================================
# This section installs the required libraries. It's placed at the very top
# to ensure dependencies are available before any import statements are run.

!pip install -q evaluate rouge_score accelerate

import torch
import time
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import numpy as np

# =============================================================================
# SECTION 2: LOAD MODELS AND EVALUATION DATASET
# =============================================================================
# We load both models, the tokenizer, and a small, unseen portion of the
# dataset to serve as our evaluation set.

# --- Configuration ---
STUDENT_MODEL_DIR = "./distilled_student_model"
TEACHER_MODEL_ID = "microsoft/phi-2"
EVAL_DATASET_ID = "databricks/databricks-dolly-15k"
NUM_EVAL_SAMPLES = 50 # Number of samples to use for evaluation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading models and tokenizer for evaluation...")

# --- Load Student Model ---
try:
    tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL_DIR)
    student_model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL_DIR)
    student_model.to(DEVICE)
    print("✅ Distilled student model loaded.")
except OSError:
    print(f"❌ Error: Could not find a saved model at '{STUDENT_MODEL_DIR}'.")
    exit()

# --- Load Teacher Model ---
try:
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
    teacher_model = AutoModelForCausalLM.from_pretrained(
        TEACHER_MODEL_ID, quantization_config=bnb_config, trust_remote_code=True, device_map="auto"
    )
    print("✅ Original teacher model (phi-2) loaded.")
except Exception as e:
    print(f"❌ Error loading teacher model: {e}")
    exit()

# --- Load Evaluation Dataset ---
# We select a different slice of the dataset to ensure it wasn't used in training.
eval_dataset = load_dataset(EVAL_DATASET_ID, split=f'train[{3000}:{3000 + NUM_EVAL_SAMPLES}]')
eval_prompts = [item['instruction'] for item in eval_dataset]

print(f"✅ Loaded {len(eval_prompts)} samples for evaluation.")

# =============================================================================
# SECTION 3: CALCULATE METRICS
# =============================================================================
# Here we define functions to compute each metric and then run the evaluation.

# --- 1. Text Quality (ROUGE) ---
print("\n--- Calculating ROUGE Score ---")
rouge = evaluate.load('rouge')
student_generations = []
teacher_generations = []

for prompt in eval_prompts:
    formatted_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)

    # Student generation
    student_output = student_model.generate(**inputs, max_new_tokens=100)
    student_text = tokenizer.decode(student_output[0], skip_special_tokens=True)
    student_generations.append(student_text.split("Response:\n")[1].strip())

    # Teacher generation (will be used as the reference)
    teacher_output = teacher_model.generate(**inputs, max_new_tokens=100)
    teacher_text = tokenizer.decode(teacher_output[0], skip_special_tokens=True)
    teacher_generations.append(teacher_text.split("Response:\n")[1].strip())

rouge_scores = rouge.compute(predictions=student_generations, references=teacher_generations)
print("✅ ROUGE scores computed.")

# --- 2. Perplexity ---
print("\n--- Calculating Perplexity ---")
perplexity = evaluate.load("perplexity", module_type="metric")

# We use the raw responses from the teacher model as the text to evaluate perplexity on.
# This measures how "surprised" each model is by the high-quality text.
student_ppl = perplexity.compute(model_id=STUDENT_MODEL_DIR, add_start_token=False, predictions=teacher_generations)
# Note: Calculating perplexity for a quantized model can be complex and sometimes unsupported.
# We'll try it, but wrap in a try-except block.
try:
    # A bit of a hack: we can't pass the model object directly, so we pass its ID.
    # This may be slow as it re-downloads, but it's the most reliable way with the `evaluate` library.
    teacher_ppl = perplexity.compute(model_id=TEACHER_MODEL_ID, add_start_token=False, predictions=teacher_generations)
    print("✅ Perplexity computed for both models.")
except Exception as e:
    print(f"⚠️ Could not compute teacher perplexity (often due to quantization): {e}")
    teacher_ppl = {"mean_perplexity": "N/A"}


# --- 3. Inference Speed ---
print("\n--- Calculating Inference Speed ---")
student_times = []
teacher_times = []

# Dummy run to warm up the GPU
_ = student_model.generate(**inputs, max_new_tokens=2)
_ = teacher_model.generate(**inputs, max_new_tokens=2)

for prompt in eval_prompts:
    formatted_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)

    start_time = time.perf_counter()
    student_model.generate(**inputs, max_new_tokens=100)
    torch.cuda.synchronize() # Wait for GPU to finish
    student_times.append(time.perf_counter() - start_time)

    start_time = time.perf_counter()
    teacher_model.generate(**inputs, max_new_tokens=100)
    torch.cuda.synchronize()
    teacher_times.append(time.perf_counter() - start_time)

avg_student_time = np.mean(student_times)
avg_teacher_time = np.mean(teacher_times)
print("✅ Inference speed calculated.")

# =============================================================================
# SECTION 4: DISPLAY RESULTS
# =============================================================================

print("\n\n================= METRICS SUMMARY =================")
print(f"{'Metric':<25} | {'Student Model':<20} | {'Teacher Model (phi-2)':<25}")
print("-" * 75)

# ROUGE
print(f"{'ROUGE-1 Score':<25} | {rouge_scores['rouge1']:.4f}{'':<15} | {'1.0 (Reference)':<25}")
print(f"{'ROUGE-L Score':<25} | {rouge_scores['rougeL']:.4f}{'':<15} | {'1.0 (Reference)':<25}")

# Perplexity
student_ppl_val = student_ppl['mean_perplexity']
teacher_ppl_val = teacher_ppl['mean_perplexity']
print(f"{'Perplexity (lower=better)':<25} | {student_ppl_val:<20.2f} | {teacher_ppl_val:<25.2f}")

# Speed
speedup_factor = avg_teacher_time / avg_student_time
print(f"{'Avg. Inference Time (s)':<25} | {avg_student_time:<20.4f} | {avg_teacher_time:<25.4f}")
print("-" * 75)
print(f"🚀 Speedup Factor: The distilled model is {speedup_factor:.2f}x faster than the teacher model.")
print("===================================================")



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Loading models and tokenizer for evaluation...
✅ Distilled student model loaded.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Original teacher model (phi-2) loaded.
✅ Loaded 50 samples for evaluation.

--- Calculating ROUGE Score ---


Downloading builder script: 0.00B [00:00, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

✅ ROUGE scores computed.

--- Calculating Perplexity ---


Downloading builder script: 0.00B [00:00, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Perplexity computed for both models.

--- Calculating Inference Speed ---


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

KeyboardInterrupt: 

In [4]:
# =============================================================================
# SECTION 1: SETUP AND DEPENDENCIES
# =============================================================================
# This section installs all required evaluation libraries, including bert_score.

!pip install -q evaluate rouge_score accelerate bert_score

import torch
import time
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import numpy as np

# =============================================================================
# SECTION 2: LOAD MODELS AND EVALUATION DATASET
# =============================================================================
# We load both models, the tokenizer, and an evaluation dataset.

# --- Configuration ---
STUDENT_MODEL_DIR = "./distilled_student_model"
TEACHER_MODEL_ID = "microsoft/phi-2"
EVAL_DATASET_ID = "databricks/databricks-dolly-15k"
NUM_EVAL_SAMPLES = 50
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading models and tokenizer for evaluation...")

# --- Load Models (Student and Teacher) ---
try:
    tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL_DIR)
    student_model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL_DIR).to(DEVICE)
    print("✅ Distilled student model loaded.")

    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
    teacher_model = AutoModelForCausalLM.from_pretrained(
        TEACHER_MODEL_ID, quantization_config=bnb_config, trust_remote_code=True, device_map="auto"
    )
    print("✅ Original teacher model (phi-2) loaded.")
except Exception as e:
    print(f"❌ Error loading models: {e}")
    exit()

# --- Load Evaluation Dataset ---
eval_dataset = load_dataset(EVAL_DATASET_ID, split=f'train[{3000}:{3000 + NUM_EVAL_SAMPLES}]')
eval_prompts = [item['instruction'] for item in eval_dataset]
print(f"✅ Loaded {len(eval_prompts)} samples for evaluation.")

# =============================================================================
# SECTION 3: GENERATE RESPONSES FOR EVALUATION
# =============================================================================
# We generate responses from both models once and reuse them for all metrics.

print("\n--- Generating responses for evaluation... ---")
student_generations = []
teacher_generations = [] # These will be the "references"

for prompt in eval_prompts:
    formatted_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(DEVICE)

    student_output = student_model.generate(**inputs, max_new_tokens=100)
    student_text = tokenizer.decode(student_output[0], skip_special_tokens=True).split("Response:\n")[1].strip()
    student_generations.append(student_text)

    teacher_output = teacher_model.generate(**inputs, max_new_tokens=100)
    teacher_text = tokenizer.decode(teacher_output[0], skip_special_tokens=True).split("Response:\n")[1].strip()
    teacher_generations.append(teacher_text)

print("✅ Responses generated.")

# =============================================================================
# SECTION 4: CALCULATE ALL METRICS
# =============================================================================

# --- 1. Lexical Metrics (ROUGE & BLEU) ---
print("\n--- Calculating ROUGE and BLEU Scores ---")
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

rouge_scores = rouge.compute(predictions=student_generations, references=teacher_generations)
bleu_scores = bleu.compute(predictions=student_generations, references=teacher_generations)
print("✅ ROUGE and BLEU scores computed.")

# --- 2. Semantic Metric (BERTScore) ---
print("\n--- Calculating BERTScore (this may take a moment)... ---")
bertscore = evaluate.load("bertscore")
bert_scores = bertscore.compute(predictions=student_generations, references=teacher_generations, lang="en")
# We take the average F1 score as the primary metric
avg_bert_score_f1 = np.mean(bert_scores['f1'])
print("✅ BERTScore computed.")

# --- 3. Intrinsic Metric (Perplexity) ---
print("\n--- Calculating Perplexity ---")
perplexity = evaluate.load("perplexity", module_type="metric")
student_ppl = perplexity.compute(model_id=STUDENT_MODEL_DIR, add_start_token=False, predictions=teacher_generations)
try:
    teacher_ppl = perplexity.compute(model_id=TEACHER_MODEL_ID, add_start_token=False, predictions=teacher_generations)
    print("✅ Perplexity computed for both models.")
except Exception as e:
    print(f"⚠️ Could not compute teacher perplexity (often due to quantization): {e}")
    teacher_ppl = {"mean_perplexity": "N/A"}

# --- 4. Efficiency Metrics (Speed & Size) ---
print("\n--- Calculating Inference Speed and Model Size ---")
student_times = []
teacher_times = []
inputs = tokenizer(eval_prompts[0], return_tensors="pt").to(DEVICE) # Use one prompt for timing
_ = student_model.generate(**inputs, max_new_tokens=2); _ = teacher_model.generate(**inputs, max_new_tokens=2) # Warmup

for prompt in eval_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    start_time = time.perf_counter()
    student_model.generate(**inputs, max_new_tokens=100)
    torch.cuda.synchronize()
    student_times.append(time.perf_counter() - start_time)

    start_time = time.perf_counter()
    teacher_model.generate(**inputs, max_new_tokens=100)
    torch.cuda.synchronize()
    teacher_times.append(time.perf_counter() - start_time)

avg_student_time = np.mean(student_times)
avg_teacher_time = np.mean(teacher_times)

student_params = student_model.num_parameters()
teacher_params = teacher_model.num_parameters() # Note: Shows original count, not 4-bit footprint
print("✅ Efficiency metrics calculated.")

# =============================================================================
# SECTION 5: DISPLAY COMPREHENSIVE RESULTS
# =============================================================================

print("\n\n========================= COMPREHENSIVE METRICS SUMMARY =========================")
print(f"{'Metric':<28} | {'Student Model':<20} | {'Teacher Model (phi-2)':<25}")
print("-" * 85)
print("--- Text Quality (vs. Teacher as Reference) ---")
print(f"{'ROUGE-L Score':<28} | {rouge_scores['rougeL']:.4f}{'':<15} | {'1.0 (Reference)':<25}")
print(f"{'BLEU Score':<28} | {bleu_scores['bleu']:.4f}{'':<15} | {'1.0 (Reference)':<25}")
print(f"{'BERTScore (F1)':<28} | {avg_bert_score_f1:.4f}{'':<15} | {'1.0 (Reference)':<25}")
print("-" * 85)
print("--- Intrinsic Performance (lower is better) ---")
print(f"{'Perplexity':<28} | {student_ppl['mean_perplexity']:<20.2f} | {teacher_ppl['mean_perplexity']:<25.2f}")
print("-" * 85)
print("--- Efficiency ---")
speedup_factor = avg_teacher_time / avg_student_time
size_reduction = 1 - (student_params / teacher_params)
print(f"{'Avg. Inference Time (s)':<28} | {avg_student_time:<20.4f} | {avg_teacher_time:<25.4f}")
print(f"{'Parameter Count':<28} | {student_params/1e6:<16.1f}M | {teacher_params/1e9:.2f}B (in 4-bit)")
print("-" * 85)
print(f"🚀 Speedup Factor: The distilled model is {speedup_factor:.2f}x faster.")
print(f"📦 Size Reduction: The distilled model has {size_reduction:.1%} fewer parameters.")
print("===================================================================================")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hLoading models and tokenizer for evaluation...
✅ Distilled student model loaded.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Original teacher model (phi-2) loaded.
✅ Loaded 50 samples for evaluation.

--- Generating responses for evaluation... ---


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

✅ Responses generated.

--- Calculating ROUGE and BLEU Scores ---


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

✅ ROUGE and BLEU scores computed.

--- Calculating BERTScore (this may take a moment)... ---


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ BERTScore computed.

--- Calculating Perplexity ---


  0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ Perplexity computed for both models.

--- Calculating Inference Speed and Model Size ---


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

✅ Efficiency metrics calculated.


Metric                       | Student Model        | Teacher Model (phi-2)    
-------------------------------------------------------------------------------------
--- Text Quality (vs. Teacher as Reference) ---
ROUGE-L Score                | 0.0642                | 1.0 (Reference)          
BLEU Score                   | 0.0000                | 1.0 (Reference)          
BERTScore (F1)               | 0.7549                | 1.0 (Reference)          
-------------------------------------------------------------------------------------
--- Intrinsic Performance (lower is better) ---
Perplexity                   | 1587.44              | 5.83                     
-------------------------------------------------------------------------------------
--- Efficiency ---
Avg. Inference Time (s)      | 0.5469               | 5.4061                   
Parameter Count              | 81.9            M | 2.78B (in 4-bit)
-----------------------------------------

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# =============================================================================
# SECTION 1: LOAD THE ORIGINAL DISTILLED MODEL
# =============================================================================
# This section loads the first model you trained on the Dolly-15k dataset.

# --- Configuration ---
# IMPORTANT: This points to the directory of your FIRST trained model.
MODEL_DIR = "./distilled_student_model"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading your fine-tuned model (trained on Dolly-15k)...")

# --- Load the Model and Tokenizer ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR).to(DEVICE)
    print(f"✅ Model loaded successfully from '{MODEL_DIR}'.")
except OSError:
    print(f"❌ Error: Could not find a saved model at '{MODEL_DIR}'.")
    print("Please make sure you have run the original training script successfully.")
    exit()

model.eval() # Set the model to evaluation mode

# =============================================================================
# SECTION 2: INTERACTIVE CHAT LOOP
# =============================================================================
# This section contains the main loop for interacting with the model.

def get_model_response(prompt, model, tokenizer, max_new_tokens=150):
    """Generates a response from the model for a given user prompt."""
    # We format the user's question into the template the model was trained on
    full_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    # Generate the response with improved parameters to reduce repetition
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.8,          # Slightly higher for more creativity
            top_p=0.95,
            do_sample=True,
            repetition_penalty=1.2,   # *** Penalize repeating tokens ***
            no_repeat_ngram_size=3,   # *** Prevent repeating 3-word sequences ***
            pad_token_id=tokenizer.eos_token_id # Suppress padding token warning
        )

    # Decode and clean up the response
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        # Extract only the part after "Response:"
        return response_text.split("Response:\n")[1].strip()
    except IndexError:
        return "The model did not generate a response in the expected format."

print("\n--- Interactive Chat with General Q&A Model ---")
print("Ask a question on any topic. For example: 'Explain gravity in simple terms.'")
print("Type 'quit' or 'exit' to end the chat.")
print("--------------------------------------------------")

# Start the interactive loop
while True:
    # Get user input
    user_prompt = input("You: ")

    # Check if the user wants to exit
    if user_prompt.lower() in ['quit', 'exit']:
        print("Exiting chat. Goodbye!")
        break

    # Generate and print the model's response
    print("Model is thinking...")
    model_response = get_model_response(user_prompt, model, tokenizer)
    print(f"Model: {model_response}\n")



Loading your fine-tuned model (trained on Dolly-15k)...
✅ Model loaded successfully from './distilled_student_model'.

--- Interactive Chat with General Q&A Model ---
Ask a question on any topic. For example: 'Explain gravity in simple terms.'
Type 'quit' or 'exit' to end the chat.
--------------------------------------------------
You: camel
Model is thinking...
Model: The American can be a many the could of the best the following, a that a an an many of the world.  There are in the other of their passage you. The the game or some to the first they are one people are in a list of your at a many most " well as the most, and have in the time, and you

-2.  This the best a a very would be a a will the world and you are the best in the following. The time (5.  - you you is your.  The do not a to the great for the your list of the and two of many in the, with a one to not to you can be also the from the that, so to the

You: what is gravity?
Model is thinking...
Model: The following has is

In [8]:
# =============================================================================
# SECTION 1: SETUP AND DEPENDENCIES
# =============================================================================
print("Installing necessary libraries...")
!pip install -q transformers datasets peft accelerate bitsandbytes trl torch

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import torch.nn.functional as F
import torch.nn as nn
import os

print("Libraries installed and imported successfully.")


# =============================================================================
# SECTION 2: CONFIGURATION
# =============================================================================
# All hyperparameters are centralized here.
# KEY CHANGES: More data, more epochs, and a new scheduler.

class DistillationConfig:
    TEACHER_MODEL_ID = "microsoft/phi-2"
    STUDENT_MODEL_OUTPUT_DIR = "./distilled_student_model" # It will overwrite the old one

    # --- IMPROVEMENT 1: Use more data ---
    DATASET_ID = "databricks/databricks-dolly-15k"
    DATASET_SUBSET_SIZE = 10000  # Increased from 3000
    MAX_TOKEN_LENGTH = 512

    # Distillation parameters
    ALPHA = 0.5
    TEMPERATURE = 2.0

    # --- IMPROVEMENT 2: Train for longer ---
    NUM_TRAIN_EPOCHS = 3 # Increased from 1
    BATCH_SIZE = 4
    LEARNING_RATE = 5e-5
    OUTPUT_DIR = "./training_output_improved"

config = DistillationConfig()


# =============================================================================
# SECTION 3: LOAD MODELS AND TOKENIZER
# =============================================================================
print("Loading tokenizer and teacher model...")
tokenizer = AutoTokenizer.from_pretrained(config.TEACHER_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
)
teacher_model = AutoModelForCausalLM.from_pretrained(
    config.TEACHER_MODEL_ID,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)
teacher_model.eval()
print("Teacher model loaded successfully in 4-bit.")

student_config = GPT2Config(
    vocab_size=len(tokenizer),
    n_layer=6, n_head=12, n_embd=768,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
student_model = GPT2LMHeadModel(student_config)
student_model = student_model.to(teacher_model.device)
print(f"Student model created with {student_model.num_parameters():,} parameters.")


# =============================================================================
# SECTION 4: PREPARE DATASET
# =============================================================================
print("Preparing the dataset...")
dataset = load_dataset(config.DATASET_ID, split='train')
dataset = dataset.select(range(config.DATASET_SUBSET_SIZE))

def tokenize_function(examples):
    formatted_texts = []
    for i in range(len(examples["instruction"])):
        text = f"Instruction:\n{examples['instruction'][i]}\n\nResponse:\n{examples['response'][i]}"
        formatted_texts.append(text)
    return tokenizer(
        formatted_texts, padding="max_length",
        truncation=True, max_length=config.MAX_TOKEN_LENGTH,
    )
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)
print("Dataset prepared and tokenized.")


# =============================================================================
# SECTION 5: CUSTOM DISTILLATION TRAINER (WITH SCHEDULER)
# =============================================================================
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.loss_fct = nn.KLDivLoss(reduction="batchmean")

    # --- IMPROVEMENT 3: Add a learning rate scheduler ---
    def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
        """
        Setup the scheduler. The optimizer of the trainer must have been set up before this method is called.
        """
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )
        return self.lr_scheduler

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        student_logits = outputs_student.logits

        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            teacher_logits = outputs_teacher.logits

        # Robustly handle vocab size mismatch by slicing to the smallest common size
        student_vocab_size = student_logits.size(-1)
        teacher_vocab_size = teacher_logits.size(-1)
        if student_vocab_size != teacher_vocab_size:
            min_vocab_size = min(student_vocab_size, teacher_vocab_size)
            student_logits = student_logits[:, :, :min_vocab_size]
            teacher_logits = teacher_logits[:, :, :min_vocab_size]

        # Mask out padding tokens
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            active_logits = attention_mask.view(-1) == 1
            student_logits = student_logits.view(-1, student_logits.size(-1))[active_logits]
            teacher_logits = teacher_logits.view(-1, teacher_logits.size(-1))[active_logits]

        soft_student_logits = F.log_softmax(student_logits / config.TEMPERATURE, dim=-1)
        soft_teacher_logits = F.softmax(teacher_logits / config.TEMPERATURE, dim=-1)

        loss_kl = self.loss_fct(soft_student_logits, soft_teacher_logits) * (config.TEMPERATURE ** 2)
        loss = (1 - config.ALPHA) * student_loss + config.ALPHA * loss_kl
        return (loss, outputs_student) if return_outputs else loss


# =============================================================================
# SECTION 6: TRAINING
# =============================================================================
print("Starting improved training run...")
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    learning_rate=config.LEARNING_RATE,
    logging_steps=100,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
)

distiller = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

distiller.train()
print("Training finished.")


# =============================================================================
# SECTION 7: SAVE FINAL MODEL
# =============================================================================
print(f"Saving the final, improved model to {config.STUDENT_MODEL_OUTPUT_DIR}...")
unwrapped_model = distiller.model
unwrapped_model.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
print("✅ Model saved successfully. You can now use the interactive chat script.")


Installing necessary libraries...
Libraries installed and imported successfully.
Loading tokenizer and teacher model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Teacher model loaded successfully in 4-bit.
Student model created with 81,941,760 parameters.
Preparing the dataset...


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset prepared and tokenized.
Starting improved training run...


  super().__init__(*args, **kwargs)


Step,Training Loss
100,9.3931
200,7.918
300,7.5102
400,7.2798
500,7.1297
600,7.1497
700,6.9079
800,6.8294
900,6.7681
1000,6.6884


Training finished.
Saving the final, improved model to ./distilled_student_model...
✅ Model saved successfully. You can now use the interactive chat script.


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# =============================================================================
# SECTION 1: LOAD THE IMPROVED DISTILLED MODEL
# =============================================================================
# This section loads the model you just trained with the improved script.

# --- Configuration ---
MODEL_DIR = "./distilled_student_model" # This should be the output of your improved training
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading your improved fine-tuned model (trained on Dolly-15k)...")

# --- Load the Model and Tokenizer ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR).to(DEVICE)
    print(f"✅ Model loaded successfully from '{MODEL_DIR}'.")
except OSError:
    print(f"❌ Error: Could not find a saved model at '{MODEL_DIR}'.")
    print("Please make sure the improved training script ran successfully and saved the model.")
    exit()

model.eval() # Set the model to evaluation mode

# =============================================================================
# SECTION 2: INTERACTIVE CHAT LOOP
# =============================================================================
# This section contains the main loop for interacting with the model.

def get_model_response(prompt, model, tokenizer, max_new_tokens=150):
    """Generates a response from the model for a given user prompt."""
    # We format the user's question into the template the model was trained on
    full_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    # Generate the response with parameters to reduce repetition and encourage creativity
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.95,
            do_sample=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and clean up the response
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        # Extract only the part after "Response:"
        return response_text.split("Response:\n")[1].strip()
    except IndexError:
        return "The model did not generate a response in the expected format."

print("\n--- Interactive Chat with Improved General Q&A Model ---")
print("Ask a question on any topic. For example: 'What are the main differences between Python and Java?'")
print("Type 'quit' or 'exit' to end the chat.")
print("--------------------------------------------------")

# Start the interactive loop
while True:
    # Get user input
    user_prompt = input("You: ")

    # Check if the user wants to exit
    if user_prompt.lower() in ['quit', 'exit']:
        print("Exiting chat. Goodbye!")
        break

    # Generate and print the model's response
    print("Model is thinking...")
    model_response = get_model_response(user_prompt, model, tokenizer)
    print(f"Model: {model_response}\n")



Loading your improved fine-tuned model (trained on Dolly-15k)...
✅ Model loaded successfully from './distilled_student_model'.

--- Interactive Chat with Improved General Q&A Model ---
Ask a question on any topic. For example: 'What are the main differences between Python and Java?'
Type 'quit' or 'exit' to end the chat.
--------------------------------------------------
You: camel
Model is thinking...
Model: Radotinan
Chaga is a species that refers to as a species of fish. It is an American, which was located in the city and the ball, and is known for both from the largest world.  The famous state is used by an different types of the most successful in the most notable language that are also played on the second country or with more than some other languages. This is the most popular teams that is commonly known as the United States. The most common in this type of the same countries, such as well as a team is located in American.

 has also known for its other sports, which also know

In [13]:
# =============================================================================
# SECTION 1: SETUP AND DEPENDENCIES
# =============================================================================
print("Installing necessary libraries...")
!pip install -q transformers datasets peft accelerate bitsandbytes trl torch

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
import torch.nn.functional as F
import torch.nn as nn
import os

print("Libraries installed and imported successfully.")


# =============================================================================
# SECTION 2: CONFIGURATION (FINAL & MORE ROBUST)
# =============================================================================
# KEY CHANGES: Full dataset, more capable student, and gradient accumulation.

class DistillationConfig:
    TEACHER_MODEL_ID = "microsoft/phi-2"
    STUDENT_MODEL_OUTPUT_DIR = "./distilled_student_model" # Overwrites the old model

    # --- IMPROVEMENT 1: Use the FULL dataset ---
    DATASET_ID = "databricks/databricks-dolly-15k"
    MAX_TOKEN_LENGTH = 512

    # Distillation parameters
    ALPHA = 0.5
    TEMPERATURE = 2.0

    # Training parameters
    NUM_TRAIN_EPOCHS = 3
    BATCH_SIZE = 2 # Reduced to allow for larger model and gradient accumulation
    LEARNING_RATE = 5e-5
    OUTPUT_DIR = "./training_output_final"

    # --- IMPROVEMENT 2: GRADIENT ACCUMULATION ---
    # This simulates a larger effective batch size (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)
    # leading to more stable training and a better model.
    GRADIENT_ACCUMULATION_STEPS = 8

config = DistillationConfig()


# =============================================================================
# SECTION 3: LOAD MODELS AND TOKENIZER
# =============================================================================
print("Loading tokenizer and teacher model...")
tokenizer = AutoTokenizer.from_pretrained(config.TEACHER_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
)
teacher_model = AutoModelForCausalLM.from_pretrained(
    config.TEACHER_MODEL_ID,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)
teacher_model.eval()
print("Teacher model loaded successfully in 4-bit.")

# --- IMPROVEMENT 3: A more capable student model ---
student_config = GPT2Config(
    vocab_size=len(tokenizer),
    n_layer=8, # Increased from 6
    n_head=16, # Increased from 12
    n_embd=1024, # Increased from 768
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
student_model = GPT2LMHeadModel(student_config)
student_model = student_model.to(teacher_model.device)
print(f"Student model created with {student_model.num_parameters():,} parameters.")


# =============================================================================
# SECTION 4: PREPARE DATASET
# =============================================================================
print("Preparing the full dataset...")
dataset = load_dataset(config.DATASET_ID, split='train') # Load the full dataset

def tokenize_function(examples):
    formatted_texts = []
    for i in range(len(examples["instruction"])):
        text = f"Instruction:\n{examples['instruction'][i]}\n\nResponse:\n{examples['response'][i]}"
        formatted_texts.append(text)
    return tokenizer(
        formatted_texts, padding="max_length",
        truncation=True, max_length=config.MAX_TOKEN_LENGTH,
    )
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset.column_names
)
print("Dataset prepared and tokenized.")


# =============================================================================
# SECTION 5: CUSTOM DISTILLATION TRAINER
# =============================================================================
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.loss_fct = nn.KLDivLoss(reduction="batchmean")

    def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )
        return self.lr_scheduler

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        student_logits = outputs_student.logits

        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            teacher_logits = outputs_teacher.logits

        # Robustly handle vocab size mismatch
        student_vocab_size = student_logits.size(-1)
        teacher_vocab_size = teacher_logits.size(-1)
        if student_vocab_size != teacher_vocab_size:
            min_vocab_size = min(student_vocab_size, teacher_vocab_size)
            student_logits = student_logits[:, :, :min_vocab_size]
            teacher_logits = teacher_logits[:, :, :min_vocab_size]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            active_logits = attention_mask.view(-1) == 1
            student_logits = student_logits.view(-1, student_logits.size(-1))[active_logits]
            teacher_logits = teacher_logits.view(-1, teacher_logits.size(-1))[active_logits]

        soft_student_logits = F.log_softmax(student_logits / config.TEMPERATURE, dim=-1)
        soft_teacher_logits = F.softmax(teacher_logits / config.TEMPERATURE, dim=-1)

        loss_kl = self.loss_fct(soft_student_logits, soft_teacher_logits) * (config.TEMPERATURE ** 2)
        loss = (1 - config.ALPHA) * student_loss + config.ALPHA * loss_kl
        return (loss, outputs_student) if return_outputs else loss


# =============================================================================
# SECTION 6: TRAINING
# =============================================================================
print("Starting final, robust training run...")
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    learning_rate=config.LEARNING_RATE,
    logging_steps=100,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
)

distiller = DistillationTrainer(
    model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

distiller.train()
print("Training finished.")


# =============================================================================
# SECTION 7: SAVE FINAL MODEL
# =============================================================================
print(f"Saving the final, improved model to {config.STUDENT_MODEL_OUTPUT_DIR}...")
unwrapped_model = distiller.model
unwrapped_model.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(config.STUDENT_MODEL_OUTPUT_DIR)
print("✅ Model saved successfully. You can now use the interactive RAG chat script.")


Installing necessary libraries...
Libraries installed and imported successfully.
Loading tokenizer and teacher model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Teacher model loaded successfully in 4-bit.
Student model created with 153,322,496 parameters.
Preparing the full dataset...


Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Dataset prepared and tokenized.
Starting final, robust training run...


  super().__init__(*args, **kwargs)


Step,Training Loss
100,69.1502
200,57.8671
300,55.659
400,53.9331
500,51.7216
600,51.0702
700,50.164
800,49.1813
900,48.4477
1000,47.737


Training finished.
Saving the final, improved model to ./distilled_student_model...
✅ Model saved successfully. You can now use the interactive RAG chat script.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os # Import the os module to handle file paths

# =============================================================================
# SECTION 1: LOAD THE FINAL DISTILLED MODEL
# =============================================================================
# This section loads the model you trained with the final, robust script.

# --- Configuration ---
# Use os.path.abspath to get the full, unambiguous path to the model directory.
MODEL_DIR = os.path.abspath("./distilled_student_model")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading your final, fine-tuned model...")

# --- Load the Model and Tokenizer ---
try:
    # FINAL FIX: Add `local_files_only=True` to force the library to treat the path
    # as a local directory and not an online repository ID.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, local_files_only=True).to(DEVICE)
    print(f"✅ Model loaded successfully from '{MODEL_DIR}'.")
except OSError:
    print(f"❌ Error: Could not find a saved model at '{MODEL_DIR}'.")
    print("Please make sure the final training script ran successfully and saved the model.")
    exit()

model.eval() # Set the model to evaluation mode

# =============================================================================
# SECTION 2: INTERACTIVE CHAT LOOP
# =============================================================================
# This section contains the main loop for interacting with the model.

def get_model_response(prompt, model, tokenizer, max_new_tokens=150):
    """Generates a response from the model for a given user prompt."""
    # We format the user's question into the template the model was trained on
    full_prompt = f"Instruction:\n{prompt}\n\nResponse:\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    # Generate the response with parameters to reduce repetition and encourage creativity
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and clean up the response
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        # Extract only the part after "Response:"
        return response_text.split("Response:\n")[1].strip()
    except IndexError:
        return "The model did not generate a response in the expected format."

print("\n--- Interactive Chat with Final General Q&A Model ---")
print("Ask a question on any topic. For example: 'What is the capital of France?'")
print("Type 'quit' or 'exit' to end the chat.")
print("--------------------------------------------------")

# Start the interactive loop
while True:
    # Get user input
    user_prompt = input("You: ")

    # Check if the user wants to exit
    if user_prompt.lower() in ['quit', 'exit']:
        print("Exiting chat. Goodbye!")
        break

    # Generate and print the model's response
    print("Model is thinking...")
    model_response = get_model_response(user_prompt, model, tokenizer)
    print(f"Model: {model_response}\n")



Loading your final, fine-tuned model...


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/distilled_student_model'. Use `repo_type` argument if needed.