## Helpful Code for Training Models in Huggingface

> Import Library

In [None]:
!pip install evaluate, rouge_score

In [None]:
import json
import os
import re
import sys

import accelerate   # for distributed training
import arxiv
import evaluate     # custom evaluation script
import numpy as np
import pandas as pd
import torch
import torchmetrics
from datasets import Dataset
from diffusers import StableDiffusionPipeline
from tqdm import tqdm

# Huggingface Transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Trainer, 
    TrainingArguments,
    DataCollatorForSeq2Seq
)

> Data Collection

In [None]:
# Define the search query
search_text = "Deep Learning for Ageing Research"

# Search arXiv for up to 50 papers, ordered by relevance
search = arxiv.Search(query=search_text, max_results=50, sort_by=arxiv.SortCriterion.Relevance)

# Collect the fields of interest from every hit.
# `published` is stored as an ISO-8601 string because json.dump cannot
# serialize datetime objects.
result_list = [
    {
        "title": result.title,
        "published": result.published.isoformat(),
        "abstract": result.summary,
        "url": result.pdf_url,
        "categories": result.categories,
    }
    for result in search.results()
]

# Save the results to a JSON file (optional)
with open('arxiv_papers.json', 'w') as f:
    json.dump(result_list, f, indent=4)


> Prepare the Dataset

In [None]:
# Build one record per paper with 'document' and 'summary' keys.
# For demonstration the abstract is used as both the document and the summary;
# in practice you'd want a more meaningful summary.
train_data = [
    {
        "document": paper["abstract"],
        "summary": paper["abstract"],  # Replace with actual summaries if available
    }
    for paper in result_list
]

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# Split the dataset into training and evaluation sets.
# A fixed seed keeps the split identical across kernel restarts.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']


> Tokenization

In [None]:

model_name = "facebook/bart-large-cnn"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the maximum sequence length
max_input_length = 512
max_target_length = 128

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of documents and their target summaries.

    Args:
        examples: Batch mapping with "document" and "summary" columns.

    Returns:
        The encoded documents with an added "labels" entry holding the
        token ids of the encoded summaries.
    """
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        truncation=True
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager; a separate call lets the targets use their own max length.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)


> Data Collator

In [None]:
# Dynamic padding: the collator pads every batch to the length of its longest
# sample at collation time instead of padding the whole dataset up front.
# NOTE(review): this loads its own copy of the checkpoint, separate from the
# `model` that is later handed to the Trainer.
collator_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=collator_model,
    padding=True,
)


> Define Training Arguments

In [None]:
# Pick the GPU when one is visible, otherwise run on the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using device: {device}")

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # --- what to run ---
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    # --- batching / precision ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=use_cuda,  # mixed precision only makes sense on a GPU
    # --- evaluation and logging cadence ---
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=10,
    logging_dir="./logs",
    # --- checkpointing ---
    output_dir="./models",
    save_steps=500,
    save_total_limit=2,
    # --- best-model selection (higher ROUGE-2 wins) ---
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)


> Initialize the Trainer

In [None]:
# Fetch the pre-trained seq2seq checkpoint, then move it onto the chosen device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# ROUGE implementation from the `evaluate` library
rouge = evaluate.load("rouge")

# Alternative metric backend, left disabled:
# rouge = torchmetrics.text.ROUGEScore()  # Initialize TorchMetrics ROUGE


# Define a compute_metrics function
def compute_metrics(eval_pred):
    """Decode predictions and labels, then score them with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair supplied by the Trainer.
            ``predictions`` may be token ids, raw logits of shape
            (batch, seq, vocab), or a tuple whose first element is the logits
            (what a plain ``Trainer`` passes for an encoder-decoder model).

    Returns:
        dict mapping each ROUGE variant to its F-measure as a percentage.
    """
    predictions, labels = eval_pred

    # A plain Trainer returns model outputs rather than generated ids:
    # take the logits and reduce them to the most likely token per position.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks padded/ignored positions and cannot be decoded
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE-Lsum expects one sentence per line, so split on sentence-ending
    # punctuation (splitting into sub-word tokens would corrupt the n-grams).
    sentence_boundary = re.compile(r"(?<=[.!?])\s+")
    decoded_preds = ["\n".join(sentence_boundary.split(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sentence_boundary.split(label.strip())) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Depending on the metric version, values are either plain floats or
    # aggregate objects exposing `.mid.fmeasure`; accept both.
    scores = {}
    for key, value in result.items():
        fmeasure = value.mid.fmeasure if hasattr(value, "mid") else value
        scores[key] = fmeasure * 100
    return scores

# Wire model, data, collator and metric function into a Trainer
trainer = Trainer(
    # what is trained and how
    model=model,
    args=training_args,
    # data pipeline
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # evaluation
    compute_metrics=compute_metrics,
)


> Train and Evaluate the Model

In [None]:
# Run the fine-tuning loop
train_output = trainer.train()

# Score the model on the evaluation split; the bare expression on the last
# line is what the cell displays
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
model.save_pretrained("model-saved")  # Save the model
tokenizer.save_pretrained("tokenizer-saved")  # Save the tokenizer


# Load the saved model and tokenizer.
# Only the model is moved to the device: a tokenizer has no `.to()` method.
model = AutoModelForSeq2SeqLM.from_pretrained("model-saved").to(device)
tokenizer = AutoTokenizer.from_pretrained("tokenizer-saved")

> Visualizing AI Generated Images, Audio, and Video

In [None]:
from IPython.display import display, Markdown, Audio, Image, Video
import matplotlib.pyplot as plt
import librosa
import librosa.display
import cv2

# Display Generated Text (Assuming you have the text)
generated_text = "Once upon a time, in a land far away..."
display(Markdown(f"### **Generated Text:**\n{generated_text}"))

# Each media section below is skipped with a message when its file is
# missing, so one absent asset does not abort the rest of the cell.

# Display Generated Image
generated_image_path = "generated_image.png"
if os.path.exists(generated_image_path):
    display(Image(filename=generated_image_path, width=400, height=300))
else:
    print(f"Image file not found: {generated_image_path}")

# Display Generated Audio and its waveform
generated_audio_path = "generated_audio.mp3"
if os.path.exists(generated_audio_path):
    display(Audio(filename=generated_audio_path, autoplay=False))

    # Display Audio Waveform
    y, sr = librosa.load(generated_audio_path)
    plt.figure(figsize=(14, 5))
    librosa.display.waveshow(y, sr=sr)
    plt.title("AI-Generated Audio Waveform")
    plt.xlabel("Time (seconds)")
    plt.ylabel("Amplitude")
    plt.show()
else:
    print(f"Audio file not found: {generated_audio_path}")

# Display Generated Video and its first frame
generated_video_path = "generated_video.mp4"
if os.path.exists(generated_video_path):
    display(Video(filename=generated_video_path, embed=True, width=640, height=480))

    # Display First Frame of Video
    cap = cv2.VideoCapture(generated_video_path)
    try:
        ret, frame = cap.read()
    finally:
        # Always hand the capture handle back, even if reading raises
        cap.release()

    if ret:
        # OpenCV decodes frames as BGR; matplotlib expects RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        plt.imshow(frame)
        plt.axis('off')
        plt.title("First Frame of the AI-Generated Video")
        plt.show()
    else:
        print("Failed to read the video.")
else:
    print(f"Video file not found: {generated_video_path}")


TypeError: a bytes-like object is required, not 'NoneType'

<IPython.core.display.Video object>

In [33]:
from IPython.display import Audio, display

# Stream a remote audio clip; the player starts playing as soon as it renders
audio_url = "https://www.soundjay.com/ambient/sounds/boarding-accouncement-1.mp3"
audio_player = Audio(url=audio_url, autoplay=True)
display(audio_player)




## Finetuning LLM

> Parameter-Efficient Fine-Tuning with LoRA and Other Methods

In [None]:
# Parameter-Efficient Fine-Tuning Methods for LLMs
# ================================================

# LoRA and Other PEFT Methods - The first example demonstrates several Parameter-Efficient Fine-Tuning techniques including:
    # LoRA (Low-Rank Adaptation) which adds small trainable rank decomposition matrices
        # LoRA is a parameter-efficient fine-tuning method that freezes the
        # pre-trained weights and adds a small number of new trainable
        # parameters, in the form of low-rank matrices attached to existing
        # layers. Only these low-rank matrices are updated, so the number of
        # trained parameters (and the gradient/optimizer memory that goes
        # with them) is a small fraction of the full model. After training,
        # the low-rank updates can be merged back into the original weights,
        # so the deployed model keeps its original size.
    # Prefix Tuning which prepends trainable vectors to intermediate activations
    # Prompt Tuning which adds trainable tokens to the input

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PrefixTuningConfig,
    PromptTuningConfig,
    PromptTuningInit,
    PeftModel
)

# Choose the compute device: GPU when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- Run configuration ---
BASE_MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # Example foundation model
DATASET_NAME = "your_dataset"  # Replace with your dataset
OUTPUT_DIR = "./peft_output"
PEFT_METHOD = "lora"  # Options: "lora", "prefix_tuning", "prompt_tuning"

# Make sure the output location exists before anything is written to it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Reuse the end-of-sequence token for padding so that a pad token is set
tokenizer.pad_token = tokenizer.eos_token
# Cap on sequence length used for truncation; adjust to your GPU memory
tokenizer.model_max_length = 1024

# --- Base model ---
print("Loading base model...")
base_model_kwargs = {
    "torch_dtype": torch.float16,  # half precision to save memory
    "device_map": "auto",          # spread the model over the available GPUs
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, **base_model_kwargs)

# One lazily-evaluated builder per supported PEFT method; only the selected
# builder is called, so only one config object is ever constructed.
_peft_config_builders = {
    # 1. LoRA: trainable low-rank decomposition matrices on existing weights
    "lora": lambda: LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,                     # Rank of update matrices
        lora_alpha=32,           # Parameter scaling factor
        lora_dropout=0.1,        # Dropout probability for LoRA layers
        target_modules=["q_proj", "v_proj"],  # Which modules to apply LoRA to
    ),
    # 2. Prefix tuning: trainable continuous prefix vectors on activations
    "prefix_tuning": lambda: PrefixTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        num_virtual_tokens=20,   # Number of virtual tokens to add
        prefix_projection=True,  # Whether to use a two-layer MLP for reparameterization
    ),
    # 3. Prompt tuning: trainable soft prompt embeddings added to the input
    "prompt_tuning": lambda: PromptTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        num_virtual_tokens=20,   # Number of virtual prompt tokens to add
        prompt_tuning_init=PromptTuningInit.TEXT,  # Initialize from text
        prompt_tuning_init_text="Solve the following task: ",  # Text to initialize from
    ),
}

if PEFT_METHOD not in _peft_config_builders:
    raise ValueError(f"PEFT method {PEFT_METHOD} not supported")
peft_config = _peft_config_builders[PEFT_METHOD]()

# Wrap the base model with the chosen PEFT method and report how many
# parameters remain trainable
print(f"Applying {PEFT_METHOD} configuration...")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load and prepare dataset
# Replace this with your own dataset loading logic
def load_custom_dataset():
    """Load ``DATASET_NAME`` with ``load_dataset``, or fall back to a toy dataset.

    Returns:
        The object returned by ``load_dataset`` on success; otherwise a plain
        dict with "train" and "validation" lists of ``{"text": ...}`` records.
    """
    # Example: loading a dataset from Hugging Face
    try:
        return load_dataset(DATASET_NAME)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts,
        # and report the failure instead of hiding it.
        print(f"Could not load dataset {DATASET_NAME!r}: {exc}")
        # Placeholder for custom dataset loading
        print("Replace this with your custom dataset loading code")
        # Example structure for a toy dataset
        return {
            "train": [{"text": "Example instruction. Example response."}],
            "validation": [{"text": "Example validation instruction. Example validation response."}]
        }

# Format data for instruction fine-tuning
def format_instruction(example):
    """Render one record as an instruction/response prompt.

    Args:
        example: Mapping with "instruction" and "response" entries.
            Adapt the keys to your specific dataset format.

    Returns:
        dict with a single "text" entry holding the formatted prompt.
    """
    instruction = example['instruction']
    response = example['response']
    return {"text": f"### Instruction: {instruction}\n### Response: {response}"}

# Load and process dataset
print("Loading and processing dataset...")
dataset = load_custom_dataset()

# Apply formatting and tokenization
def tokenize_function(examples):
    """Tokenize the "text" column, padding/truncating to the tokenizer's max length."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length
    )

# Process dataset if it's in the expected Hugging Face format
if hasattr(dataset, "map"):
    # Format to instruction style if needed
    if "instruction" in dataset["train"].column_names:
        dataset = dataset.map(format_instruction)

    # Tokenize, dropping the original columns
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names
    )
else:
    print("Using placeholder dataset - replace with your data")

    def _encode_placeholder(text):
        """Encode one placeholder sentence, keeping the tokenizer's own attention mask."""
        # Taking the mask from the tokenizer guarantees it has the same
        # length as input_ids (a hard-coded length would not).
        encoded = tokenizer(text)
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        }

    # Create a minimal example for demonstration
    tokenized_dataset = {
        "train": [_encode_placeholder("Example text for training.")],
        "validation": [_encode_placeholder("Example text for validation.")]
    }

# Define training arguments
# NOTE(review): the base model above is loaded in float16 and fp16 mixed
# precision is enabled here as well - confirm this combination trains
# correctly with your PEFT version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_steps=0,
    logging_steps=100,
    num_train_epochs=3,
    gradient_accumulation_steps=4,  # Increase to simulate larger batch sizes
    fp16=True,  # Mixed precision training
    report_to="tensorboard",
    remove_unused_columns=True,
)

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Not using masked language modeling
)

# Initialize Trainer
# `tokenized_dataset` exposes "train" and "validation" in both the Hugging
# Face and the placeholder case, so no branching is needed here.
print("Setting up trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting training...")
trainer.train()

# Save the fine-tuned model
print("Saving model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))

# --- Using the fine-tuned model for inference ---
def load_finetuned_model(model_path):
    """Rebuild the fine-tuned model from the base checkpoint plus a saved adapter.

    Args:
        model_path: Directory containing the saved PEFT adapter.

    Returns:
        ``(model, tokenizer)`` - the adapter-wrapped model and the
        module-level tokenizer.
    """
    # Fresh copy of the base weights, in half precision, placed automatically
    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    # Attach the trained adapter weights on top of the base model
    return PeftModel.from_pretrained(backbone, model_path), tokenizer

# Example inference
def generate_text(model, tokenizer, prompt, max_length=100):
    """Sample a continuation of ``prompt`` from ``model``.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Input text.
        max_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # Put the inputs on the device of the model that was passed in, rather
    # than on a module-level device that may not match it (for example when
    # the model was loaded with device_map="auto").
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Reload the adapter saved after training and sample one completion
finetuned_model_dir = os.path.join(OUTPUT_DIR, "final_model")
model, tokenizer = load_finetuned_model(finetuned_model_dir)

prompt = "Write a summary of recent advancements in artificial intelligence."
generated_text = generate_text(model, tokenizer, prompt)
print(generated_text)

> QLoRA Fine-Tuning Implementation

In [None]:
# QLoRA Fine-Tuning Implementation
# ===============================
# QLoRA combines quantization with LoRA for even more efficient fine-tuning

# QLoRA Implementation - The second example shows QLoRA, which combines:
    # 4-bit quantization to dramatically reduce memory requirements
    # LoRA adapters for parameter-efficient training
    # This approach allows fine-tuning of much larger models on consumer hardware

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)

# --- Run configuration ---
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
DATASET_NAME = "your-dataset"  # Replace with your dataset
OUTPUT_DIR = "./qlora_output"

# Make sure the output location exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 4-bit quantization settings ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # normalized-float 4-bit quantization
    bnb_4bit_use_double_quant=True,         # double quantization for 4-bit
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16
)

# --- Quantized base model ---
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # spread the model over the available GPUs
    quantization_config=bnb_config,
)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the pad token

# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# --- LoRA settings ---
# The target list covers the attention projections (q/k/v/o) as well as the
# MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",   # Task type
    r=8,                     # Rank of update matrices
    lora_alpha=32,           # Parameter scaling factor
    lora_dropout=0.1,        # Dropout probability
    bias="none",             # Don't train bias parameters
    target_modules=lora_target_modules,
)

# Attach the LoRA adapters and report the trainable-parameter count
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Load and prepare dataset
# This is a placeholder - replace with your actual dataset loading code
def load_and_prepare_dataset():
    """Load and tokenize ``DATASET_NAME``; on any error, return random demo data.

    Returns:
        The tokenized dataset produced by ``dataset.map`` on success, or a
        dict with "train" (100 samples) and "validation" (first 10 of them)
        datasets of random token ids when loading or tokenizing fails.
    """
    kept_columns = ["input_ids", "attention_mask"]

    def _encode_batch(examples):
        # Build instruction/response prompts when both columns are present,
        # otherwise fall back to the raw "text" column
        if "instruction" in examples and "response" in examples:
            texts = [
                f"### Instruction: {instruction}\n### Response: {response}"
                for instruction, response in zip(examples["instruction"], examples["response"])
            ]
        else:
            texts = examples["text"]

        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=512,  # Adjust based on your needs
        )

    def _random_demo_splits():
        # Imports stay local: they are only needed on the fallback path
        from datasets import Dataset
        import numpy as np

        sample_size = 100
        sample_length = 128

        # Each sample is BOS + random ids + EOS with an all-ones attention mask
        inputs = {"input_ids": [], "attention_mask": []}
        for _ in range(sample_size):
            random_ids = np.random.randint(100, 10000, sample_length - 2).tolist()
            inputs["input_ids"].append(
                [tokenizer.bos_token_id] + random_ids + [tokenizer.eos_token_id]
            )
            inputs["attention_mask"].append([1] * sample_length)

        # Validation reuses the first ten training samples
        head = {column: values[:10] for column, values in inputs.items()}
        return {
            "train": Dataset.from_dict(inputs),
            "validation": Dataset.from_dict(head),
        }

    try:
        dataset = load_dataset(DATASET_NAME)
        # Drop every original column that is not a model input
        droppable = [
            column for column in dataset["train"].column_names
            if column not in kept_columns
        ]
        return dataset.map(_encode_batch, batched=True, remove_columns=droppable)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Using a minimal example dataset for demonstration")
        return _random_demo_splits()

# Build the train/validation splits
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Collator for causal language modeling (no masked-LM objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # --- optimization ---
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    # --- batching / precision ---
    per_device_train_batch_size=2,  # Can be smaller due to 4-bit quantization
    gradient_accumulation_steps=8,  # Accumulate gradients to simulate larger batch
    fp16=True,  # Mixed precision
    # --- evaluation and checkpoint cadence ---
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    # --- logging / housekeeping ---
    logging_steps=10,
    report_to="tensorboard",
    remove_unused_columns=True,
    push_to_hub=False,  # Set to True if you want to push to HF Hub
)

# Assemble the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

# Run fine-tuning
print("Starting QLoRA fine-tuning...")
trainer.train()

# Persist the fine-tuned adapters
adapter_dir = os.path.join(OUTPUT_DIR, "final_model")
model.save_pretrained(adapter_dir)

# Optionally save the merged model (base + adapters)
# Note: This requires more memory as it creates a full copy of the model
def save_merged_model():
    """Merge the saved adapters into a fresh base model and write the result to disk.

    Reads the adapters from ``OUTPUT_DIR/final_model`` and writes the merged
    model and the tokenizer to ``OUTPUT_DIR/merged_model``.
    """
    # Half-precision copy of the base model to merge into
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # Attach the trained adapters, then fold them into the base weights
    adapted = PeftModel.from_pretrained(base_model, os.path.join(OUTPUT_DIR, "final_model"))
    merged_model = adapted.merge_and_unload()

    # Model and tokenizer are written side by side
    for artifact in (merged_model, tokenizer):
        artifact.save_pretrained(os.path.join(OUTPUT_DIR, "merged_model"))

    print(f"Merged model saved to {os.path.join(OUTPUT_DIR, 'merged_model')}")

# Uncomment to save the merged model
# print("Merging and saving the full model (requires more memory)...")
# save_merged_model()

# Inference example
def inference_example():
    """Reload the quantized base model with the saved adapters and print one sampled completion."""
    # Load fine-tuned model with adapters
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    model = PeftModel.from_pretrained(base_model, os.path.join(OUTPUT_DIR, "final_model"))

    # Generate text
    prompt = "### Instruction: Explain quantum computing in simple terms.\n### Response:"
    # Send the inputs to the model's own device instead of a hard-coded
    # "cuda", so they follow wherever device_map="auto" placed the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")

# Uncomment to run an inference example
# print("Running inference with the fine-tuned model...")
# inference_example()

> Adapter-Based Fine-Tuning with Transformers

In [None]:
# Adapter-Based Fine-Tuning with Transformers
# ===========================================
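# This script demonstrates adapter-based fine-tuning. Note: the `transformers.adapters` module imported below is provided by the legacy `adapter-transformers` package (a fork of transformers), not by the upstream `transformers` library.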

# Adapter-Based Fine-Tuning - The third example implements:
    # Traditional adapter approaches (Pfeiffer, Houlsby)
    # IA³ (Infused Adapter by Inhibiting and Amplifying Inner Activations)
    # Adapter composition for combining multiple fine-tuning adaptations


import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from transformers.adapters import (
    AdapterConfig,
    PfeifferConfig,
    HoulsbyConfig,
    IA3Config
)

# --- Run configuration ---
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
DATASET_NAME = "your-dataset"  # Replace with actual dataset
OUTPUT_DIR = "./adapters_output"
ADAPTER_TYPE = "pfeiffer"  # Options: "pfeiffer", "houlsby", "ia3"

# Make sure the output location exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Tokenizer (fall back to EOS when no pad token is defined) ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Base model ---
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# --- Adapter configuration ---
# One lazily-evaluated builder per supported adapter type; only the selected
# one is called.
_adapter_config_builders = {
    # Pfeiffer adapters - add bottleneck adapters after attention block
    "pfeiffer": lambda: PfeifferConfig(
        reduction_factor=16,   # Size reduction for adapter bottleneck
        non_linearity="relu",  # Activation function
    ),
    # Houlsby adapters - add adapters both after attention and feed-forward
    "houlsby": lambda: HoulsbyConfig(
        reduction_factor=16,
        non_linearity="relu",
    ),
    # IA³ (Infused Adapter by Inhibiting and Amplifying Inner Activations)
    "ia3": lambda: IA3Config(),
}

if ADAPTER_TYPE not in _adapter_config_builders:
    raise ValueError(f"Adapter type {ADAPTER_TYPE} not supported.")
adapter_config = _adapter_config_builders[ADAPTER_TYPE]()

# Register the adapter on the model and activate it for training
adapter_name = "custom_task_adapter"
model.add_adapter(adapter_name, config=adapter_config)
model.train_adapter(adapter_name)

# Report how many parameters will actually receive gradients
trainable_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
all_params = sum(param.numel() for param in model.parameters())
print(f"Trainable parameters: {trainable_params} ({trainable_params/all_params:.2%})")

# Load and prepare dataset
def load_and_prepare_dataset():
    """Load and tokenize ``DATASET_NAME``; on any error, return random demo data.

    Returns:
        The tokenized dataset produced by ``dataset.map`` on success, or a
        dict with "train" (100 samples) and "validation" (first 10 of them)
        datasets of random token ids when loading or tokenizing fails.
    """
    kept_columns = ["input_ids", "attention_mask"]

    def _encode_batch(examples):
        # This example assumes a 'text' field in the dataset;
        # adapt to your dataset structure
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,  # Adjust based on your needs
        )

    def _random_demo_splits():
        # Imports stay local: they are only needed on the fallback path
        from datasets import Dataset
        import numpy as np

        sample_size = 100
        sample_length = 128

        # Each sample is BOS + random ids + EOS with an all-ones attention mask
        inputs = {"input_ids": [], "attention_mask": []}
        for _ in range(sample_size):
            random_ids = np.random.randint(100, 10000, sample_length - 2).tolist()
            inputs["input_ids"].append(
                [tokenizer.bos_token_id] + random_ids + [tokenizer.eos_token_id]
            )
            inputs["attention_mask"].append([1] * sample_length)

        # Validation reuses the first ten training samples
        head = {column: values[:10] for column, values in inputs.items()}
        return {
            "train": Dataset.from_dict(inputs),
            "validation": Dataset.from_dict(head),
        }

    try:
        dataset = load_dataset(DATASET_NAME)
        # Drop every original column that is not a model input
        droppable = [
            column for column in dataset["train"].column_names
            if column not in kept_columns
        ]
        return dataset.map(_encode_batch, batched=True, remove_columns=droppable)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Using a minimal example dataset for demonstration")
        return _random_demo_splits()

# Build the train/validation splits
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Collator for causal language modeling (no masked-LM objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # --- optimization ---
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    # --- batching / precision ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,  # Mixed precision training
    # --- evaluation and checkpoint cadence ---
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    # --- logging ---
    logging_steps=10,
    report_to="tensorboard",
)

# Assemble the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

# Run fine-tuning
print(f"Starting adapter-based fine-tuning with {ADAPTER_TYPE} adapters...")
trainer.train()

# Persist only the adapter weights
model.save_adapter(os.path.join(OUTPUT_DIR, adapter_name), adapter_name)

print(f"Adapter saved to {os.path.join(OUTPUT_DIR, adapter_name)}")

# Demonstrate adapter inference
def inference_with_adapter():
    """Generate from the same prompt with the saved adapter active, then with it deactivated."""
    # Fresh base model with the fine-tuned adapter loaded and activated
    inference_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    inference_model.load_adapter(os.path.join(OUTPUT_DIR, adapter_name))
    inference_model.set_active_adapters(adapter_name)

    prompt = "The future of artificial intelligence involves"
    inputs = tokenizer(prompt, return_tensors="pt")

    def _sample_completion():
        # Same sampling settings for both the adapter and the base run
        generated = inference_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    # With the adapter active
    response = _sample_completion()
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")

    # Compare with base model (no adapter)
    inference_model.set_active_adapters(None)  # Deactivate adapter
    response_base = _sample_completion()
    print(f"Base model response: {response_base}")

# Uncomment to test inference
# print("\nTesting inference with the fine-tuned adapter...")
# inference_with_adapter()

# Example of adapter composition (combining multiple adapters)
def adapter_composition_example():
    """Conceptual example: register two adapters, save each, then activate both.

    No training loop runs between ``train_adapter`` and ``save_adapter``; the
    placeholder comment marks where one would go. Uses the module-level
    ``MODEL_NAME``, ``OUTPUT_DIR`` and ``tokenizer`` and prints one generated
    continuation.
    """
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    domain_adapter = "domain_adapter"  # e.g., for domain adaptation
    task_adapter = "task_adapter"      # e.g., for task-specific adaptation

    # Same three steps for each adapter, domain adapter first
    for adapter_id in (domain_adapter, task_adapter):
        model.add_adapter(adapter_id, config=PfeifferConfig(reduction_factor=16))
        model.train_adapter(adapter_id)
        # ... training code here ...
        model.save_adapter(os.path.join(OUTPUT_DIR, adapter_id), adapter_id)

    # Activate both adapters together for inference
    model.set_active_adapters([domain_adapter, task_adapter])

    prompt = "The new technology enables"
    encoded = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=100,
    )
    response = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"Stacked adapters response: {response}")

# Note: This function is just an example and won't run properly without multiple trained adapters
# print("\nAdapter composition example (conceptual)...")
# adapter_composition_example()

> BitFit Fine-Tuning Implementation

In [None]:
# BitFit Fine-Tuning Implementation
# ================================
# BitFit only trains the bias terms in a pre-trained model, keeping all weights frozen

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Configuration
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
DATASET_NAME = "your-dataset"  # Replace with your actual dataset
OUTPUT_DIR = "./bitfit_output"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Tokenizer: reuse EOS as the padding token when the tokenizer defines none
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# BitFit: one pass over the parameters; a parameter stays trainable only when
# its name contains "bias", everything else is frozen
for name, param in model.named_parameters():
    param.requires_grad = "bias" in name

# Report how small the trainable subset is
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
all_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
print(f"Trainable parameters: {trainable_params} ({trainable_params/all_params:.2%} of all parameters)")

# Load and prepare dataset
def load_and_prepare_dataset():
    """Return tokenized ``train`` and ``validation`` splits for causal-LM training.

    First tries ``load_dataset(DATASET_NAME)`` and tokenizes its ``"text"``
    column with the module-level ``tokenizer`` (padded/truncated to 512
    tokens). If the loaded dataset has no ``"validation"`` split, 10% of
    ``"train"`` is held out for it. If anything in that path raises, the error
    is printed and a small synthetic dataset of random token ids is returned
    instead so the rest of the cell can still run.

    Returns:
        A mapping with ``"train"`` and ``"validation"`` entries whose rows
        carry ``input_ids`` and ``attention_mask``.
    """
    try:
        dataset = load_dataset(DATASET_NAME)

        def tokenize_function(examples):
            """Tokenize a batch of rows, fixed length 512."""
            return tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=512
            )

        # Drop every raw column so only the tokenizer outputs remain
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=[col for col in dataset["train"].column_names
                           if col not in ["input_ids", "attention_mask"]]
        )

        # The training code indexes dataset["validation"]; not every dataset
        # defines that split, so carve one out of "train" when it is missing.
        if "validation" not in tokenized_dataset:
            held_out = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
            tokenized_dataset["train"] = held_out["train"]
            tokenized_dataset["validation"] = held_out["test"]

        return tokenized_dataset

    except Exception as e:
        # Deliberate best-effort: keep the demo runnable without a real dataset
        print(f"Error loading dataset: {e}")
        print("Using a minimal example dataset for demonstration")

        from datasets import Dataset
        import numpy as np

        train_size = 100
        eval_size = 10
        sample_length = 128

        # Fixed seed so the synthetic data is identical on every run
        rng = np.random.default_rng(42)

        def random_rows(count):
            """Build ``count`` rows of BOS + random ids + EOS with an all-ones mask."""
            return {
                "input_ids": [
                    [tokenizer.bos_token_id]
                    + rng.integers(100, 10000, size=sample_length - 2).tolist()
                    + [tokenizer.eos_token_id]
                    for _ in range(count)
                ],
                "attention_mask": [[1] * sample_length for _ in range(count)],
            }

        # Validation rows are generated separately so they never overlap with
        # the training rows (previously they were the first 10 training rows).
        train_dataset = Dataset.from_dict(random_rows(train_size))
        eval_dataset = Dataset.from_dict(random_rows(eval_size))

        return {"train": train_dataset, "validation": eval_dataset}

# Load dataset
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Collator configured for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
# fp16 mixed precision needs a GPU; enabling it unconditionally makes
# TrainingArguments raise on CPU-only machines, so it follows CUDA availability.
# NOTE(review): with the 100-row synthetic fallback a run is only on the order
# of ten optimizer steps, so warmup_steps=500 and the 200-step eval/save
# intervals are never reached; these values are sized for a real dataset.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,  # Can use higher learning rates with BitFit
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    evaluation_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting BitFit fine-tuning...")
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded on its own
print("Saving BitFit fine-tuned model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Inference example
def inference_example():
    """Reload the model saved under OUTPUT_DIR/final_model and sample one continuation.

    Prints the prompt and the decoded generation; returns nothing.
    """
    checkpoint_dir = os.path.join(OUTPUT_DIR, "final_model")

    # Local model/tokenizer loaded from disk (these shadow the module-level ones)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    model.eval()

    prompt = "The future of technology will be shaped by"
    encoded = tokenizer(prompt, return_tensors="pt")

    sampling_options = dict(
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **sampling_options,
    )

    response = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")

# Uncomment to run an inference example
# print("Running inference with the BitFit fine-tuned model...")
# inference_example()

> Selective Layer Fine-Tuning Implementation

In [None]:
# Selective Layer Fine-Tuning Implementation
# =========================================
# This script demonstrates fine-tuning only specific layers of a pre-trained model

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Configuration
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
DATASET_NAME = "your-dataset"  # Replace with your actual dataset
OUTPUT_DIR = "./selective_layer_output"

# Fine-tuning options
NUM_LAYERS_TO_FREEZE = 8  # Number of layers to freeze from the bottom
# Or for top layers only:
# NUM_LAYERS_TO_UNFREEZE = 4  # Number of top layers to fine-tune

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Tokenizer: reuse EOS as the padding token when the tokenizer defines none
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# List every module name, one per line, to help pick layers to freeze
print("Model architecture:")
module_names = [module_name for module_name, _ in model.named_modules()]
print("\n".join(module_names))

# Helper function to freeze bottom layers
def freeze_bottom_layers(model, num_layers_to_freeze):
    """Freeze the embeddings and the first ``num_layers_to_freeze`` transformer blocks.

    Written for the GPT-2 layout (``model.transformer.wte`` / ``wpe`` / ``h``);
    adapt the attribute names for other architectures.

    Args:
        model: Model exposing ``transformer.wte``, ``transformer.wpe`` and the
            block list ``transformer.h``.
        num_layers_to_freeze: How many blocks to freeze, counted from index 0.

    Returns:
        The same ``model`` object, modified in place.

    Raises:
        ValueError: If ``num_layers_to_freeze`` is negative or larger than the
            number of blocks. The check runs before anything is frozen, so an
            invalid request leaves the model untouched.
    """
    blocks = model.transformer.h
    total_layers = len(blocks)
    if not 0 <= num_layers_to_freeze <= total_layers:
        raise ValueError(
            f"num_layers_to_freeze must be between 0 and {total_layers}, "
            f"got {num_layers_to_freeze}"
        )

    # Freeze token and position embeddings
    for param in model.transformer.wte.parameters():
        param.requires_grad = False

    for param in model.transformer.wpe.parameters():
        param.requires_grad = False

    # Freeze the requested number of blocks from the bottom
    for i in range(num_layers_to_freeze):
        for param in blocks[i].parameters():
            param.requires_grad = False

    # Report per-block status so the result can be checked by eye
    for i, layer in enumerate(blocks):
        trainable = any(param.requires_grad for param in layer.parameters())
        status = "Trainable" if trainable else "Frozen"
        print(f"Layer {i}: {status}")

    # Count trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params} ({trainable_params/all_params:.2%} of all parameters)")

    return model

# Alternative: Helper function to only unfreeze top layers
def unfreeze_top_layers(model, num_layers_to_unfreeze):
    """Freeze the whole model, then re-enable training for the last N blocks.

    Besides the last ``num_layers_to_unfreeze`` entries of
    ``model.transformer.h``, the final layer norm ``transformer.ln_f`` is made
    trainable, and ``lm_head`` too when the model has one and its config does
    not tie it to the input embeddings.

    Returns:
        The same ``model`` object, modified in place.
    """
    # Start from a fully frozen model
    model.requires_grad_(False)

    blocks = model.transformer.h
    total_layers = len(blocks)

    # Re-enable the top blocks, addressed by index
    for layer_index in range(total_layers - num_layers_to_unfreeze, total_layers):
        blocks[layer_index].requires_grad_(True)

    # Final layer norm sits after the last block and is always trained
    model.transformer.ln_f.requires_grad_(True)

    # A tied output head shares its weight with the (frozen) embeddings, so it
    # is only unfrozen when it is a separate layer
    has_separate_head = hasattr(model, 'lm_head') and not model.config.tie_word_embeddings
    if has_separate_head:
        model.lm_head.requires_grad_(True)

    # Per-block status report
    for layer_index, block in enumerate(blocks):
        if any(p.requires_grad for p in block.parameters()):
            status = "Trainable"
        else:
            status = "Frozen"
        print(f"Layer {layer_index}: {status}")

    # Trainable vs. total parameter count
    all_params = 0
    trainable_params = 0
    for p in model.parameters():
        all_params += p.numel()
        if p.requires_grad:
            trainable_params += p.numel()
    print(f"Trainable parameters: {trainable_params} ({trainable_params/all_params:.2%} of all parameters)")

    return model

# Apply selective layer freezing: embeddings plus the first NUM_LAYERS_TO_FREEZE
# blocks are frozen in place; the call also prints per-layer status and the
# trainable-parameter count.
print(f"Freezing bottom {NUM_LAYERS_TO_FREEZE} layers...")
model = freeze_bottom_layers(model, NUM_LAYERS_TO_FREEZE)

# If you want to use the alternative approach, uncomment this
# (and the NUM_LAYERS_TO_UNFREEZE constant in the configuration section):
# print(f"Unfreezing only the top {NUM_LAYERS_TO_UNFREEZE} layers...")
# model = unfreeze_top_layers(model, NUM_LAYERS_TO_UNFREEZE)

# Load and prepare dataset
def load_and_prepare_dataset():
    """Return tokenized ``train`` and ``validation`` splits for causal-LM training.

    First tries ``load_dataset(DATASET_NAME)`` and tokenizes its ``"text"``
    column with the module-level ``tokenizer`` (padded/truncated to 512
    tokens). If the loaded dataset has no ``"validation"`` split, 10% of
    ``"train"`` is held out for it. If anything in that path raises, the error
    is printed and a small synthetic dataset of random token ids is returned
    instead so the rest of the cell can still run.

    Returns:
        A mapping with ``"train"`` and ``"validation"`` entries whose rows
        carry ``input_ids`` and ``attention_mask``.
    """
    try:
        dataset = load_dataset(DATASET_NAME)

        def tokenize_function(examples):
            """Tokenize a batch of rows, fixed length 512."""
            return tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=512
            )

        # Drop every raw column so only the tokenizer outputs remain
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=[col for col in dataset["train"].column_names
                           if col not in ["input_ids", "attention_mask"]]
        )

        # The training code indexes dataset["validation"]; not every dataset
        # defines that split, so carve one out of "train" when it is missing.
        if "validation" not in tokenized_dataset:
            held_out = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
            tokenized_dataset["train"] = held_out["train"]
            tokenized_dataset["validation"] = held_out["test"]

        return tokenized_dataset

    except Exception as e:
        # Deliberate best-effort: keep the demo runnable without a real dataset
        print(f"Error loading dataset: {e}")
        print("Using a minimal example dataset for demonstration")

        from datasets import Dataset
        import numpy as np

        train_size = 100
        eval_size = 10
        sample_length = 128

        # Fixed seed so the synthetic data is identical on every run
        rng = np.random.default_rng(42)

        def random_rows(count):
            """Build ``count`` rows of BOS + random ids + EOS with an all-ones mask."""
            return {
                "input_ids": [
                    [tokenizer.bos_token_id]
                    + rng.integers(100, 10000, size=sample_length - 2).tolist()
                    + [tokenizer.eos_token_id]
                    for _ in range(count)
                ],
                "attention_mask": [[1] * sample_length for _ in range(count)],
            }

        # Validation rows are generated separately so they never overlap with
        # the training rows (previously they were the first 10 training rows).
        train_dataset = Dataset.from_dict(random_rows(train_size))
        eval_dataset = Dataset.from_dict(random_rows(eval_size))

        return {"train": train_dataset, "validation": eval_dataset}

# Load dataset
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Collator configured for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
# fp16 mixed precision needs a GPU; enabling it unconditionally makes
# TrainingArguments raise on CPU-only machines, so it follows CUDA availability.
# NOTE(review): with the 100-row synthetic fallback a run is only on the order
# of ten optimizer steps, so warmup_steps=500 and the 200-step eval/save
# intervals are never reached; these values are sized for a real dataset.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    evaluation_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting selective layer fine-tuning...")
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded on its own
print("Saving selectively fine-tuned model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Inference example
def inference_example():
    """Reload the model saved under OUTPUT_DIR/final_model and sample one continuation.

    Prints the prompt and the decoded generation; returns nothing.
    """
    checkpoint_dir = os.path.join(OUTPUT_DIR, "final_model")

    # Local model/tokenizer loaded from disk (these shadow the module-level ones)
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    model.eval()

    prompt = "In the coming decades, artificial intelligence will"
    encoded = tokenizer(prompt, return_tensors="pt")

    sampling_options = dict(
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **sampling_options,
    )

    response = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")

# Uncomment to run an inference example
# print("Running inference with the selectively fine-tuned model...")
# inference_example()

> Multi-Task Fine-Tuning Implementation

In [None]:
# Multi-Task Fine-Tuning Implementation
# ====================================
# This script demonstrates fine-tuning a model on multiple tasks simultaneously

import os
import torch
import numpy as np
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType

# Configuration
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
OUTPUT_DIR = "./multitask_output"

# Tasks configuration - replace with your actual datasets and task formats.
# Each training example is rendered as: instruction + input + separator + target.
TASKS = {
    "summarization": {
        "dataset": "your-summarization-dataset",  # Replace with actual dataset
        "instruction": "Summarize the following text: ",
        "separator": "\nSummary: "
    },
    "sentiment": {
        "dataset": "your-sentiment-dataset",  # Replace with actual dataset
        "instruction": "Analyze the sentiment of the following text: ",
        "separator": "\nSentiment: "
    },
    "qa": {
        "dataset": "your-qa-dataset",  # Replace with actual dataset
        "instruction": "Answer the following question: ",
        "separator": "\nAnswer: "
    }
}

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load tokenizer; reuse EOS as the padding token when none is defined
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Optionally apply LoRA for parameter-efficient fine-tuning
USE_LORA = True  # Set to False for full fine-tuning

if USE_LORA:
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["c_attn", "c_proj"],  # Adapt these target modules to your model architecture
        # GPT-2's c_attn / c_proj are Conv1D layers that store weights as
        # (fan_in, fan_out); declare that explicitly instead of relying on PEFT
        # to correct the setting with a warning. Set back to False when
        # targeting regular nn.Linear layers in another architecture.
        fan_in_fan_out=True,
    )

    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

# Function to create a multi-task dataset
def _build_synthetic_task_texts(task_name, task_config, sample_size, rng):
    """Return ``sample_size`` instruction-formatted training strings for one task.

    Each string is ``instruction + input + separator + target``.

    Raises:
        ValueError: If no synthetic generator exists for ``task_name``.
    """
    if task_name == "summarization":
        pairs = [
            (
                f"Long document {i} with lots of information that needs to be summarized. It contains multiple sentences and details about various topics.",
                f"Concise summary of document {i}.",
            )
            for i in range(sample_size)
        ]
    elif task_name == "sentiment":
        sentiments = rng.choice(["positive", "negative", "neutral"], size=sample_size)
        pairs = [
            (f"Sample review text {i} expressing an opinion.", sentiment)
            for i, sentiment in enumerate(sentiments)
        ]
    elif task_name == "qa":
        pairs = [
            (f"Question {i} about a specific topic?", f"Detailed answer to question {i}.")
            for i in range(sample_size)
        ]
    else:
        # Without this branch an unknown task would silently reuse the texts
        # built for the previous task in the loop.
        raise ValueError(f"No synthetic data generator for task '{task_name}'")

    return [
        f"{task_config['instruction']}{source}{task_config['separator']}{target}"
        for source, target in pairs
    ]


def create_multitask_dataset():
    """Build one combined train/validation dataset covering every task in ``TASKS``.

    For demonstration purposes the data is synthetic: 100 formatted examples
    per task, split 90/10 and concatenated across tasks. To use real data,
    replace the synthetic generator with ``load_dataset(task_config["dataset"])``
    and format the rows with the task's instruction and separator.

    A task whose data cannot be built is reported and skipped. If no task
    succeeds (or combining fails), a three-sentence dummy dataset is returned
    instead.

    Returns:
        ``DatasetDict`` with ``"train"`` and ``"validation"`` splits, each
        holding a single ``"text"`` column.
    """
    from datasets import Dataset

    sample_size = 100
    seed = 42
    # Fixed seed: the sampled sentiment labels and the splits are reproducible
    rng = np.random.default_rng(seed)

    try:
        all_datasets = {}

        for task_name, task_config in TASKS.items():
            try:
                print(f"Creating synthetic data for {task_name} task...")

                formatted_texts = _build_synthetic_task_texts(
                    task_name, task_config, sample_size, rng
                )

                # Split into train and validation
                split = Dataset.from_dict({"text": formatted_texts}).train_test_split(
                    test_size=0.1, seed=seed
                )
                all_datasets[task_name] = DatasetDict({
                    "train": split["train"],
                    "validation": split["test"]
                })

            except Exception as e:
                # Best-effort per task: report and continue with the others
                print(f"Error creating dataset for {task_name}: {e}")

        if not all_datasets:
            raise ValueError("No datasets could be created")

        # Combine all tasks into one dataset per split
        return DatasetDict({
            "train": concatenate_datasets([ds["train"] for ds in all_datasets.values()]),
            "validation": concatenate_datasets([ds["validation"] for ds in all_datasets.values()])
        })

    except Exception as e:
        print(f"Error creating multitask dataset: {e}")
        print("Using a minimal example dataset for demonstration")

        # Fallback minimal dataset if everything else fails
        sample_texts = [
            "This is an example for task A.",
            "This is another example for task B.",
            "A third example for task C."
        ]

        dummy_dataset = Dataset.from_dict({"text": sample_texts})
        dummy_split = dummy_dataset.train_test_split(test_size=0.2, seed=seed)

        return DatasetDict({
            "train": dummy_split["train"],
            "validation": dummy_split["test"]
        })

# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Every row is padded or truncated to exactly 512 tokens.
    """
    fixed_length_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **fixed_length_options)

# Load and prepare multitask dataset
print("Creating and preparing multitask dataset...")
dataset = create_multitask_dataset()

# Tokenize the dataset; the raw "text" column is dropped afterwards
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

print(f"Train dataset size: {len(tokenized_dataset['train'])}")
print(f"Validation dataset size: {len(tokenized_dataset['validation'])}")

# Collator configured for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
# - The synthetic dataset has 270 training rows, i.e. roughly 25 optimizer
#   steps in total on a single device with batch 8 x accumulation 4 x 3 epochs.
#   A fixed 500-step warmup would never finish and 200-step eval/save intervals
#   would never fire, so warmup is a fraction of the run and evaluation and
#   checkpointing happen once per epoch. Both scale to larger datasets.
# - fp16 mixed precision needs a GPU; enabling it unconditionally makes
#   TrainingArguments raise on CPU-only machines.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting multi-task fine-tuning...")
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded on its own
print("Saving multi-task fine-tuned model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Inference examples for different tasks
def inference_examples():
    """Reload the multi-task model and sample one response per task in ``TASKS``.

    With ``USE_LORA`` the base model is loaded from ``MODEL_NAME`` and the
    saved adapter from ``OUTPUT_DIR/final_model`` is attached to it; otherwise
    the full model is loaded from that directory. Each prompt is built as
    ``instruction + example input + separator``. A task without an example
    input is reported and skipped. Prints prompts and generations; returns
    nothing.
    """
    final_model_dir = os.path.join(OUTPUT_DIR, "final_model")

    # Load the fine-tuned model
    if USE_LORA:
        from peft import PeftModel

        base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        model = PeftModel.from_pretrained(base_model, final_model_dir)
    else:
        model = AutoModelForCausalLM.from_pretrained(final_model_dir)

    tokenizer = AutoTokenizer.from_pretrained(final_model_dir)

    # Set the model to evaluation mode
    model.eval()

    # One hand-written input per known task
    example_inputs = {
        "summarization": "The researchers conducted a comprehensive study on climate change impacts across different regions. They found that coastal areas are particularly vulnerable to rising sea levels, while inland agricultural zones face increased drought risks. The report highlights the need for adaptive strategies tailored to local conditions.",
        "sentiment": "The new restaurant had amazing food but the service was extremely slow and the prices were too high for what they offered.",
        "qa": "What are the main advantages of transformer-based language models compared to RNNs?",
    }

    for task_name, task_config in TASKS.items():
        print(f"\nTesting {task_name} task:")

        example_input = example_inputs.get(task_name)
        if example_input is None:
            # Previously an unknown task either crashed on an unbound `prompt`
            # or silently reused the prompt of the preceding task.
            print(f"No example input defined for task '{task_name}'; skipping.")
            continue

        prompt = f"{task_config['instruction']}{example_input}{task_config['separator']}"

        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt")

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {prompt}")
        print(f"Generated response: {response}")

# Uncomment to run inference examples
# print("\nRunning inference with the multi-task fine-tuned model...")
# inference_examples()

# Example of how to evaluate on specific tasks
def evaluate_on_specific_task(task_name):
    """Placeholder for per-task evaluation of the multi-task model.

    Only prints a start and a completion message for ``task_name``; no
    evaluation is run. The commented sketch below outlines what a real
    implementation would do.
    """
    start_message = f"Evaluating model performance on {task_name} task..."
    print(start_message)

    # In a real scenario, you would load a task-specific evaluation dataset
    # and implement appropriate evaluation metrics.
    #
    # Example code (not functional without actual task datasets):
    #
    #     # Load task-specific test dataset
    #     task_test_dataset = load_dataset(TASKS[task_name]["dataset"], split="test")
    #
    #     # Format according to the task template
    #     def format_for_task(example):
    #         formatted_text = f"{TASKS[task_name]['instruction']}{example['input']}{TASKS[task_name]['separator']}"
    #         return {"text": formatted_text, "label": example["output"]}
    #
    #     test_dataset = task_test_dataset.map(format_for_task)
    #
    #     # Load model and tokenizer
    #     model = AutoModelForCausalLM.from_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    #     tokenizer = AutoTokenizer.from_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    #
    #     # Implement task-specific evaluation logic
    #     # e.g., for summarization: ROUGE scores
    #     # for sentiment: accuracy, F1 score
    #     # for QA: exact match, F1 score

    done_message = f"Evaluation on {task_name} completed."
    print(done_message)

# Example usage (commented out)
# for task_name in TASKS.keys():
#     evaluate_on_specific_task(task_name)

> Knowledge Distillation Fine-Tuning Implementation

In [None]:
# Knowledge Distillation Fine-Tuning Implementation
# ===============================================
# This script demonstrates knowledge distillation from a larger teacher model to a smaller student model

import os
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Configuration
TEACHER_MODEL_NAME = "gpt2-large"  # Larger teacher model
STUDENT_MODEL_NAME = "gpt2"        # Smaller student model
DATASET_NAME = "your-dataset"      # Replace with your actual dataset
OUTPUT_DIR = "./distillation_output"

# Distillation parameters
ALPHA = 0.5  # Weight for distillation loss vs task-specific loss (0 to 1)
TEMPERATURE = 2.0  # Temperature for softening probability distributions

os.makedirs(OUTPUT_DIR, exist_ok=True)


def _load_tokenizer_with_pad(model_name):
    """Load a tokenizer and reuse EOS as the pad token when none is defined."""
    loaded = AutoTokenizer.from_pretrained(model_name)
    if loaded.pad_token is None:
        loaded.pad_token = loaded.eos_token
    return loaded


def _count_parameters(module):
    """Total number of elements over all parameters of ``module``."""
    return sum(p.numel() for p in module.parameters())


# Tokenizers
teacher_tokenizer = _load_tokenizer_with_pad(TEACHER_MODEL_NAME)
student_tokenizer = _load_tokenizer_with_pad(STUDENT_MODEL_NAME)

# Models; the teacher is put in evaluation mode right after loading
print("Loading teacher model...")
teacher_model = AutoModelForCausalLM.from_pretrained(TEACHER_MODEL_NAME)
teacher_model.eval()

print("Loading student model...")
student_model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL_NAME)

# Size comparison
teacher_params = _count_parameters(teacher_model)
student_params = _count_parameters(student_model)
print(f"Teacher model parameters: {teacher_params:,}")
print(f"Student model parameters: {student_params:,}")
print(f"Compression ratio: {teacher_params / student_params:.2f}x")

# Custom distillation trainer
class DistillationTrainer(Trainer):
    """Trainer whose loss mixes next-token cross-entropy with a teacher KL term.

    ``loss = (1 - alpha) * CE + alpha * temperature**2 * KL(teacher || student)``

    Both terms are computed on next-token predictions (logits at position t
    against the token at t + 1) and only over positions whose label is not
    ``-100``.

    Args:
        teacher_model: Model that provides the soft targets. Required. Its
            logits must have the same vocabulary dimension as the student's.
        alpha: Weight of the distillation term, between 0 and 1.
        temperature: Softening temperature applied to both logit sets.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``teacher_model`` is not given.
    """

    def __init__(self, *args, teacher_model=None, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a teacher_model")

        self.teacher_model = teacher_model
        self.alpha = alpha  # Weight for distillation loss
        self.temperature = temperature  # Temperature for softening distributions

        # Trainer only places the student on the training device; the teacher
        # must be moved explicitly or its forward pass fails on GPU batches.
        self.teacher_model.to(self.args.device)
        # The teacher is a fixed reference: no dropout, no gradients.
        self.teacher_model.eval()
        for param in self.teacher_model.parameters():
            param.requires_grad_(False)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined CE + KD loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Batch with ``input_ids`` and optionally ``attention_mask``
                and ``labels`` (``-100`` marks positions to ignore).
            return_outputs: When true, return ``(loss, student_outputs)``.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass it; not used by this loss.
        """
        labels = inputs.get("labels")
        if labels is None:
            # No labels supplied: predict the input itself, ignoring padding
            labels = inputs["input_ids"].clone()
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                labels = labels.masked_fill(attention_mask == 0, -100)

        # Forward without labels so neither model computes its own LM loss
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}

        student_outputs = model(**model_inputs)
        student_logits = student_outputs.logits

        with torch.no_grad():
            teacher_logits = self.teacher_model(**model_inputs).logits

        # Causal LM alignment: logits at position t predict the token at t + 1.
        # The shift is applied in every case, not only when labels were missing.
        shift_student = student_logits[:, :-1, :].contiguous()
        shift_teacher = teacher_logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous()

        vocab_size = shift_student.size(-1)
        flat_student = shift_student.view(-1, vocab_size)
        flat_teacher = shift_teacher.view(-1, shift_teacher.size(-1))
        flat_labels = shift_labels.view(-1)

        # Standard cross-entropy loss on the hard labels
        loss_ce = F.cross_entropy(flat_student, flat_labels, ignore_index=-100)

        # Distillation loss only where a label exists, so padded positions do
        # not contribute to (or dilute) the KL average
        valid = flat_labels != -100
        soft_student = F.log_softmax(flat_student[valid] / self.temperature, dim=-1)
        soft_teacher = F.softmax(flat_teacher[valid] / self.temperature, dim=-1)

        # T^2 keeps the gradient scale of the soft term comparable across temperatures
        loss_kd = F.kl_div(
            soft_student,
            soft_teacher,
            reduction='batchmean'
        ) * (self.temperature ** 2)

        # Combined loss
        loss = (1 - self.alpha) * loss_ce + self.alpha * loss_kd

        return (loss, student_outputs) if return_outputs else loss

# Load and prepare dataset
def load_and_prepare_dataset():
    """
    Load and tokenize DATASET_NAME, or fall back to a random demo dataset.

    Returns:
        The tokenized dataset returned by ``dataset.map`` on success. If
        anything in loading/tokenizing raises, a dict with ``"train"``
        (100 random sequences of length 128) and ``"validation"`` (the first
        10 of those same sequences).
    """
    def _encode(batch):
        # Use teacher tokenizer for consistent tokenization
        return teacher_tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    try:
        raw = load_dataset(DATASET_NAME)
        model_columns = ("input_ids", "attention_mask")
        # Drop every original column that is not a model input
        droppable = [name for name in raw["train"].column_names if name not in model_columns]
        return raw.map(_encode, batched=True, remove_columns=droppable)

    except Exception as exc:
        print(f"Error loading dataset: {exc}")
        print("Using a minimal example dataset for demonstration")

        # Create a minimal example dataset
        from datasets import Dataset
        import numpy as np

        n_samples, seq_len = 100, 128

        # Each row: BOS + random token ids + EOS, fully attended
        id_rows = [
            [teacher_tokenizer.bos_token_id]
            + np.random.randint(100, 10000, seq_len - 2).tolist()
            + [teacher_tokenizer.eos_token_id]
            for _ in range(n_samples)
        ]
        mask_rows = [[1] * seq_len for _ in range(n_samples)]
        columns = {"input_ids": id_rows, "attention_mask": mask_rows}

        # Validation reuses the first 10 training rows
        head = {key: rows[:10] for key, rows in columns.items()}
        return {
            "train": Dataset.from_dict(columns),
            "validation": Dataset.from_dict(head),
        }

# Load dataset
# Result is indexed below by "train" and "validation"; the synthetic fallback
# always provides both keys.
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Data collator
# mlm=False selects the causal-LM collation mode.
# NOTE(review): the data was tokenized with teacher_tokenizer but is collated
# with student_tokenizer — presumably both share vocabulary and pad token; confirm.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=student_tokenizer,
    mlm=False
)

# Training arguments
# Effective batch size per device = 4 * 8 accumulation steps = 32 sequences.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Smaller batch size due to both models in memory
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=3,
    fp16=True,  # Mixed precision training
    report_to="tensorboard",
)

# Initialize Distillation Trainer
# teacher_model / alpha / temperature are the extra arguments consumed by
# DistillationTrainer on top of the regular Trainer arguments.
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    teacher_model=teacher_model,
    alpha=ALPHA,
    temperature=TEMPERATURE
)

# Train the model
print("Starting knowledge distillation fine-tuning...")
trainer.train()

# Save the distilled student model
# Model and tokenizer go to the same directory so it can later be reloaded
# with from_pretrained (see evaluate_models / benchmark_inference_speed).
print("Saving distilled student model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
student_tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Evaluate the teacher and student models
def evaluate_models():
    """
    Print side-by-side sampled continuations from the teacher and the saved student.

    The student is reloaded from ``OUTPUT_DIR/final_model``. For each prompt the
    global torch RNG is re-seeded with 42 immediately before every ``generate``
    call, so teacher and student sample with the same random stream.

    Side effects: moves ``teacher_model`` to the selected device, puts both
    models in eval mode, resets the global torch seed, and prints to stdout.
    """
    # Load the saved student model
    distilled_model = AutoModelForCausalLM.from_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

    # Move models to appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher_model.to(device)
    distilled_model.to(device)

    # Set models to evaluation mode
    teacher_model.eval()
    distilled_model.eval()

    # Sample prompts for evaluation
    prompts = [
        "The future of artificial intelligence",
        "Climate change is affecting",
        "The benefits of renewable energy include"
    ]

    # Shared sampling settings. `generate` has no `seed` argument (unknown
    # keyword arguments are rejected), so reproducibility is handled through
    # torch.manual_seed below instead.
    generation_kwargs = {
        "max_new_tokens": 50,
        "temperature": 0.7,
        "do_sample": True,
    }
    seed = 42

    print("\nComparing teacher and distilled student model outputs:")

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")

        # Tokenize input
        inputs = teacher_tokenizer(prompt, return_tensors="pt").to(device)

        # Generate from teacher model
        torch.manual_seed(seed)  # Use same seed for fair comparison
        with torch.no_grad():
            teacher_outputs = teacher_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generation_kwargs
            )

        teacher_text = teacher_tokenizer.decode(teacher_outputs[0], skip_special_tokens=True)

        # Generate from student model
        torch.manual_seed(seed)  # Use same seed for fair comparison
        with torch.no_grad():
            student_outputs = distilled_model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                **generation_kwargs
            )

        student_text = student_tokenizer.decode(student_outputs[0], skip_special_tokens=True)

        print(f"Teacher: {teacher_text}")
        print(f"Student: {student_text}")

    # You could add quantitative evaluation here (perplexity, BLEU, etc.)

# Uncomment to evaluate the models
# print("\nEvaluating teacher and distilled student models...")
# evaluate_models()

# Optional: Measure inference speed comparison
def benchmark_inference_speed():
    """
    Compare average ``generate`` latency of the teacher and the saved student.

    Runs 3 untimed warm-up rounds for both models, then times 5 generations per
    model (up to 100 new tokens each) and prints the averages and the speedup.

    Timing uses ``time.perf_counter`` and, on CUDA, synchronizes the device
    before starting and before stopping the clock; without that, asynchronous
    kernel launches make GPU timings meaningless.
    """
    import time

    # Load the saved student model
    distilled_model = AutoModelForCausalLM.from_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

    # Move models to appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher_model.to(device)
    distilled_model.to(device)

    # Set models to evaluation mode
    teacher_model.eval()
    distilled_model.eval()

    # Generate a longer sequence for meaningful timing
    prompt = "The history of artificial intelligence spans several decades, beginning with"
    inputs = teacher_tokenizer(prompt, return_tensors="pt").to(device)

    def _synchronize():
        # Wait for queued GPU work so wall-clock time covers the whole generation
        if device.type == "cuda":
            torch.cuda.synchronize()

    def _generate(model):
        # One generation pass with the shared prompt
        with torch.no_grad():
            model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=100
            )

    def _average_generation_time(model, runs=5):
        # Mean wall-clock seconds per generation over `runs` repetitions
        timings = []
        for _ in range(runs):
            _synchronize()
            start_time = time.perf_counter()
            _generate(model)
            _synchronize()
            timings.append(time.perf_counter() - start_time)
        return sum(timings) / len(timings)

    # Warm-up runs
    for _ in range(3):
        _generate(teacher_model)
        _generate(distilled_model)

    # Calculate average times
    avg_teacher_time = _average_generation_time(teacher_model)
    avg_student_time = _average_generation_time(distilled_model)

    print(f"\nInference Speed Benchmark:")
    print(f"Teacher model average generation time: {avg_teacher_time:.4f} seconds")
    print(f"Student model average generation time: {avg_student_time:.4f} seconds")
    print(f"Speedup: {avg_teacher_time / avg_student_time:.2f}x")

# Uncomment to benchmark inference speed
# print("\nBenchmarking inference speed...")
# benchmark_inference_speed()

> DoRA (Weight-Decomposed Low-Rank Adaptation) Implementation

In [None]:
# DoRA (Weight-Decomposed Low-Rank Adaptation) Implementation
# ==========================================================
# This implements DoRA (Weight-Decomposed Low-Rank Adaptation) for fine-tuning LLMs
# DoRA decomposes weights into magnitude and direction components and adapts them separately

import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Configuration
# NOTE: these module-level names (DATASET_NAME, OUTPUT_DIR, ALPHA) are also
# read by the distillation cell above; running this cell rebinds them, so
# re-running the earlier cell afterwards would pick up the DoRA values.
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
DATASET_NAME = "your-dataset"  # Replace with your actual dataset
OUTPUT_DIR = "./dora_output"

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# DoRA parameters
RANK = 8  # Rank for low-rank updates
ALPHA = 16  # Scaling factor; the low-rank update is scaled by ALPHA / RANK
# DoRA implementation for transformers
class DoRAModule(nn.Module):
    """
    DoRA (Weight-Decomposed Low-Rank Adaptation) replacement for a linear layer.

    The frozen base weight ``W0`` (viewed as ``(out_features, in_features)``)
    is adapted as::

        W' = (||W0|| + magnitude_delta) * (W0 + dW) / ||W0 + dW||
        dW = (lora_A @ lora_B) * (alpha / rank)

    where norms are taken per output row. ``lora_B`` starts at zero, so the
    module initially reproduces the original layer exactly.

    Args:
        weight: Base weight (normally the replaced layer's ``nn.Parameter``).
        rank: Rank of the low-rank update.
        alpha: LoRA scaling factor; the update is scaled by ``alpha / rank``.
        module_name: Qualified name of the replaced layer.
        target_modules: Substrings selecting adapted modules. ``None`` adapts
            unconditionally; if ``module_name`` matches none of them the module
            behaves as a plain linear layer.
        bias: Optional bias of the replaced layer, applied in ``forward``.
        fan_in_fan_out: True when ``weight`` is stored as
            ``(in_features, out_features)`` (e.g. a ``Conv1D``-style layer).
    """
    def __init__(self, weight, rank=8, alpha=16, module_name="", target_modules=None,
                 bias=None, fan_in_fan_out=False):
        super().__init__()
        self.weight = weight  # Original frozen weight
        self.bias = bias  # Kept so the replaced layer's bias is not lost
        self.module_name = module_name
        self.fan_in_fan_out = fan_in_fan_out

        # Determine if this module should be adapted with DoRA
        self.is_dora = target_modules is None or any(tm in module_name for tm in target_modules)
        if not self.is_dora:
            # Skip DoRA for modules not in target_modules
            return

        if weight.dim() != 2:
            raise ValueError(
                f"DoRAModule only supports 2-D weights, got shape {tuple(weight.shape)}"
            )
        if rank <= 0:
            raise ValueError(f"rank must be positive, got {rank}")

        # Decompose the weight: one magnitude per output row. The same axis is
        # used here and in forward() so magnitude and direction always agree.
        with torch.no_grad():
            base = self._base_weight()
            magnitude = torch.norm(base, dim=1, keepdim=True)
        out_features, in_features = base.shape

        # Constant part of the magnitude; non-persistent because it is fully
        # determined by the frozen weight.
        self.register_buffer("base_magnitude", magnitude.clone(), persistent=False)

        # Initialize DoRA parameters
        self.magnitude_delta = nn.Parameter(torch.zeros_like(magnitude))  # Magnitude shift

        # Low-rank direction adapters, created on the weight's device/dtype
        factory = {"dtype": weight.dtype, "device": weight.device}
        self.lora_A = nn.Parameter(torch.zeros((out_features, rank), **factory))
        self.lora_B = nn.Parameter(torch.zeros((rank, in_features), **factory))

        # Initialize LoRA weights (B = 0 keeps the initial update at zero)
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

        self.alpha = alpha
        self.scaling = alpha / rank
        self.rank = rank

    def _base_weight(self):
        """Return the frozen weight laid out as ``(out_features, in_features)``."""
        return self.weight.t() if self.fan_in_fan_out else self.weight

    def forward(self, x):
        """Apply the (adapted) linear map to ``x`` along its last dimension."""
        base = self._base_weight()
        if not self.is_dora:
            # Regular forward pass for non-DoRA modules
            return F.linear(x, base, self.bias)

        # Low-rank direction update
        direction_delta = (self.lora_A @ self.lora_B) * self.scaling
        combined = base + direction_delta

        # Row norms of the updated weight. Detached so the normalisation acts
        # as a constant during backprop (the gradient-cost reduction described
        # in the DoRA paper); clamp avoids division by zero for all-zero rows.
        norm = combined.norm(dim=1, keepdim=True).detach().clamp_min(1e-6)

        # Learned magnitude = original row norm + trainable shift
        magnitude = self.base_magnitude + self.magnitude_delta

        # Combine for final adapted weight
        adapted_weight = magnitude * combined / norm

        # Apply the adapted weight
        return F.linear(x, adapted_weight, self.bias)

# DoRA wrapper for a model
class DoRAModel(nn.Module):
    """
    Wrap ``model`` so that selected linear layers are replaced by DoRAModule.

    All pre-existing parameters of ``model`` are frozen; only the DoRA
    parameters (``magnitude_delta``, ``lora_A``, ``lora_B``) remain trainable.

    Args:
        model: Model to adapt (modified in place).
        rank: Rank passed to every DoRAModule.
        alpha: Scaling factor passed to every DoRAModule.
        target_modules: Name substrings of layers to adapt. Layers must be
            ``nn.Linear`` or a class named ``Conv1D`` (weight stored as
            ``(in, out)``, as used by GPT-2 style models).
    """
    def __init__(self, model, rank=8, alpha=16, target_modules=None):
        super().__init__()
        self.model = model
        self.rank = rank
        self.alpha = alpha
        self.target_modules = target_modules or [
            "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"
        ]

        # Freeze the original model weights BEFORE inserting DoRA modules.
        # Freezing afterwards would also freeze the newly added adapters,
        # because they become submodules of self.model.
        for param in self.model.parameters():
            param.requires_grad = False

        # Replace target layers with DoRA modules
        self._replace_layers()

    @staticmethod
    def _is_fan_in_fan_out(module):
        """True for Conv1D-style layers whose weight is stored as (in, out)."""
        return type(module).__name__ == "Conv1D"

    def _replace_layers(self):
        """Swap every matching linear-like layer for a DoRAModule."""
        # Collect matches first: mutating the module tree while iterating
        # named_modules() is unsafe.
        matches = []
        for name, module in self.model.named_modules():
            if not name or not any(target in name for target in self.target_modules):
                continue
            is_linear = isinstance(module, nn.Linear)
            is_conv1d = (
                self._is_fan_in_fan_out(module)
                and getattr(module, "weight", None) is not None
                and module.weight.dim() == 2
            )
            if is_linear or is_conv1d:
                matches.append((name, module, is_conv1d))

        for name, module, is_conv1d in matches:
            # Create a DoRA wrapper for this layer
            dora_layer = DoRAModule(
                module.weight,
                rank=self.rank,
                alpha=self.alpha,
                module_name=name,
                target_modules=self.target_modules,
                bias=getattr(module, "bias", None),
                fan_in_fan_out=is_conv1d,
            )

            # Replace the original layer on its parent module
            parent_name, _, child_name = name.rpartition(".")
            parent = self.model.get_submodule(parent_name) if parent_name else self.model
            setattr(parent, child_name, dora_layer)

    def forward(self, *args, **kwargs):
        """Delegate to the wrapped model."""
        return self.model(*args, **kwargs)

    def print_trainable_parameters(self):
        """Calculate and print the number of trainable parameters"""
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in self.parameters())
        # Guard the ratio so a parameter-free model does not raise
        ratio = trainable_params / all_params if all_params else 0.0
        print(f"Trainable parameters: {trainable_params:,} ({ratio:.2%})")

        # Breaking down by parameter types
        magnitude_params = sum(p.numel() for n, p in self.named_parameters()
                              if p.requires_grad and "magnitude_delta" in n)
        lora_params = sum(p.numel() for n, p in self.named_parameters()
                         if p.requires_grad and ("lora_A" in n or "lora_B" in n))

        print(f"Magnitude parameters: {magnitude_params:,}")
        print(f"Direction (LoRA) parameters: {lora_params:,}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Padding to max_length is used during tokenization below, so a pad token is
# required; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Wrap model with DoRA
# target_modules are matched as substrings of module names by DoRAModel.
# NOTE(review): GPT-2 presumably implements these projections with
# transformers' Conv1D rather than nn.Linear — confirm the wrapper actually
# matches them (the trainable-parameter printout below should be non-zero).
print("Applying DoRA to model...")
model = DoRAModel(
    base_model,
    rank=RANK,
    alpha=ALPHA,
    target_modules=["c_attn", "c_proj", "c_fc", "attn.c_proj"]  # Adjust for GPT-2 architecture
)

# Show trainable parameter count
model.print_trainable_parameters()

# Load and prepare dataset
def load_and_prepare_dataset():
    """
    Tokenize DATASET_NAME for DoRA fine-tuning, with a synthetic fallback.

    Returns:
        The mapped (tokenized) dataset when loading succeeds; otherwise a dict
        holding a 100-row random ``"train"`` dataset and a ``"validation"``
        dataset made of its first 10 rows.
    """
    max_tokens = 512
    kept_columns = ["input_ids", "attention_mask"]

    try:
        dataset = load_dataset(DATASET_NAME)

        def _tokenize_batch(examples):
            # Pad/truncate every example to exactly `max_tokens` tokens
            return tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=max_tokens
            )

        obsolete = [col for col in dataset["train"].column_names if col not in kept_columns]
        tokenized = dataset.map(_tokenize_batch, batched=True, remove_columns=obsolete)
        return tokenized

    except Exception as error:
        print(f"Error loading dataset: {error}")
        print("Using a minimal example dataset for demonstration")

        # Create a minimal example dataset
        from datasets import Dataset
        import numpy as np

        sample_size = 100
        sample_length = 128

        def _random_sequence():
            # BOS, then random ids in [100, 10000), then EOS
            body = np.random.randint(100, 10000, sample_length - 2).tolist()
            return [tokenizer.bos_token_id] + body + [tokenizer.eos_token_id]

        synthetic = {
            "input_ids": [_random_sequence() for _ in range(sample_size)],
            "attention_mask": [[1] * sample_length for _ in range(sample_size)],
        }

        # Validation split is simply the first 10 training rows
        validation_rows = {name: rows[:10] for name, rows in synthetic.items()}

        return {
            "train": Dataset.from_dict(synthetic),
            "validation": Dataset.from_dict(validation_rows),
        }

# Load dataset
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset()

# Data collator
# mlm=False selects the causal-LM collation mode.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
# Effective batch size per device = 8 * 4 accumulation steps = 32 sequences.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-4,  # Higher learning rate is often appropriate for DoRA
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    evaluation_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    fp16=True,
    report_to="tensorboard",
)

# Initialize Trainer
# `model` is the DoRAModel wrapper, a plain nn.Module rather than a
# transformers PreTrainedModel.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting DoRA fine-tuning...")
trainer.train()

# Save the fine-tuned model
print("Saving DoRA fine-tuned model...")

# Saving DoRA parameters
# In practice, you would implement a custom save method for DoRA parameters only
# For demonstration, we'll save the entire model
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Example: Inference with the fine-tuned model
def inference_example(target_modules=("c_attn", "c_proj", "c_fc", "attn.c_proj")):
    """
    Generate a sample continuation with the saved DoRA fine-tuned model.

    The checkpoint at ``OUTPUT_DIR/final_model/pytorch_model.bin`` is expected
    to hold either a state dict (what saving a wrapped ``nn.Module`` usually
    produces) or a pickled module. For a state dict, the DoRA model is rebuilt
    from ``MODEL_NAME`` with ``RANK``/``ALPHA`` and the weights are loaded into it.

    Args:
        target_modules: Module-name substrings used when rebuilding the DoRA
            wrapper; must match the ones used for training.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    weights_path = os.path.join(OUTPUT_DIR, "final_model/pytorch_model.bin")
    if not os.path.exists(weights_path):
        # NOTE(review): some transformers versions may write the weights in a
        # different file format/name by default — confirm what save_model produced.
        raise FileNotFoundError(f"No DoRA checkpoint found at {weights_path}")

    checkpoint = torch.load(weights_path, map_location="cpu")
    if isinstance(checkpoint, nn.Module):
        model = checkpoint
    else:
        # A state dict has no eval()/generate(); rebuild the architecture first
        model = DoRAModel(
            AutoModelForCausalLM.from_pretrained(MODEL_NAME),
            rank=RANK,
            alpha=ALPHA,
            target_modules=list(target_modules)
        )
        model.load_state_dict(checkpoint)

    # Set the model to evaluation mode
    model.eval()

    # DoRAModel only defines forward(); text generation lives on the wrapped
    # Hugging Face model, whose layers are the DoRA-adapted ones.
    generator = model if hasattr(model, "generate") else model.model

    # Generate text
    prompt = "The future of artificial intelligence will be shaped by"
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}")
    print(f"Generated response: {response}")

# Uncomment to run inference example
# print("Running inference with the DoRA fine-tuned model...")
# inference_example()

> Mixture of Experts (MoE) Fine-Tuning Implementation

In [None]:
# Mixture of Experts (MoE) Fine-Tuning Implementation
# =================================================
# This implements fine-tuning using a Mixture of Experts (MoE) approach
# where domain-specific experts are trained for different tasks

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Configuration
# NOTE: MODEL_NAME and OUTPUT_DIR are rebound here; earlier cells that read
# these names would pick up the MoE values if re-run after this cell.
MODEL_NAME = "gpt2-medium"  # Using a smaller model for demonstration
OUTPUT_DIR = "./moe_output"

# Expert domains/tasks
# Each entry maps a domain name to its dataset identifier and to the marker
# token that is prepended to that domain's texts and added to the tokenizer.
DOMAINS = {
    "science": {
        "dataset": "your-science-dataset",  # Replace with actual dataset
        "token": "[SCIENCE]"
    },
    "finance": {
        "dataset": "your-finance-dataset",  # Replace with actual dataset
        "token": "[FINANCE]"
    },
    "creative": {
        "dataset": "your-creative-dataset",  # Replace with actual dataset
        "token": "[CREATIVE]"
    }
}

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# MoE Router - determines which expert(s) to use
class ExpertRouter(nn.Module):
    """
    Top-k softmax router.

    Scores every position against ``num_experts`` experts with a single linear
    layer and keeps the ``k`` most probable experts per position.
    """
    def __init__(self, hidden_size, num_experts, k=2):
        super().__init__()
        self.hidden_size, self.num_experts = hidden_size, num_experts
        self.k = k  # Top-k experts to use

        # Router network
        self.router = nn.Linear(hidden_size, num_experts)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states: Tensor whose last dimension is ``hidden_size``.

        Returns:
            ``(weights, indices)``, both with last dimension ``k``: the selected
            experts' probabilities renormalised to sum to 1, and their ids.
        """
        # Full distribution over experts
        expert_probs = self.router(hidden_states).softmax(dim=-1)

        # Keep only the k most probable experts
        selected = expert_probs.topk(self.k, dim=-1)
        top_probs, top_indices = selected.values, selected.indices

        # Renormalise so the kept probabilities sum to one
        normaliser = top_probs.sum(dim=-1, keepdim=True)
        return top_probs / normaliser, top_indices

# Expert FFN (Feed-Forward Network) layer
class ExpertFFN(nn.Module):
    """
    Single expert: a two-layer feed-forward network with a GELU in between.

    ``expert_domain`` is stored as a plain attribute and is not used in the
    computation.
    """
    def __init__(self, hidden_size, intermediate_size, expert_domain=None):
        super().__init__()
        self.expert_domain = expert_domain  # For logging/tracking
        self.dense_h_to_4h = nn.Linear(hidden_size, intermediate_size)
        self.dense_4h_to_h = nn.Linear(intermediate_size, hidden_size)
        self.act = nn.GELU()

    def forward(self, hidden_states):
        """Project up, apply GELU, project back to ``hidden_size``."""
        expanded = self.act(self.dense_h_to_4h(hidden_states))
        return self.dense_4h_to_h(expanded)

# MoE Layer - combines router and experts
class MoELayer(nn.Module):
    """
    Sparse Mixture-of-Experts layer: a top-k router plus a list of expert FFNs.

    Args:
        hidden_size: Size of the last dimension of the input.
        intermediate_size: Inner size of each expert FFN.
        num_experts: Number of experts the router scores.
        k: Number of experts combined per position (``1 <= k <= num_experts``).
        expert_domains: Optional labels, one per expert. Defaults to
            ``expert_0 .. expert_{num_experts-1}``.

    Raises:
        ValueError: If ``k`` is out of range or the number of domain labels
            differs from ``num_experts``.
    """
    def __init__(self, hidden_size, intermediate_size, num_experts, k=2, expert_domains=None):
        super().__init__()
        if not 1 <= k <= num_experts:
            raise ValueError(f"k must be in [1, num_experts={num_experts}], got {k}")

        domains = expert_domains or [f"expert_{i}" for i in range(num_experts)]
        # The router emits num_experts scores; a different number of experts
        # would surface later as an out-of-range expert lookup.
        if len(domains) != num_experts:
            raise ValueError(
                f"expert_domains has {len(domains)} entries but num_experts is {num_experts}"
            )

        self.hidden_size = hidden_size
        self.num_experts = num_experts
        self.k = k

        # Create router
        self.router = ExpertRouter(hidden_size, num_experts, k)

        # Create experts
        self.experts = nn.ModuleList([
            ExpertFFN(hidden_size, intermediate_size, domain)
            for domain in domains
        ])

    def forward(self, hidden_states):
        """
        Route each position to its top-k experts and sum their weighted outputs.

        Args:
            hidden_states: Tensor of shape ``(batch_size, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, hidden_size)``.
        """
        # Get batch size and sequence length
        batch_size, seq_len, _ = hidden_states.shape
        num_tokens = batch_size * seq_len

        # Get routing weights and expert indices
        routing_weights, expert_indices = self.router(hidden_states)

        # Flatten positions. reshape (not view) so non-contiguous inputs work.
        flat_hidden = hidden_states.reshape(num_tokens, -1)
        flat_weights = routing_weights.reshape(num_tokens, self.k)
        flat_indices = expert_indices.reshape(num_tokens, self.k)

        # Initialize output tensor
        final_output = torch.zeros_like(flat_hidden)

        # Process with each selected expert and combine weighted outputs
        for slot in range(self.k):
            slot_expert = flat_indices[:, slot]
            slot_weight = flat_weights[:, slot].unsqueeze(-1)

            # For each expert, process the tokens that chose it in this slot
            for expert_id, expert in enumerate(self.experts):
                expert_mask = slot_expert == expert_id
                if expert_mask.any():
                    expert_output = expert(flat_hidden[expert_mask])
                    final_output[expert_mask] += expert_output * slot_weight[expert_mask]

        # Reshape back to original dimensions
        return final_output.reshape(batch_size, seq_len, -1)

# MoE Adapter - Adds MoE capabilities to a pre-trained model
class MoEAdapter(nn.Module):
    """
    Adds trainable Mixture-of-Experts layers to a frozen pre-trained model.

    A subset of the transformer blocks in ``base_model.transformer.h`` gets a
    forward hook that adds the output of an MoE layer to the block's hidden
    state. Because the hooks run inside the base model's forward pass, the
    logits and loss depend on the MoE parameters, so they can actually be
    trained. All parameters of ``base_model`` itself are frozen.

    Note: the hooks are registered on ``base_model`` in place; wrapping the
    same base model twice stacks a second set of hooks.

    Args:
        base_model: Causal LM exposing ``config`` and ``transformer.h``.
        num_experts: Number of experts per MoE layer.
        expert_domains: Optional labels, one per expert.
    """
    def __init__(self, base_model, num_experts=3, expert_domains=None):
        super().__init__()
        self.base_model = base_model
        self.num_experts = num_experts
        self.expert_domains = expert_domains or [f"expert_{i}" for i in range(num_experts)]

        # Get model configuration
        config = base_model.config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size if hasattr(config, 'intermediate_size') else 4 * self.hidden_size

        # Add MoE layers to the model
        self._add_moe_layers()

        # Freeze base model parameters (the MoE layers live on this adapter,
        # not on base_model, so they stay trainable)
        for param in base_model.parameters():
            param.requires_grad = False

        # Domain token embeddings (optional)
        self.domain_token_ids = {}
        self._add_domain_tokens()

    @staticmethod
    def _make_moe_hook(moe_layer):
        """Build a forward hook that adds ``moe_layer``'s output to a block's hidden state."""
        def _hook(module, args, output):
            # Blocks may return a tuple whose first element is the hidden
            # state, or the hidden state itself; handle both.
            if isinstance(output, tuple):
                hidden = output[0]
                mixed = (hidden + moe_layer(hidden)).to(hidden.dtype)
                return (mixed,) + tuple(output[1:])
            return (output + moe_layer(output)).to(output.dtype)
        return _hook

    def _add_moe_layers(self):
        """Create the MoE layers and attach them to evenly spaced transformer blocks."""
        # For GPT-2 style models the blocks live in base_model.transformer.h
        self.moe_layers = nn.ModuleList()
        blocks = self.base_model.transformer.h

        # Number of layers to augment (e.g., 25% of layers)
        num_layers = len(blocks)
        num_moe_layers = max(1, num_layers // 4)

        # Choose which layers get an MoE (evenly distributed)
        stride = num_layers // num_moe_layers
        self.moe_layer_indices = [i * stride for i in range(num_moe_layers)]

        # Hook handles are kept so the hooks can be removed later if needed
        self._moe_hook_handles = []

        # Create MoE layers and hook each one onto its block
        for layer_idx in self.moe_layer_indices:
            moe_layer = MoELayer(
                self.hidden_size,
                self.intermediate_size,
                self.num_experts,
                k=2,  # Use top-2 experts
                expert_domains=self.expert_domains
            )
            self.moe_layers.append(moe_layer)
            handle = blocks[layer_idx].register_forward_hook(self._make_moe_hook(moe_layer))
            self._moe_hook_handles.append(handle)

    def _add_domain_tokens(self):
        """Record a domain-name -> expert-index mapping (the tokenizer is not modified here)."""
        for i, domain in enumerate(self.expert_domains):
            self.domain_token_ids[domain] = i

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """
        Run the base model; the MoE layers are applied by the block hooks.

        Extra keyword arguments (e.g. ``labels``, ``output_hidden_states``)
        are forwarded unchanged. Hidden states are only returned when the
        caller requests them.

        Returns:
            The base model's output object.
        """
        return self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)

    def print_trainable_parameters(self):
        """Print the number of trainable parameters"""
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in self.parameters())
        print(f"Trainable parameters: {trainable_params:,} ({trainable_params/all_params:.2%})")

# Load and prepare multi-domain dataset
def load_multidomain_dataset():
    """
    Build a combined train/validation dataset covering every entry in DOMAINS.

    Real datasets are not loaded; each domain gets 100 identical synthetic
    sentences prefixed with its domain token, split 90/10. Domains whose
    creation raises are reported and skipped.

    Returns:
        Dict with ``"train"`` and ``"validation"`` datasets (single ``"text"``
        column). If no domain could be built, a three-sentence fallback dataset
        split 80/20 is returned instead.
    """
    # For demonstration purposes, we'll create synthetic datasets
    from datasets import Dataset
    import numpy as np

    per_domain_splits = {}

    for domain, config in DOMAINS.items():
        try:
            # In practice, you would load real datasets:
            # dataset = load_dataset(config["dataset"])
            print(f"Creating synthetic data for {domain} domain...")

            sample_size = 100
            sentence = f"{config['token']} This is a sample text for {domain} domain."
            synthetic = Dataset.from_dict({"text": [sentence] * sample_size})

            # Split into train and validation
            per_domain_splits[domain] = synthetic.train_test_split(test_size=0.1)

        except Exception as e:
            print(f"Error creating dataset for {domain}: {e}")

    if not per_domain_splits:
        print("No datasets could be created, using a minimal example")

        # Create a fallback minimal dataset
        fallback_texts = [
            "[SCIENCE] E=mc^2 is Einstein's famous equation.",
            "[FINANCE] The stock market showed volatility today.",
            "[CREATIVE] Once upon a time in a land far away."
        ]
        fallback_split = Dataset.from_dict({"text": fallback_texts}).train_test_split(test_size=0.2)
        return {
            "train": fallback_split["train"],
            "validation": fallback_split["test"]
        }

    # Combine all domains for training / validation
    splits = list(per_domain_splits.values())
    train_parts = [split["train"] for split in splits]
    validation_parts = [split["test"] for split in splits]
    return {
        "train": concatenate_datasets(train_parts),
        "validation": concatenate_datasets(validation_parts)
    }

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Padding to max_length is used during tokenization, so a pad token is required
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Add domain tokens to tokenizer
# Register every domain token in ONE call: passing "additional_special_tokens"
# repeatedly replaces the previously registered list, which would leave only
# the last domain token flagged as special.
domain_tokens = [config["token"] for config in DOMAINS.values()]
tokenizer.add_special_tokens({"additional_special_tokens": domain_tokens})

# Load the base model
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# The embedding matrix must grow to cover the domain tokens added to the
# tokenizer above. Note that MoEAdapter freezes every base_model parameter,
# so these new embedding rows are not updated during training.
base_model.resize_token_embeddings(len(tokenizer))  # To account for new tokens

# Create MoE adapter
# One expert per domain; the domain names are passed as expert labels.
print("Creating Mixture of Experts adapter...")
model = MoEAdapter(
    base_model,
    num_experts=len(DOMAINS),
    expert_domains=list(DOMAINS.keys())
)

# Print trainable parameter count
model.print_trainable_parameters()

# Load and prepare datasets
# Returns a dict with "train" and "validation" datasets holding a "text" column.
print("Loading and preparing multi-domain datasets...")
dataset = load_multidomain_dataset()

# Tokenize function
def tokenize_function(examples):
    """Encode the ``text`` column of a batch, padded/truncated to exactly 512 tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize the dataset
tokenized_dataset = {
    split: dataset[split].map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]  # the raw strings are not needed once encoded
    )
    for split in dataset
}

print(f"Train dataset size: {len(tokenized_dataset['train'])}")
print(f"Validation dataset size: {len(tokenized_dataset['validation'])}")

# Data collator (mlm=False selects causal-LM style batches rather than masked-LM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 sequences per optimizer step per device
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # TrainingArguments fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Train the model
print("Starting Mixture of Experts fine-tuning...")
trainer.train()

# Save the fine-tuned model and the tokenizer (with the added domain tokens) side by side
print("Saving MoE fine-tuned model...")
trainer.save_model(os.path.join(OUTPUT_DIR, "final_model"))
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

# Inference example - generate text for each domain
def inference_examples():
    """
    Print one sampled continuation per configured domain.

    Reloads the tokenizer and fine-tuned weights from ``OUTPUT_DIR/final_model``,
    then, for every entry in ``DOMAINS``, samples up to 100 new tokens from a
    prompt prefixed with that domain's token and prints prompt and response.
    """
    final_dir = os.path.join(OUTPUT_DIR, "final_model")

    # The tokenizer is loaded first because its length sizes the embedding matrix below
    tokenizer = AutoTokenizer.from_pretrained(final_dir)

    # Load the fine-tuned model.
    # torch.load() returns whatever was pickled. A checkpoint named
    # pytorch_model.bin normally holds only a state_dict, which has no
    # .eval()/.generate(), so in that case the MoE model is rebuilt exactly as
    # in the training setup and the weights are loaded into it.
    checkpoint = torch.load(os.path.join(final_dir, "pytorch_model.bin"), map_location="cpu")
    if isinstance(checkpoint, torch.nn.Module):
        model = checkpoint
    else:
        base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        base_model.resize_token_embeddings(len(tokenizer))
        model = MoEAdapter(
            base_model,
            num_experts=len(DOMAINS),
            expert_domains=list(DOMAINS.keys())
        )
        model.load_state_dict(checkpoint)

    # Set the model to evaluation mode
    model.eval()

    # Test for each domain
    for domain, config in DOMAINS.items():
        print(f"\nTesting {domain} domain:")

        # Create domain-specific prompt with domain token
        prompt = f"{config['token']} In this analysis, we will explore"

        # Generate response
        inputs = tokenizer(prompt, return_tensors="pt")

        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {prompt}")
        print(f"Generated response: {response}")

# Uncomment to run inference examples
# print("\nRunning inference with the MoE fine-tuned model...")
# inference_examples()

# Analyzing expert utilization
def analyze_expert_usage():
    """
    Analyze which experts are used for different domains/inputs.

    Reloads the fine-tuned model and tokenizer from ``OUTPUT_DIR/final_model``,
    builds one sample prompt per domain, and prints the statistics returned by
    the nested ``get_expert_usage`` helper. That helper is a placeholder: it
    runs a forward pass but returns fixed example numbers, not measurements.
    """
    final_dir = os.path.join(OUTPUT_DIR, "final_model")

    # The tokenizer is loaded first because its length sizes the embedding matrix below
    tokenizer = AutoTokenizer.from_pretrained(final_dir)

    # Load the fine-tuned model.
    # A checkpoint named pytorch_model.bin normally holds only a state_dict,
    # which cannot be called or put in eval mode; rebuild the MoE model as in
    # the training setup and load the weights into it. A fully pickled module
    # is still accepted as-is.
    checkpoint = torch.load(os.path.join(final_dir, "pytorch_model.bin"), map_location="cpu")
    if isinstance(checkpoint, torch.nn.Module):
        model = checkpoint
    else:
        base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
        base_model.resize_token_embeddings(len(tokenizer))
        model = MoEAdapter(
            base_model,
            num_experts=len(DOMAINS),
            expert_domains=list(DOMAINS.keys())
        )
        model.load_state_dict(checkpoint)

    # Set the model to evaluation mode
    model.eval()

    # Create sample inputs from each domain
    sample_inputs = {}
    for domain, config in DOMAINS.items():
        prompt = f"{config['token']} This is a test for the {domain} domain."
        sample_inputs[domain] = tokenizer(prompt, return_tensors="pt")

    # Function to extract expert usage statistics from model
    def get_expert_usage(model, inputs):
        """Run one forward pass and return PLACEHOLDER expert statistics."""
        # This is a simplified placeholder - actual implementation would
        # require model-specific hooks to extract router weights
        with torch.no_grad():
            # The result is intentionally unused until router weights are extracted
            outputs = model(**inputs, output_attentions=True, output_hidden_states=True)
            # Extract router weights from MoE layers
            # Count which experts are selected most frequently

        # Fixed example values, independent of `outputs`
        return {
            "expert_selection": {"expert_0": 0.4, "expert_1": 0.3, "expert_2": 0.3},
            "avg_expert_confidence": 0.8
        }

    # Analyze expert usage for each domain
    for domain, inputs in sample_inputs.items():
        print(f"\nExpert usage analysis for {domain} domain:")
        expert_stats = get_expert_usage(model, inputs)

        # Print expert selection distribution
        print("Expert selection distribution:")
        for expert, weight in expert_stats["expert_selection"].items():
            print(f"  {expert}: {weight:.2f}")

        print(f"Average expert confidence: {expert_stats['avg_expert_confidence']:.2f}")

# Uncomment to analyze expert usage
# print("\nAnalyzing expert usage patterns...")
# analyze_expert_usage()

## Quantization for LLMs

In [None]:
# Quantization for LLMs
    # Quantization is a technique that reduces the precision of weights and activations in a neural network to lower the memory footprint 
    # and increase inference speed, often with minimal impact on performance.

# How Quantization Works
    # Quantization converts high-precision floating-point numbers (like FP32 or FP16) to lower-precision formats:

        # FP16 (Half-precision): 16-bit floating point
        # INT8: 8-bit integer quantization
        # INT4: 4-bit integer quantization
        # GPTQ: A specialized quantization method for transformer models
        # AWQ: Activation-aware weight quantization
        # SmoothQuant: Balances quantization between activations and weights

# Neural Network Quantization: A Concise Overview

## Number Representation Ranges
| Format | Bits | Range | Precision | Possible Values | Notes |
|--------|------|-------|-----------|-----------------|-------|
| FP32   | 32   | ±3.4×10³⁸ | ~7 decimal digits | $2^{32}$ (~4.3 billion) | Standard full precision |
| FP16   | 16   | ±65,504 | ~3 decimal digits | $2^{16}$ (65,536) | Half precision |
| BF16   | 16   | ±3.4×10³⁸ | ~2-3 decimal digits | $2^{16}$ (65,536) | "Brain Float", wider range than FP16 |
| INT8   | 8    | -128 to 127 | Integers only | 256 | Common for inference |
| INT4   | 4    | -8 to 7   | Integers only | 16  | Recent LLM optimization |
| INT2   | 2    | -2 to 1   | Integers only | 4   | Extreme compression |


## Quantization Formulas
**Linear Quantization**: $q = \mathrm{round}(x / s) + z$, where $q$ = quantized value, $x$ = original float, $s$ = scale factor, $z$ = zero-point

**Dequantization**: $x_{\text{approx}} = s \cdot (q - z)$ to convert back to floating point

**Scale Factor**: $s = (x_{\max} - x_{\min}) / (q_{\max} - q_{\min})$ sets the conversion ratio

## Quantization Types
- **Symmetric**: $q = round(x / s)$ with zero-point fixed at 0
- **Asymmetric**: $q = round(x / s) + z$ using zero-point offset for better range utilization
- **Per-tensor**: One scale/zero-point for entire tensor (simple but less accurate)
- **Per-channel**: Different scale/zero-point for each output channel (better accuracy)

## Advanced Techniques
- **Weight-Only**: Quantizes just weights, keeping activations in higher precision
- **GPTQ**: Uses Hessian information to minimize error through iterative quantization
- **NF4**: Custom 4-bit "NormalFloat" format whose non-uniform quantization levels are spaced to match normally distributed weights
- **SmoothQuant**: Balances quantization difficulty with formula $y = (W·α)·(x/α)$ using channel-wise scaling $α$
- **AWQ**: Preserves important weights based on activation patterns

## Benefits
- **Memory**: 2-8× reduction in model size
- **Speed**: Up to 4× faster inference
- **Energy**: Lower power consumption
- **Cost**: Enables larger models on consumer hardware

## Performance Trade-offs
Quantization reduces precision by mapping a continuous range of values to a discrete set, introducing quantization error. Modern techniques like NF4, AWQ, and GPTQ minimize this error by optimizing the quantization process based on the statistical properties of neural networks, preserving model quality even at extreme compression levels.

> Model Quantization Implementation

In [None]:
# Model Quantization Implementation
# =================================
# Examples of different approaches to quantize LLMs

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GPTQConfig
)
from accelerate import infer_auto_device_map, init_empty_weights

####################################################################
# SECTION 1: Loading Pre-Quantized Models from Hugging Face
####################################################################

def load_4bit_quantized_model(model_id="meta-llama/Llama-2-7b-hf"):
    """
    Load a model with 4-bit quantization using bitsandbytes.

    Args:
        model_id: Hub id or local path of the checkpoint to load. Defaults to
            the Llama-2 7B checkpoint this function previously hard-coded.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("Loading model quantized to 4-bit precision...")

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                 # Load model in 4-bit precision
        bnb_4bit_compute_dtype=torch.float16,  # Compute in fp16
        bnb_4bit_use_double_quant=True,    # Use nested quantization for more memory savings
        bnb_4bit_quant_type="nf4",         # Normalized float 4-bit quantization (alternatives: "fp4")
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load the model with quantization configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",  # Automatically distribute model across available GPUs/devices
    )

    print("Model loaded in 4-bit precision")
    print(f"Model size: {get_model_size_in_gb(model):.2f} GB")

    return model, tokenizer

def load_8bit_quantized_model(model_id="meta-llama/Llama-2-7b-hf"):
    """
    Load a model with 8-bit quantization using bitsandbytes.

    Args:
        model_id: Hub id or local path of the checkpoint to load. Defaults to
            the Llama-2 7B checkpoint this function previously hard-coded.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("Loading model quantized to 8-bit precision...")

    # Configure 8-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,                 # Load model in 8-bit precision
        llm_int8_threshold=6.0,            # Threshold for outlier features in LLM.int8()
        llm_int8_has_fp16_weight=False,    # Whether INT8 was combined with FP16
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load the model with quantization configuration
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",  # Automatically distribute model across available GPUs/devices
    )

    print("Model loaded in 8-bit precision")
    print(f"Model size: {get_model_size_in_gb(model):.2f} GB")

    return model, tokenizer

def load_gptq_quantized_model(model_id="TheBloke/Llama-2-7B-GPTQ"):
    """
    Load a GPTQ pre-quantized model.

    Args:
        model_id: Hub id or local path of an already GPTQ-quantized checkpoint.
            Defaults to the checkpoint this function previously hard-coded.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("Loading GPTQ-quantized model...")

    # Configure GPTQ settings
    # NOTE(review): `disable_exllama` / `use_marlin` are passed through unchanged;
    # confirm both are still accepted by the installed transformers GPTQConfig.
    gptq_config = GPTQConfig(
        bits=4,                  # Typically 3 or 4 bits per parameter
        disable_exllama=False,   # Whether to disable exllama kernel
        use_marlin=False,        # Whether to use marlin kernel
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load GPTQ model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=gptq_config,
        device_map="auto",
    )

    print("GPTQ-quantized model loaded")
    print(f"Model size: {get_model_size_in_gb(model):.2f} GB")

    return model, tokenizer

def load_awq_quantized_model(model_id="TheBloke/Llama-2-7B-AWQ"):
    """
    Load an AWQ pre-quantized model.

    Uses ``awq.AutoAWQForCausalLM`` when the autoawq package can be imported;
    otherwise prints an install hint and falls back to
    ``AutoModelForCausalLM.from_pretrained``.

    Args:
        model_id: Hub id or local path of an already AWQ-quantized checkpoint.
            Defaults to the checkpoint this function previously hard-coded.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("Loading AWQ-quantized model...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load AWQ model - requires autoawq library
    try:
        from awq import AutoAWQForCausalLM

        model = AutoAWQForCausalLM.from_quantized(
            model_id,
            device_map="auto",
            max_memory=None,
        )
    except ImportError:
        print("To use AWQ models, please install autoawq: pip install autoawq")
        # Fallback to regular loading, which may not work properly for AWQ models
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
        )

    print("AWQ-quantized model loaded")
    print(f"Model size: {get_model_size_in_gb(model):.2f} GB")

    return model, tokenizer


# Load the model
# NOTE(review): load_4bit_quantized_model() calls get_model_size_in_gb, which is
# defined further down in this cell; on a fresh kernel this statement runs
# before that definition exists. Confirm the helper is defined earlier, or move
# this demo below the utility functions.
model, tokenizer = load_4bit_quantized_model()

# Run inference
prompt = "Explain quantum computing in simple terms."
# The model was placed with device_map="auto", so follow its device instead of assuming "cuda"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate output
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,  # temperature/top_p only take effect when sampling is enabled
        temperature=0.7,
        top_p=0.9,
    )

# Decode and print response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

####################################################################
# SECTION 2: Quantizing a Model Yourself
####################################################################

def quantize_to_4bit(model_id):
    """
    Quantize a model to 4-bit precision using BitsAndBytes.

    Loads ``model_id`` with an NF4 quantization config, writes the model config
    (carrying that quantization config) and the tokenizer to
    ``<model-name>-4bit`` in the current working directory, then runs a short
    sampled generation as a smoke test.

    Args:
        model_id: Hub id or local path of the checkpoint to quantize.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print(f"Quantizing {model_id} to 4-bit precision...")

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    # Load the model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
    )

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Save as 4-bit quantized
    output_dir = f"{model_id.split('/')[-1]}-4bit"
    os.makedirs(output_dir, exist_ok=True)

    # Only the config (with the quantization settings) and the tokenizer are
    # written here; the quantized weights themselves are not saved.
    model.config.quantization_config = quantization_config
    model.config.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Quantization config saved to {output_dir}")
    print("Note: The actual 4-bit weights aren't saved. To share this model, use a library like")
    print("bitsandbytes that can load the original model with the saved quantization config.")

    # -------------------------------------------------------------------------------------------------------------
    # Run simple test
    print("Running test inference...")
    prompt = "Hello, how are you?"
    # device_map="auto" decides placement, so follow the model instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=True,  # temperature/top_p only take effect when sampling is enabled
            temperature=0.7,
            top_p=0.9,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Test response: {response}")

    return model, tokenizer

def quantize_to_8bit(model_id):
    """
    Load ``model_id`` with bitsandbytes 8-bit quantization and persist its settings.

    Only the model config (carrying the quantization settings) and the
    tokenizer are written, to ``<model-name>-8bit`` in the current working
    directory.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print(f"Quantizing {model_id} to 8-bit precision...")

    # 8-bit settings: outlier threshold 6.0, no fp16 copy of the int8 weights
    int8_settings = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    )

    load_options = {"quantization_config": int8_settings, "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Output folder is named after the last path component of the model id
    model_name = model_id.split('/')[-1]
    output_dir = f"{model_name}-8bit"
    os.makedirs(output_dir, exist_ok=True)

    # Attach the settings to the config, then write config and tokenizer (in that order)
    model.config.quantization_config = int8_settings
    for artifact in (model.config, tokenizer):
        artifact.save_pretrained(output_dir)

    notices = (
        f"Quantization config saved to {output_dir}",
        "Note: The actual 8-bit weights aren't saved. To share this model, use a library like",
        "bitsandbytes that can load the original model with the saved quantization config.",
    )
    for notice in notices:
        print(notice)

    return model, tokenizer

def quantize_with_gptq(model_id):
    """
    Skeleton of a GPTQ quantization run (no calibration is performed).

    Loads the tokenizer and an auto-gptq model wrapper configured for 4-bit,
    group-size-128 quantization, then returns them without quantizing.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` when auto-gptq is not installed.
    """
    # Probe for the optional dependency so a missing install yields guidance
    try:
        import auto_gptq  # noqa: F401 - availability check only
    except ImportError:
        for hint in (
            "GPTQ quantization requires auto-gptq library.",
            "Install with: pip install auto-gptq",
        ):
            print(hint)
        return None, None

    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

    print(f"Quantizing {model_id} with GPTQ...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # 4-bit weights, quantization groups of 128, desc_act disabled
    gptq_settings = {"bits": 4, "group_size": 128, "desc_act": False}
    model = AutoGPTQForCausalLM.from_pretrained(
        model_id,
        quantize_config=BaseQuantizeConfig(**gptq_settings),
    )

    # Still missing for a real run: calibration data, the quantize() pass
    # over those examples, and saving the quantized model.
    for notice in (
        "Note: Full GPTQ quantization requires calibration data and more setup.",
        "See the auto-gptq documentation for complete implementation.",
    ):
        print(notice)

    return model, tokenizer

def quantize_with_awq(model_id):
    """
    Skeleton of an AWQ quantization run (no quantization is performed).

    Loads the tokenizer and an autoawq model wrapper and returns them; the
    quantize/save steps are only sketched as pseudo-code inside the body.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` when autoawq is not installed.
    """
    # Probe for the optional dependency so a missing install yields guidance
    try:
        import awq  # noqa: F401 - availability check only
    except ImportError:
        for hint in (
            "AWQ quantization requires autoawq library.",
            "Install with: pip install autoawq",
        ):
            print(hint)
        return None, None

    from awq import AutoAWQForCausalLM

    print(f"Quantizing {model_id} with AWQ...")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Unquantized model wrapper that a real run would quantize
    load_options = {"device_map": "auto"}
    model = AutoAWQForCausalLM.from_pretrained(model_id, **load_options)

    # Still missing for a real run: calibration data, the quantize() pass,
    # and exporting the quantized model.
    # NOTE(review): the keys in the pseudo-code below are illustrative; confirm
    # the expected quant_config keys against the installed autoawq version.

    # Pseudo-code for AWQ quantization:
    """
    # Generate or load calibration data
    texts = ["sample text 1", "sample text 2", ...]
    
    # Quantize the model
    model.quantize(
        tokenizer=tokenizer,
        quant_config={
            "bits": 4,                # Quantize to 4-bits
            "group_size": 128,        # Group size
            "zero_point": True,       # Use zero-point quantization
            "q_group_size": 128,      # Quantization group size
        },
        calib_data=texts,             # Calibration data
    )
    
    # Save the quantized model
    model.save_quantized("./awq-model-4bit")
    tokenizer.save_pretrained("./awq-model-4bit")
    """

    for notice in (
        "Note: Full AWQ quantization requires calibration data and more setup.",
        "See the autoawq documentation for complete implementation.",
    ):
        print(notice)

    return model, tokenizer

####################################################################
# SECTION 3: Utility Functions
####################################################################

def get_model_size_in_gb(model):
    """Return the bytes held by a model's parameters and buffers, divided by 1024**3."""

    def _total_bytes(tensors):
        # element count x bytes per element, summed over every tensor
        return sum(t.nelement() * t.element_size() for t in tensors)

    combined_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    return combined_bytes / (1024 ** 3)

def run_inference_example(model, tokenizer):
    """Sample up to 50 new tokens from a fixed prompt and print prompt and result."""
    print("\nRunning inference example...")

    input_text = "The future of artificial intelligence will"

    # Encode the prompt and move it to the device the model reports
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    sampling_options = {
        "max_new_tokens": 50,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
    }
    with torch.no_grad():
        outputs = model.generate(**encoded, **sampling_options)

    # Only the first returned sequence is decoded
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    for line in (
        f"Input: {input_text}",
        f"Generated: {generated_text}",
        "-" * 50,
    ):
        print(line)

def compare_model_sizes():
    """
    Print a size comparison across quantization methods.

    The FP16, GPTQ and AWQ figures are fixed estimates. The 8-bit and 4-bit
    figures are measured by loading each model; a failed load is reported and
    the comparison continues.
    """
    divider = "-" * 50

    def _report_measured(label, load):
        # Load, measure, print, then drop the model so the next load has room
        try:
            quantized_model, _ = load()
            measured_gb = get_model_size_in_gb(quantized_model)
            print(f"{label} Quantized Model: {measured_gb:.2f} GB")
            del quantized_model
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error loading {label} model: {e}")

    print("\nComparing model sizes:")
    print(divider)

    # Estimated rather than loaded, to save memory (~13GB for Llama-2-7b in FP16)
    fp16_size_gb = 13.0
    print(f"FP16 Model (estimated): {fp16_size_gb:.2f} GB")

    # Loaders are wrapped in lambdas so each name is resolved inside the try block
    _report_measured("8-bit", lambda: load_8bit_quantized_model())
    _report_measured("4-bit", lambda: load_4bit_quantized_model())

    # GPTQ and AWQ sizes (estimated, as loading depends on external libraries)
    print("GPTQ 4-bit Model (estimated): ~3.5 GB")
    print("AWQ 4-bit Model (estimated): ~3.5 GB")
    print(divider)

# Example usage (commented out to avoid actual execution)
# Every example call below is disabled; as written, this block only prints
# "Script completed.".
if __name__ == "__main__":
    # Uncomment any of these to run the respective functions
    
    # 1. Load pre-quantized models
    # model, tokenizer = load_4bit_quantized_model()
    # run_inference_example(model, tokenizer)
    
    # model, tokenizer = load_8bit_quantized_model()
    # run_inference_example(model, tokenizer)
    
    # model, tokenizer = load_gptq_quantized_model()
    # run_inference_example(model, tokenizer)
    
    # 2. Quantize models yourself
    # model_id = "gpt2"  # Use a small model for testing
    # model, tokenizer = quantize_to_4bit(model_id)
    # run_inference_example(model, tokenizer)
    
    # 3. Compare sizes of different quantization methods
    # (loads the 8-bit and 4-bit models one after another to measure them)
    # compare_model_sizes()
    
    print("Script completed.")

> Using Llama.cpp for quantized inference

In [None]:
import os
import subprocess
import tempfile
from pathlib import Path

def setup_and_use_llamacpp():
    """
    Setup and use llama.cpp for quantized inference.

    Clones llama.cpp into ``./llama.cpp`` if missing, builds it with ``make``,
    and runs its ``main`` binary on a temporary prompt file against
    ``./llama.cpp/models/llama-2-7b.gguf``.

    The process working directory is never changed; every subprocess gets an
    explicit ``cwd``. The function prints a message and returns early when
    cloning/building fails or when the model file is missing.

    NOTE(review): newer llama.cpp checkouts may build with CMake and name the
    CLI binary differently; confirm ``make`` / ``main`` match the revision used.
    """
    # Resolve once so all later paths are absolute and independent of the cwd
    llamacpp_dir = Path("./llama.cpp").resolve()

    try:
        # Clone llama.cpp if not already present
        if not llamacpp_dir.exists():
            print("Cloning llama.cpp repository...")
            subprocess.run(
                ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", str(llamacpp_dir)],
                check=True,
            )

        # Compile llama.cpp
        print("Compiling llama.cpp...")
        subprocess.run(["make"], cwd=llamacpp_dir, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers a missing git/make executable
        print(f"Error setting up llama.cpp: {e}")
        return

    # Download model (or use an existing one)
    model_path = llamacpp_dir / "models" / "llama-2-7b.gguf"
    if not model_path.exists():
        print("You need to download a GGUF model or convert one.")
        print("Example models can be found on Hugging Face:")
        print("https://huggingface.co/TheBloke/Llama-2-7B-GGUF")

        # Create the target folder so the user only has to drop the file in
        os.makedirs(model_path.parent, exist_ok=True)
        print("Download a GGUF model to:", model_path)

        # For actual implementation, you might use:
        # subprocess.run(["wget", "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf", "-O", model_path])

        # Without a model file, inference cannot succeed
        return

    # Create a prompt file (delete=False: the child process reads it after we close it)
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
        prompt_file = f.name
        f.write("Explain the theory of relativity in simple terms.\n")

    print(f"Created prompt file: {prompt_file}")

    # Run llama.cpp
    print("Running inference with llama.cpp...")
    command = [
        str(llamacpp_dir / "main"),
        "-m", str(model_path),
        "-f", prompt_file,
        "--temp", "0.7",
        "--top-p", "0.9",
        "-n", "512",  # Number of tokens to generate
        "--repeat-penalty", "1.1",
        "-t", "8"  # Number of threads
    ]

    try:
        subprocess.run(command, cwd=llamacpp_dir, check=True)
        print("Inference completed successfully")
    except (OSError, subprocess.CalledProcessError) as e:
        print(f"Error running llama.cpp: {e}")
    finally:
        # Clean up the prompt file even when the run fails
        os.unlink(prompt_file)

# Alternative: Using Python bindings for llama.cpp
def use_llamacpp_python():
    """
    Run one completion through the llama-cpp-python bindings and print it.

    Prints an install hint instead when an ImportError is raised, e.g. because
    the bindings are not installed.
    """
    try:
        from llama_cpp import Llama

        # Model settings: GGUF path, 2048-token context, 8 CPU threads, no GPU offload
        llm_settings = {
            "model_path": "./llama.cpp/models/llama-2-7b.Q4_K_M.gguf",
            "n_ctx": 2048,
            "n_threads": 8,
            "n_gpu_layers": 0,
        }
        llm = Llama(**llm_settings)

        # Sampling settings; echo=True includes the prompt in the returned text
        completion_settings = {
            "max_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.9,
            "repeat_penalty": 1.1,
            "echo": True,
        }
        prompt = "Explain how quantization works in neural networks."
        output = llm(prompt, **completion_settings)

        first_choice = output['choices'][0]
        print(first_choice['text'])

    except ImportError:
        for hint in (
            "Python bindings for llama.cpp not found.",
            "Install with: pip install llama-cpp-python",
        ):
            print(hint)

# Run the examples
# All calls are disabled, so this block is currently a no-op.
if __name__ == "__main__":
    # Choose which function to run
    # NOTE(review): the first two names below are not defined in this cell;
    # confirm they exist elsewhere in the notebook before uncommenting.
    # load_quantized_model_from_huggingface()
    # quantize_large_model_from_huggingface()
    # setup_and_use_llamacpp()
    # use_llamacpp_python()
    pass


> Full GPTQ Quantization and Export

In [None]:

# Full GPTQ Quantization and Export Implementation
# ===============================================
# This script shows a complete GPTQ quantization process and export

import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Check if auto-gptq is installed
# GPTQ_AVAILABLE records whether the import succeeded; quantize_with_gptq_full
# checks it and returns early when it is False.
try:
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig, get_gptq_peft_model
    GPTQ_AVAILABLE = True
except ImportError:
    # A failed import of any of the three names lands here
    GPTQ_AVAILABLE = False
    print("GPTQ quantization requires auto-gptq library.")
    print("Install with: pip install auto-gptq")

def _synthetic_calibration_data(tokenizer, num_samples):
    """Tokenize a few fixed sentences and cycle through them to build exactly ``num_samples`` entries."""
    synthetic_texts = [
        "The quick brown fox jumps over the lazy dog." * 10,
        "Artificial intelligence is transforming industries worldwide." * 8,
        "Machine learning algorithms improve with more training data." * 8,
        "The history of computing spans several decades of technological innovation." * 6,
        "Quantum computers use quantum physics to solve complex problems." * 8,
    ]

    # Tokenize the synthetic texts (first row of the returned batch, as a plain list)
    base_samples = [
        tokenizer(text, return_tensors="pt")["input_ids"][0].tolist()
        for text in synthetic_texts
    ]

    # Cycle through the base samples; this also truncates when fewer than
    # len(base_samples) entries are requested.
    return [base_samples[i % len(base_samples)] for i in range(num_samples)]


def prepare_calibration_data(tokenizer, num_samples=128):
    """
    Prepare calibration data for GPTQ quantization.

    Tokenizes the WikiText-2 test split, keeps sequences longer than 128 tokens
    (each truncated to at most 512) and returns up to ``num_samples`` of them.
    If loading or tokenizing fails, or no sequence passes the length filter,
    synthetic samples built from fixed sentences are returned instead.

    Args:
        tokenizer: Callable tokenizer used to encode the texts.
        num_samples: Maximum number of samples to return; the synthetic
            fallback returns exactly this many.

    Returns:
        List of token-id lists.
    """
    try:
        # You can use any dataset with texts for calibration
        # Here we use WikiText-2 as an example
        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

        # Process the dataset
        def preprocess(examples):
            return tokenizer(
                examples["text"],
                padding=False,
                truncation=True,
                max_length=512,
            )

        # Tokenize the dataset
        tokenized_dataset = dataset.map(
            preprocess,
            batched=True,
            remove_columns=["text"],
        )

        # Extract input_ids and limit to a subset for calibration
        calibration_data = [
            sample["input_ids"] for sample in tokenized_dataset
            if len(sample["input_ids"]) > 128  # Filter out short sequences
        ][:num_samples]

        # An empty result is useless for calibration; route it to the fallback
        if num_samples > 0 and not calibration_data:
            raise ValueError("no sequences longer than 128 tokens were found")

    except Exception as e:
        # Deliberately broad: any dataset/tokenization failure falls back to synthetic data
        print(f"Error preparing calibration data: {e}")
        print("Falling back to synthetic calibration data")

        calibration_data = _synthetic_calibration_data(tokenizer, num_samples)
        print(f"Prepared {len(calibration_data)} synthetic calibration samples")
        return calibration_data

    print(f"Prepared {len(calibration_data)} calibration samples")
    return calibration_data

def quantize_with_gptq_full(model_id, output_dir=None, bits=4, group_size=128, use_exllama=True):
    """
    Full GPTQ quantization process with calibration data and model export.

    Args:
        model_id: Hub id or local path of the model to quantize.
        output_dir: Destination folder. When falsy (the default), a name of the
            form ``<model-name>-GPTQ-<bits>bit`` is derived from ``model_id``.
        bits: Target bit width passed to the quantization config.
        group_size: Quantization group size passed to the quantization config.
        use_exllama: Forwarded (only when ``bits == 4``) to the reload step.

    Returns:
        ``(quantized_model, tokenizer)``, or ``(None, None)`` when auto-gptq
        is not available.
    """
    if not GPTQ_AVAILABLE:
        print("Cannot perform GPTQ quantization without auto-gptq library")
        return None, None

    print(f"Starting GPTQ quantization for {model_id}")
    output_dir = output_dir or f"{model_id.split('/')[-1]}-GPTQ-{bits}bit"
    os.makedirs(output_dir, exist_ok=True)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Prepare calibration data
    calibration_data = prepare_calibration_data(tokenizer)

    # auto-gptq's quantize() expects one dict per example with "input_ids" and
    # "attention_mask"; wrap bare token-id lists (no padding is used, so the
    # mask is all ones) and pass through samples that are already dicts.
    calibration_examples = [
        sample if isinstance(sample, dict)
        else {"input_ids": sample, "attention_mask": [1] * len(sample)}
        for sample in calibration_data
    ]

    # Define quantization configuration
    quantize_config = BaseQuantizeConfig(
        bits=bits,                # Quantize to specified bits (usually 3 or 4)
        group_size=group_size,    # Group size for quantization (typically 128)
        desc_act=False,           # Whether to use descending activations
    )

    # Load the model and perform quantization
    # NOTE(review): trust_remote_code=True executes code shipped with the model
    # repository; only use it with sources you trust.
    print("Loading model and performing GPTQ quantization...")
    model = AutoGPTQForCausalLM.from_pretrained(
        model_id,
        quantize_config=quantize_config,
        trust_remote_code=True,
    )

    # Run the quantization process
    model.quantize(
        calibration_examples,
        use_triton=False,  # Whether to use Triton for faster inference
    )

    # Save the quantized model
    print(f"Saving GPTQ quantized model to {output_dir}")
    model.save_quantized(output_dir, use_safetensors=True)
    tokenizer.save_pretrained(output_dir)

    # Save a README with quantization information
    with open(os.path.join(output_dir, "README.md"), "w") as f:
        f.write(f"# GPTQ Quantized Model\n\n")
        f.write(f"Original model: {model_id}\n")
        f.write(f"Bits: {bits}\n")
        f.write(f"Group size: {group_size}\n")
        f.write(f"Calibration samples: {len(calibration_data)}\n\n")
        f.write("## Usage\n\n")
        f.write("```python\n")
        f.write("from auto_gptq import AutoGPTQForCausalLM\n")
        f.write("from transformers import AutoTokenizer\n\n")
        f.write(f'tokenizer = AutoTokenizer.from_pretrained("{output_dir}")\n')
        f.write(f'model = AutoGPTQForCausalLM.from_quantized(\n')
        f.write(f'    "{output_dir}",\n')
        f.write(f'    device="cuda:0",\n')
        f.write(f'    use_triton=False,\n')
        f.write(f')\n\n')
        f.write('inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda:0")\n')
        f.write('outputs = model.generate(**inputs, max_new_tokens=50)\n')
        f.write('print(tokenizer.decode(outputs[0]))\n')
        f.write("```\n")

    # Optionally, reload the model in inference mode to test
    # NOTE(review): confirm `use_exllama` is a keyword the installed auto-gptq
    # from_quantized() accepts; some releases may use a different name.
    print("Reloading quantized model for inference...")
    quantized_model = AutoGPTQForCausalLM.from_quantized(
        output_dir,
        device="cuda:0" if torch.cuda.is_available() else "cpu",
        use_triton=False,
        use_exllama=use_exllama and bits == 4,  # exllama only supports 4-bit
    )

    return quantized_model, tokenizer

def quantize_with_awq_full(model_id, output_dir, bits=4, group_size=128):
    """
    Full AWQ quantization process with calibration data and model export.

    Args:
        model_id: Model identifier passed to the tokenizer and AutoAWQ loaders.
        output_dir: Destination directory. When falsy, defaults to
            "<last path component of model_id>-AWQ-<bits>bit".
        bits: Weight bit-width, forwarded to AutoAWQ as ``w_bit``.
        group_size: Quantization group size, forwarded as ``q_group_size``.

    Returns:
        ``(model, tokenizer)`` after quantization and export, or
        ``(None, None)`` when the ``autoawq`` package cannot be imported.
    """
    try:
        from awq import AutoAWQForCausalLM
    except ImportError:
        print("AWQ quantization requires autoawq library.")
        print("Install with: pip install autoawq")
        return None, None

    print(f"Starting AWQ quantization for {model_id}")
    output_dir = output_dir or f"{model_id.split('/')[-1]}-AWQ-{bits}bit"
    os.makedirs(output_dir, exist_ok=True)

    # Load tokenizer; fall back to EOS as the pad token when none is defined
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the full-precision model that will be quantized
    model = AutoAWQForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
    )

    # Prepare calibration texts for AWQ
    print("Preparing calibration data...")
    max_calibration_samples = 128
    min_text_length = 200  # skip headings / near-empty lines
    calibration_texts = []
    try:
        # Any text dataset works for calibration; WikiText is used as an example
        dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
        calibration_texts = [
            text for text in dataset["text"]
            if len(text.strip()) > min_text_length
        ][:max_calibration_samples]
    except Exception as e:
        print(f"Error loading calibration data: {e}")

    if not calibration_texts:
        # Covers both a failed download and a dataset with no long-enough texts
        print("Falling back to synthetic calibration data")
        synthetic_texts = [
            "The quick brown fox jumps over the lazy dog. " * 20,
            "Artificial intelligence is transforming industries worldwide. " * 16,
            "Machine learning algorithms improve with more training data. " * 16,
            "The history of computing spans several decades of technological innovation. " * 12,
            "Quantum computers use quantum physics to solve complex problems. " * 16,
        ] * 25  # Repeat to get enough examples
        calibration_texts = synthetic_texts[:max_calibration_samples]

    print(f"Prepared {len(calibration_texts)} calibration texts")

    # Run AWQ quantization. AutoAWQ's quant_config uses the keys
    # w_bit / q_group_size / zero_point / version; other keys are not accepted.
    print("Quantizing model with AWQ...")
    model.quantize(
        tokenizer=tokenizer,
        quant_config={
            "w_bit": bits,                # Weight bit-width (typically 4)
            "q_group_size": group_size,   # Quantization group size (typically 128)
            "zero_point": True,           # Use zero-point quantization
            "version": "GEMM",            # Kernel variant used for the packed weights
        },
        calib_data=calibration_texts,
    )

    # Save the quantized model
    print(f"Saving AWQ quantized model to {output_dir}")
    model.save_quantized(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Save a README with quantization information and a usage snippet
    readme_lines = [
        "# AWQ Quantized Model\n\n",
        f"Original model: {model_id}\n",
        f"Bits: {bits}\n",
        f"Group size: {group_size}\n",
        f"Calibration samples: {len(calibration_texts)}\n\n",
        "## Usage\n\n",
        "```python\n",
        "from awq import AutoAWQForCausalLM\n",
        "from transformers import AutoTokenizer\n\n",
        f'tokenizer = AutoTokenizer.from_pretrained("{output_dir}")\n',
        "model = AutoAWQForCausalLM.from_quantized(\n",
        f'    "{output_dir}",\n',
        '    device_map="auto",\n',
        ")\n\n",
        'inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)\n',
        "outputs = model.generate(**inputs, max_new_tokens=50)\n",
        "print(tokenizer.decode(outputs[0]))\n",
        "```\n",
    ]
    with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
        f.writelines(readme_lines)

    # The in-memory model is returned as-is; it is not reloaded from disk here
    print("Quantization complete. AWQ model saved.")

    return model, tokenizer

def run_exllama_example():
    """
    Demonstrate text generation through the ExLlama backend on a GPTQ model.

    Prints an install hint and returns early when the ``exllama`` package is
    not importable; otherwise builds config, model, tokenizer and cache, then
    generates a continuation for a fixed prompt and prints it.
    """
    try:
        from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
        from exllama.tokenizer import ExLlamaTokenizer
    except ImportError:
        for message in (
            "ExLlama backend requires the exllama library.",
            "Install with: pip install exllama",
        ):
            print(message)
        return

    print("Running example with ExLlama backend for GPTQ models...")

    # GPTQ checkpoint to load (replace with an actual path)
    gptq_model_path = "TheBloke/Llama-2-7B-GPTQ"

    # NOTE(review): the constructor/generate signatures below are taken as
    # written; confirm them against the installed exllama version.
    exllama_config = ExLlamaConfig(
        gptq_model_path,
        max_seq_len=2048,   # longest sequence the model will handle
        gpu_split=None,     # None lets the backend decide
    )
    exllama_model = ExLlama(exllama_config)
    exllama_tokenizer = ExLlamaTokenizer(gptq_model_path)
    kv_cache = ExLlamaCache(exllama_model)

    prompt = "The future of artificial intelligence will"
    prompt_ids = exllama_tokenizer.encode(prompt)

    sampling_options = {"max_new_tokens": 50, "temperature": 0.7, "top_p": 0.9}
    generated_ids = exllama_model.generate(prompt_ids, kv_cache, **sampling_options)

    generated_text = exllama_tokenizer.decode(generated_ids)
    print(f"Input: {prompt}")
    print(f"Generated: {generated_text}")
    print("-" * 50)

    print("ExLlama example completed")

def run_smoothquant_example():
    """
    Demonstrate INT8 SmoothQuant loading and sampling via optimum-intel.

    Prints an install hint and returns early when ``optimum`` /
    ``optimum.intel`` cannot be imported; otherwise loads ``gpt2`` with the
    quantization options below, samples a continuation and prints it.
    """
    try:
        import optimum  # noqa: F401 - imported only to verify availability
        from optimum.intel import IPEXModelForCausalLM
    except ImportError:
        for message in (
            "SmoothQuant requires the optimum-intel package.",
            "Install with: pip install optimum[intel]",
        ):
            print(message)
        return

    print("Running SmoothQuant quantization example...")

    # A small model keeps the demonstration quick
    demo_model_id = "gpt2"
    demo_tokenizer = AutoTokenizer.from_pretrained(demo_model_id)

    # NOTE(review): these keyword arguments are taken as written; confirm that
    # the installed optimum-intel release accepts them.
    print("Loading model with SmoothQuant INT8 quantization...")
    load_options = {
        "export": True,
        "quantization_approach": "smooth_quant",
        "target_precision": "int8",
    }
    int8_model = IPEXModelForCausalLM.from_pretrained(demo_model_id, **load_options)

    input_text = "Artificial intelligence will"
    encoded_prompt = demo_tokenizer(input_text, return_tensors="pt")

    sampling_options = {
        "max_new_tokens": 50,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
    }
    generated_ids = int8_model.generate(**encoded_prompt, **sampling_options)

    generated_text = demo_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(f"Input: {input_text}")
    print(f"Generated: {generated_text}")
    print("-" * 50)

    print("SmoothQuant example completed")

def _benchmark_generation(model, tokenizer, prompt, num_runs=10, max_new_tokens=50):
    """Time greedy generation for an already-loaded model.

    Returns:
        ``(mean_seconds_per_run, peak_cuda_memory_gb)``. The memory figure is
        0.0 when CUDA is not available.
    """
    # Local import so this helper does not depend on `time` having been
    # imported elsewhere in the notebook.
    import time

    on_cuda = torch.cuda.is_available()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Synchronize around the timed region so queued GPU work is not attributed
    # to the wrong side of the clock.
    if on_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(num_runs):
            model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,  # Deterministic for fair comparison
            )
    if on_cuda:
        torch.cuda.synchronize()
    seconds_per_run = (time.perf_counter() - start) / num_runs

    peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 ** 3) if on_cuda else 0.0
    return seconds_per_run, peak_memory_gb


def quantize_and_compare_performance():
    """
    Quantize a model with different methods and compare performance.

    Loads ``gpt2`` as FP16, 8-bit and 4-bit (bitsandbytes) in turn, measures
    mean generation time, peak CUDA memory and model size for each variant,
    and prints a comparison table. A variant that fails to load or run is
    reported and skipped; the remaining variants still run.

    Returns:
        None. Results are printed.
    """
    model_id = "gpt2"  # Using a small model for demonstration
    prompt = "The future of artificial intelligence will"

    # The tokenizer is shared by every variant, so load it once up front
    # instead of relying on the first variant having succeeded.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    except Exception as e:
        print(f"Error loading tokenizer for {model_id}: {e}")
        return

    on_cuda = torch.cuda.is_available()

    # (result key, banner text, label for error messages, kwargs factory).
    # Factories are called inside the try-block so a missing optional
    # dependency only fails its own variant.
    variants = [
        (
            "fp16", "FP16 baseline", "FP16",
            lambda: {"torch_dtype": torch.float16},
        ),
        (
            "int8", "INT8 quantized", "INT8",
            lambda: {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)},
        ),
        (
            "int4", "INT4 quantized", "INT4",
            lambda: {
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                )
            },
        ),
    ]

    results = {}
    for key, banner, label, make_load_kwargs in variants:
        print(f"\nTesting {banner} model...")
        model = None
        try:
            # Reset the peak counter so each variant reports its own peak
            # rather than the maximum over all variants run so far.
            if on_cuda:
                torch.cuda.reset_peak_memory_stats()

            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map="auto",
                **make_load_kwargs(),
            )

            # Timing starts inside the helper, i.e. after the model is loaded
            inference_time, memory_usage = _benchmark_generation(model, tokenizer, prompt)

            results[key] = {
                "inference_time": inference_time,
                "memory_usage": memory_usage,
                "model_size": get_model_size_in_gb(model),
            }
        except Exception as e:
            print(f"Error testing {label} model: {e}")
        finally:
            # Release the model whether or not the variant succeeded
            model = None
            if on_cuda:
                torch.cuda.empty_cache()

    # Print comparison results
    header = f"{'Model Type':<10} | {'Size (GB)':<10} | {'Memory (GB)':<12} | {'Time (s)':<10}"
    rule = "-" * len(header)
    print("\nPerformance Comparison:")
    print(rule)
    print(header)
    print(rule)

    for model_type, metrics in results.items():
        print(
            f"{model_type:<10} | {metrics['model_size']:<10.2f} | "
            f"{metrics['memory_usage']:<12.2f} | {metrics['inference_time']:<10.4f}"
        )

    print(rule)

# Example usage. Every call below is commented out so that running this cell
# only defines the functions and prints a completion message.
if __name__ == "__main__":
    # `time` is imported here, at module scope (the guard body is not a
    # function), so it is visible to the benchmarking code defined above.
    import time

    # Uncomment any of these to run the respective functions

    # 1. Full GPTQ quantization with model export
    # Note: This requires significant computation
    # model_id = "gpt2"  # Use a small model for testing
    # quantize_with_gptq_full(model_id, output_dir="./gpt2-GPTQ-4bit")

    # 2. Full AWQ quantization with model export
    # model_id = "gpt2"  # Use a small model for testing
    # quantize_with_awq_full(model_id, output_dir="./gpt2-AWQ-4bit")

    # 3. Run ExLlama backend example (for already-quantized GPTQ models)
    # run_exllama_example()

    # 4. Run SmoothQuant example
    # run_smoothquant_example()

    # 5. Compare different quantization methods
    # quantize_and_compare_performance()

    print("Script completed.")

> Practical Example: Loading and Using Quantized Models

In [None]:
# Practical Example: Loading and Using Quantized Models
# ====================================================
# This script demonstrates how to load and use pre-quantized models from Hugging Face

import gc
import time

import torch
from transformers import AutoTokenizer, TextStreamer  # the streaming class exported by transformers is `TextStreamer`

def load_and_run_4bit_model():
    """
    Load a 4-bit quantized model from Hugging Face and run inference.

    Loads the model with a bitsandbytes NF4 configuration, reports load time
    and peak CUDA memory, streams a sampled completion for a fixed prompt to
    stdout, and releases the model afterwards (also when generation fails).

    Returns:
        None. All output is printed.
    """
    print("\n=== Loading 4-bit Quantized Model ===")

    # Function-scope imports, so the heavier transformers classes are only
    # needed when this demo actually runs.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer

    # Model ID - you can replace this with any model that supports 4-bit quantization
    model_id = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your model choice

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,                       # Load in 4-bit precision
        bnb_4bit_compute_dtype=torch.bfloat16,   # Compute dtype (bfloat16 or float16)
        bnb_4bit_use_double_quant=True,          # Use nested quantization
        bnb_4bit_quant_type="nf4",               # "nf4" (4-bit NormalFloat) or "fp4" (4-bit float)
    )

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with quantization
    print("Loading model in 4-bit precision...")
    if torch.cuda.is_available():
        # Report this model's peak, not one left over from earlier cells
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,  # Apply quantization
        device_map="auto",                        # Automatically distribute across available devices
        trust_remote_code=True,                   # Executes code from the model repo; only use with trusted repos
    )

    try:
        loading_time = time.time() - start_time
        print(f"Model loaded in {loading_time:.2f} seconds")

        # Calculate and display memory usage
        memory_used = torch.cuda.max_memory_allocated() / (1024 ** 3)  # Convert to GB
        print(f"GPU memory used: {memory_used:.2f} GB")

        # Run sample inference
        print("\nRunning inference...")

        # LLaMA-2-Chat prompt format. The tokenizer call below adds special
        # tokens by default, so a literal "<s>" here would yield a second BOS.
        prompt = "[INST] Write a short paragraph about climate change. [/INST]"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Stream tokens to stdout as they are generated
        streamer = TextStreamer(tokenizer)

        # Generate text
        print("\nGenerated text:")
        with torch.no_grad():
            _ = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                streamer=streamer
            )
    finally:
        # Drop the reference and collect first, so the cached CUDA blocks are
        # actually free by the time empty_cache() runs.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print("\n4-bit model demo completed")

def load_and_run_8bit_model():
    """
    Load an 8-bit quantized model from Hugging Face and run inference.

    Loads the model with a bitsandbytes LLM.int8() configuration, reports load
    time and peak CUDA memory, streams a sampled completion for a fixed prompt
    to stdout, and releases the model afterwards (also when generation fails).

    Returns:
        None. All output is printed.
    """
    print("\n=== Loading 8-bit Quantized Model ===")

    # Function-scope imports, so the heavier transformers classes are only
    # needed when this demo actually runs.
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer

    # Model ID - you can replace this with any model that supports 8-bit quantization
    model_id = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your model choice

    # Configure 8-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,                      # Load in 8-bit precision
        llm_int8_threshold=6.0,                 # LLM.int8() outlier threshold
        llm_int8_has_fp16_weight=False,         # False: do not keep 16-bit main weights
    )

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with quantization
    print("Loading model in 8-bit precision...")
    if torch.cuda.is_available():
        # Report this model's peak, not one left over from earlier cells
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,  # Apply quantization
        device_map="auto",                        # Automatically distribute across available devices
        trust_remote_code=True,                   # Executes code from the model repo; only use with trusted repos
    )

    try:
        loading_time = time.time() - start_time
        print(f"Model loaded in {loading_time:.2f} seconds")

        # Calculate and display memory usage
        memory_used = torch.cuda.max_memory_allocated() / (1024 ** 3)  # Convert to GB
        print(f"GPU memory used: {memory_used:.2f} GB")

        # Run sample inference
        print("\nRunning inference...")

        # LLaMA-2-Chat prompt format. The tokenizer call below adds special
        # tokens by default, so a literal "<s>" here would yield a second BOS.
        prompt = "[INST] Explain the concept of quantum computing in simple terms. [/INST]"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Stream tokens to stdout as they are generated
        streamer = TextStreamer(tokenizer)

        # Generate text
        print("\nGenerated text:")
        with torch.no_grad():
            _ = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                streamer=streamer
            )
    finally:
        # Drop the reference and collect first, so the cached CUDA blocks are
        # actually free by the time empty_cache() runs.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print("\n8-bit model demo completed")

def load_and_run_gptq_model():
    """
    Load a GPTQ-quantized model from Hugging Face and run inference.

    Prints an install hint and returns early when ``auto_gptq`` cannot be
    imported. Otherwise loads a pre-quantized checkpoint, reports load time
    and peak CUDA memory, prints a sampled completion for a fixed prompt, and
    releases the model afterwards (also when generation fails).

    Returns:
        None. All output is printed.
    """
    print("\n=== Loading GPTQ Quantized Model ===")

    try:
        from auto_gptq import AutoGPTQForCausalLM
    except ImportError:
        print("GPTQ support requires the auto-gptq library.")
        print("Install with: pip install auto-gptq")
        return

    # Model ID for a GPTQ-quantized model
    model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"  # Replace with your model choice

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load GPTQ model
    print("Loading GPTQ model...")
    if torch.cuda.is_available():
        # Report this model's peak, not one left over from earlier cells
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    # NOTE(review): confirm that the installed auto-gptq release honours
    # `use_exllama`; some releases expose `disable_exllama` instead.
    model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        use_triton=False,         # Whether to use Triton for inference
        use_exllama=True,         # Whether to use exllama for inference
        device_map="auto",        # Automatically distribute across available devices
        trust_remote_code=True,   # Executes code from the model repo; only use with trusted repos
    )

    try:
        loading_time = time.time() - start_time
        print(f"Model loaded in {loading_time:.2f} seconds")

        # Calculate and display memory usage
        memory_used = torch.cuda.max_memory_allocated() / (1024 ** 3)  # Convert to GB
        print(f"GPU memory used: {memory_used:.2f} GB")

        # Run sample inference
        print("\nRunning inference...")

        # LLaMA-2-Chat prompt format. The tokenizer call below adds special
        # tokens by default, so a literal "<s>" here would yield a second BOS.
        prompt = "[INST] Write a brief poem about artificial intelligence. [/INST]"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate text
        print("\nGenerated text:")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
            )

        # Decode and print generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(generated_text)
    finally:
        # Drop the reference and collect first, so the cached CUDA blocks are
        # actually free by the time empty_cache() runs.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print("\nGPTQ model demo completed")

def load_and_run_awq_model():
    """
    Load an AWQ-quantized model from Hugging Face and run inference.

    Prints an install hint and returns early when ``awq`` cannot be imported.
    Otherwise loads a pre-quantized checkpoint, reports load time and peak
    CUDA memory, prints a sampled completion for a fixed prompt, and releases
    the model afterwards (also when generation fails).

    Returns:
        None. All output is printed.
    """
    print("\n=== Loading AWQ Quantized Model ===")

    try:
        from awq import AutoAWQForCausalLM
    except ImportError:
        print("AWQ support requires the autoawq library.")
        print("Install with: pip install autoawq")
        return

    # Model ID for an AWQ-quantized model
    model_id = "TheBloke/Llama-2-7b-Chat-AWQ"  # Replace with your model choice

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load AWQ model
    print("Loading AWQ model...")
    if torch.cuda.is_available():
        # Report this model's peak, not one left over from earlier cells
        torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    model = AutoAWQForCausalLM.from_quantized(
        model_id,
        device_map="auto",        # Automatically distribute across available devices
        trust_remote_code=True,   # Executes code from the model repo; only use with trusted repos
    )

    try:
        loading_time = time.time() - start_time
        print(f"Model loaded in {loading_time:.2f} seconds")

        # Calculate and display memory usage
        memory_used = torch.cuda.max_memory_allocated() / (1024 ** 3)  # Convert to GB
        print(f"GPU memory used: {memory_used:.2f} GB")

        # Run sample inference
        print("\nRunning inference...")

        # LLaMA-2-Chat prompt format. The tokenizer call below adds special
        # tokens by default, so a literal "<s>" here would yield a second BOS.
        prompt = "[INST] What are the benefits and drawbacks of remote work? [/INST]"

        # NOTE(review): the AutoAWQ wrapper may not expose a `.device`
        # attribute; fall back to the device of its first parameter if not.
        device = getattr(model, "device", None)
        if device is None:
            device = next(model.parameters()).device

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate text
        print("\nGenerated text:")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
            )

        # Decode and print generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(generated_text)
    finally:
        # Drop the reference and collect first, so the cached CUDA blocks are
        # actually free by the time empty_cache() runs.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    print("\nAWQ model demo completed")

def benchmark_and_compare_models():
    """
    Benchmark and compare different quantized models.

    Runs each ``load_and_run_*`` demo in turn, recording for each one whether
    it finished without raising, how long the whole demo took, and the error
    message if it raised. A summary is printed at the end.

    This is a simplified benchmark for demonstration purposes. For a real
    benchmark, you would typically:
      1. Use the same model architecture quantized with different methods
      2. Run multiple iterations with different prompts
      3. Measure throughput (tokens/sec), latency, and memory usage
      4. Compare quality metrics (perplexity, accuracy on tasks)

    Returns:
        None. Results are printed.
    """
    print("\n=== Benchmarking Different Quantized Models ===")

    demos = (
        ("4-bit (NF4)", load_and_run_4bit_model),
        ("8-bit", load_and_run_8bit_model),
        ("GPTQ", load_and_run_gptq_model),
        ("AWQ", load_and_run_awq_model),
    )
    models_to_benchmark = [
        {"name": name, "load_function": load_function, "results": {}}
        for name, load_function in demos
    ]

    # Run benchmarks
    print("Starting benchmark...")

    for model_config in models_to_benchmark:
        name = model_config["name"]
        results = model_config["results"]
        print(f"\nBenchmarking {name} model...")

        started = time.time()
        try:
            # The demo functions return nothing, so "success" only means the
            # call did not raise; a demo that returns early because an
            # optional library is missing is still counted as a success.
            model_config["load_function"]()
            results["success"] = True
        except Exception as e:
            print(f"Error benchmarking {name} model: {e}")
            results["success"] = False
            results["error"] = str(e)
        finally:
            # Wall-clock time of the whole demo (load + generate + cleanup)
            results["elapsed_seconds"] = time.time() - started

    # Report what was collected instead of discarding it
    print("\nBenchmark summary:")
    for model_config in models_to_benchmark:
        results = model_config["results"]
        status = "ok" if results["success"] else "FAILED"
        line = f"  {model_config['name']:<12} | {status:<6} | {results['elapsed_seconds']:.1f}s"
        if not results["success"]:
            line += f" | {results['error']}"
        print(line)

    print("\nBenchmark completed.")

# Main demonstration. Every demo call below is commented out, so running this
# cell only defines the functions above and prints the banner messages.
if __name__ == "__main__":
    print("=== Quantized Model Usage Demonstration ===")
    print("This script shows how to load and use different types of quantized models")

    # Uncomment the functions you want to run.
    # Each function demonstrates loading and running a different type of quantized model;
    # each one loads its own model, so run them one at a time if memory is limited.

    # load_and_run_4bit_model()
    # load_and_run_8bit_model()
    # load_and_run_gptq_model()
    # load_and_run_awq_model()

    # To run a benchmark comparing different models (runs all four demos in sequence)
    # benchmark_and_compare_models()

    print("\nDemonstration completed.")