<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/LLM_GPT2_Entropy_Semantic_Entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/484.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m337.9/484.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

GPT2_FFT

In [4]:
import torch
import os
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import Dataset
import torch.nn.functional as F
import random
import numpy as np

# Seed setting function
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

# Set the seed for reproducibility
seed = 50
set_seed(seed)

# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token if necessary
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare 5 QA samples for training and validation
train_qa_samples = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who wrote '1984'?", "answer": "George Orwell wrote '1984'."},
    {"question": "What is the largest planet?", "answer": "The largest planet is Jupiter."},
    {"question": "Who painted the Mona Lisa?", "answer": "Leonardo da Vinci painted the Mona Lisa."},
    {"question": "What is the speed of light?", "answer": "The speed of light is approximately 299,792 kilometers per second."}
]

valid_qa_samples = [
    {"question": "Which city is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Can you tell me who authored '1984'?", "answer": "George Orwell wrote '1984'."},
    {"question": "What planet is the biggest in our solar system?", "answer": "The largest planet is Jupiter."},
    {"question": "Who is the artist behind the Mona Lisa?", "answer": "Leonardo da Vinci painted the Mona Lisa."},
    {"question": "How fast does light travel?", "answer": "The speed of light is approximately 299,792 kilometers per second."}
]

# Preprocess dataset
def preprocess_data(example):
    input_text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=60)

    # Clone input_ids into labels
    labels = inputs["input_ids"].copy()

    # Mask question tokens and padding tokens in labels
    question_length = len(tokenizer(f"Question: {example['question']}\nAnswer:")["input_ids"]) - 1
    for i in range(len(labels)):
        if i < question_length or labels[i] == tokenizer.pad_token_id:
            labels[i] = tokenizer.eos_token_id  # Ignore question and padding tokens

    inputs["labels"] = labels
    return inputs

# Convert samples to dataset and preprocess
dataset_train = Dataset.from_list(train_qa_samples).map(preprocess_data, remove_columns=["question", "answer"])
dataset_valid = Dataset.from_list(valid_qa_samples).map(preprocess_data, remove_columns=["question", "answer"])

# Convert to PyTorch DataLoader
batch_size = 2

def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch])
    attention_mask = torch.tensor([item["attention_mask"] for item in batch])
    labels = torch.tensor([item["labels"] for item in batch])
    return input_ids, attention_mask, labels

train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(dataset_valid, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Cross-entropy loss function ignoring padding tokens
criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)

# Function to save the best model based on validation loss
def save_best_model(model, tokenizer, epoch, best_loss, current_loss, save_path="./gpt2-qa-best-loss-cml"):
    if current_loss < best_loss:
        best_loss = current_loss
        os.makedirs(save_path, exist_ok=True)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"Best model saved at epoch {epoch} with validation loss: {best_loss:.4f}")
    return best_loss

# Training and evaluation functions
def train(model, train_loader, valid_loader, optimizer, criterion, num_epochs=10):
    best_val_loss = float("inf")

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Shift labels and logits for proper alignment
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten logits and labels for loss calculation
            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
            shift_labels = shift_labels.view(-1)

            # Compute loss
            loss = criterion(shift_logits, shift_labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = validate(model, valid_loader, criterion)

        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        # Save best model based on validation loss
        best_val_loss = save_best_model(model, tokenizer, epoch + 1, best_val_loss, avg_val_loss)

def validate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Shift labels and logits for proper alignment
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten logits and labels for loss calculation
            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
            shift_labels = shift_labels.view(-1)

            # Compute loss
            loss = criterion(shift_logits, shift_labels)
            total_loss += loss.item()

    return total_loss / len(dataloader)

# Start training
train(model, train_loader, valid_loader, optimizer, criterion, num_epochs=10)

# Load the best fine-tuned model for inference
model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Inference function
def generate_answer(question):
    model.eval()
    input_text = f"Question: {question} Answer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=60).to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
    answer = tokenizer.decode(output[0], skip_special_tokens=True).split("Answer:")[-1].strip()
    return answer

# Example inference
question = "What is the capital of France?"
answer = generate_answer(question)
print(f"Q: {question}\nA: {answer}")


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



Epoch 1/10, Training Loss: 4.4203, Validation Loss: 1.5726
Best model saved at epoch 1 with validation loss: 1.5726
Epoch 2/10, Training Loss: 0.9883, Validation Loss: 0.9799
Best model saved at epoch 2 with validation loss: 0.9799
Epoch 3/10, Training Loss: 0.7012, Validation Loss: 0.6840
Best model saved at epoch 3 with validation loss: 0.6840
Epoch 4/10, Training Loss: 0.4719, Validation Loss: 0.4885
Best model saved at epoch 4 with validation loss: 0.4885
Epoch 5/10, Training Loss: 0.2782, Validation Loss: 0.4139
Best model saved at epoch 5 with validation loss: 0.4139
Epoch 6/10, Training Loss: 0.2091, Validation Loss: 0.3706
Best model saved at epoch 6 with validation loss: 0.3706
Epoch 7/10, Training Loss: 0.1939, Validation Loss: 0.3273
Best model saved at epoch 7 with validation loss: 0.3273
Epoch 8/10, Training Loss: 0.1228, Validation Loss: 0.2700
Best model saved at epoch 8 with validation loss: 0.2700
Epoch 9/10, Training Loss: 0.1057, Validation Loss: 0.2148
Best model sa

#Naive Entropy

In [3]:
import torch
import os
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import Dataset
import torch.nn.functional as F
import random
import numpy as np

model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Inference function with entropy-based uncertainty calculation
def generate_answer_with_entropy(question):
    model.eval()
    input_text = f"Question: {question} Answer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=60).to(device)

    # Generate with output scores to get logits at each generation step
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id,
            output_scores=True,
            return_dict_in_generate=True
        )

    # Calculate entropy for each generated token (from the output scores)
    entropies = []
    for score in output.scores:
        # score shape: [batch_size, vocab_size]
        probs = F.softmax(score, dim=-1)
        entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1)  # entropy for each sample in the batch
        entropies.append(entropy)
    # Average entropy over generated tokens
    mean_entropy = torch.mean(torch.stack(entropies))

    # Decode generated text and extract answer part
    answer = tokenizer.decode(output.sequences[0], skip_special_tokens=True).split("Answer:")[-1].strip()
    return answer, mean_entropy.item()

# Example inference
question = "What is the capital of France?"
answer, uncertainty = generate_answer_with_entropy(question)
print(f"Q: {question}\nA: {answer}\nMean Entropy (Uncertainty): {uncertainty:.4f}")


Q: What is the capital of France?
A: The capital of France is Paris.
Mean Entropy (Uncertainty): 0.7557


#Semantic Entropy with additional forward pass

In [9]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import KMeans

# Load GPT-2 model and tokenizer
model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def generate_answer_with_semantic_entropy(question, n_clusters=5):
    model.eval()
    input_text = f"Question: {question} Answer:"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=60).to(device)
    input_length = inputs['input_ids'].shape[1]

    # Generate output (without hidden states, as they might not include the generated tokens)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True
        )

    generated_ids = output.sequences  # shape: [batch_size, sequence_length]
    generated_sequence_length = generated_ids.shape[1]
    print(f"Input length: {input_length}, Generated sequence length: {generated_sequence_length}")

    # Decode full generated text
    full_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    answer = full_text.split("Answer:")[-1].strip()

    # Now, run a forward pass on the entire generated sequence to get hidden states
    with torch.no_grad():
        outputs = model(generated_ids, output_hidden_states=True)

    # Extract the hidden states from the last layer: shape [batch_size, sequence_length, hidden_dim]
    last_hidden = outputs.hidden_states[-1]

    # Get the hidden states corresponding to the tokens generated after the prompt
    generated_hidden = last_hidden[0, input_length:, :]
    num_tokens = generated_hidden.shape[0]

    if num_tokens == 0:
        return answer, 0.0

    # Convert hidden states to numpy array and perform clustering
    generated_hidden_np = generated_hidden.cpu().numpy()
    k = min(n_clusters, num_tokens)
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(generated_hidden_np)

    # Compute probability distribution over clusters
    counts = np.bincount(cluster_labels, minlength=k)
    probs = counts / np.sum(counts)

    # Compute semantic entropy
    epsilon = 1e-10
    semantic_entropy = -np.sum(probs * np.log(probs + epsilon))

    return answer, semantic_entropy

# Example usage:
question = "What is the capital of France?"
answer, sem_entropy = generate_answer_with_semantic_entropy(question)
print(f"Q: {question}\nA: {answer}\nSemantic Entropy: {sem_entropy:.4f}")


Input length: 11, Generated sequence length: 19
Q: What is the capital of France?
A: The capital of France is Paris.
Semantic Entropy: 1.4942


#Semantic Entropy with Custom Generation Function

In [7]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import KMeans

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load GPT-2 model and tokenizer
model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

def custom_generate(input_text, max_new_tokens=50, n_clusters=5):
    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=60).to(device)
    input_ids = inputs['input_ids']  # shape: [1, seq_length]

    # List to store the hidden states for the last token at each generation step
    collected_hidden_states = []  # Each element is a list of per-layer hidden states

    # Initialize generated_ids with the input_ids
    generated_ids = input_ids

    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Disable caching to recompute full hidden states for the entire sequence
            outputs = model(generated_ids, output_hidden_states=True, use_cache=False)
            # outputs.hidden_states is a tuple of length (n_layers + 1)
            # Extract the hidden states for the last token from each layer
            step_hidden = [layer_hidden[0, -1, :].unsqueeze(0) for layer_hidden in outputs.hidden_states]
            collected_hidden_states.append(step_hidden)

            # Greedy sampling for the next token
            next_token_logits = outputs.logits[:, -1, :]  # shape: [1, vocab_size]
            next_token_id = next_token_logits.argmax(dim=-1, keepdim=True)  # shape: [1, 1]
            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

            # Stop if EOS token is generated
            if next_token_id.item() == tokenizer.eos_token_id:
                break

    # Decode the full generated sequence
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # For semantic entropy, use the hidden states from the final layer for each generated token
    # Each element in collected_hidden_states is a list of per-layer representations;
    # We take the last element (final layer) from each generation step.
    last_layer_hidden_states = [step[-1] for step in collected_hidden_states]  # list of [1, hidden_dim]
    if len(last_layer_hidden_states) == 0:
        semantic_entropy = 0.0
    else:
        # Stack into a tensor of shape [num_generated_tokens, hidden_dim]
        hidden_tensor = torch.cat(last_layer_hidden_states, dim=0)
        num_tokens = hidden_tensor.shape[0]

        # Convert to numpy for clustering
        hidden_np = hidden_tensor.cpu().numpy()
        k = min(n_clusters, num_tokens)
        if k == 0:
            semantic_entropy = 0.0
        else:
            kmeans = KMeans(n_clusters=k, random_state=0)
            cluster_labels = kmeans.fit_predict(hidden_np)
            counts = np.bincount(cluster_labels, minlength=k)
            probs = counts / np.sum(counts)
            epsilon = 1e-10
            semantic_entropy = -np.sum(probs * np.log(probs + epsilon))

    return generated_text, semantic_entropy

# Example usage:
input_text = "Question: What is the capital of France? Answer:"
generated_text, sem_entropy = custom_generate(input_text, max_new_tokens=50, n_clusters=5)
print("Generated text:")
print(generated_text)
print("\nSemantic Entropy: {:.4f}".format(sem_entropy))


Generated text:
Question: What is the capital of France? Answer: The capital of France is Paris.

Semantic Entropy: 1.5596


#Semantic Entropy with Custom Generation Function and Semantic Entropy Function

In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import KMeans

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def compute_semantic_entropy(last_layer_hidden_states, n_clusters=5):
    """
    Compute semantic entropy from a list of hidden states from the final layer.

    Args:
        last_layer_hidden_states (list[torch.Tensor]): List of tensors of shape [1, hidden_dim]
            representing the hidden state for the last generated token at each generation step.
        n_clusters (int): Number of clusters to form for computing entropy.

    Returns:
        float: The semantic entropy computed from the cluster distribution.
    """
    if not last_layer_hidden_states:
        return 0.0

    # Stack hidden states into a tensor of shape [num_generated_tokens, hidden_dim]
    hidden_tensor = torch.cat(last_layer_hidden_states, dim=0)
    num_tokens = hidden_tensor.shape[0]

    # Convert to numpy array for clustering
    hidden_np = hidden_tensor.cpu().numpy()

    # Use at most n_clusters (but not more than number of tokens)
    k = min(n_clusters, num_tokens)
    if k == 0:
        return 0.0

    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(hidden_np)

    # Calculate the probability distribution of tokens across clusters
    counts = np.bincount(cluster_labels, minlength=k)
    probs = counts / np.sum(counts)

    # Compute entropy: -sum(p * log(p))
    epsilon = 1e-10  # avoid log(0)
    semantic_entropy = -np.sum(probs * np.log(probs + epsilon))

    return semantic_entropy

def custom_generate(input_text, max_new_tokens=50, n_clusters=5):
    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=60).to(device)
    input_ids = inputs['input_ids']  # shape: [1, seq_length]

    # List to store the hidden states for the last token at each generation step
    collected_hidden_states = []  # Each element is a list of per-layer hidden states

    # Initialize generated_ids with the input_ids
    generated_ids = input_ids

    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Disable caching to recompute full hidden states for the entire sequence
            outputs = model(generated_ids, output_hidden_states=True, use_cache=False)
            # outputs.hidden_states is a tuple of length (n_layers + 1)
            # Extract the hidden states for the last token from each layer
            step_hidden = [layer_hidden[0, -1, :].unsqueeze(0) for layer_hidden in outputs.hidden_states]
            collected_hidden_states.append(step_hidden)

            # Greedy sampling for the next token
            next_token_logits = outputs.logits[:, -1, :]  # shape: [1, vocab_size]
            next_token_id = next_token_logits.argmax(dim=-1, keepdim=True)  # shape: [1, 1]
            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

            # Stop if EOS token is generated
            if next_token_id.item() == tokenizer.eos_token_id:
                break

    # Decode the full generated sequence
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # For semantic entropy, use the hidden states from the final layer for each generated token
    # Each element in collected_hidden_states is a list of per-layer representations;
    # We take the last element (final layer) from each generation step.
    last_layer_hidden_states = [step[-1] for step in collected_hidden_states]  # list of [1, hidden_dim]
    if len(last_layer_hidden_states) == 0:
        semantic_entropy = 0.0
    else:
        # Stack into a tensor of shape [num_generated_tokens, hidden_dim]
        semantic_entropy = compute_semantic_entropy(last_layer_hidden_states, n_clusters=5)

    return generated_text, semantic_entropy


# Load GPT-2 model and tokenizer
model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


# Example usage:
input_text = "Question: What is the capital of France? Answer:"
generated_text, sem_entropy = custom_generate(input_text, max_new_tokens=50, n_clusters=5)
print("Generated text:")
print(generated_text)
print("\nSemantic Entropy: {:.4f}".format(sem_entropy))


Generated text:
Question: What is the capital of France? Answer: The capital of France is Paris.

Semantic Entropy: 1.5596


#Batch: Semantic Entropy with additional forward pass

In [14]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F
import numpy as np
from sklearn.cluster import KMeans

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate_answers_with_semantic_entropy(questions, n_clusters=5, max_new_tokens=50,
                                             force_min_length=False, min_new_tokens=10):
    """
    For a batch of questions, generate answers and compute semantic entropy for the generated tokens.

    Args:
        questions (list of str): List of question strings.
        n_clusters (int): Number of clusters for KMeans.
        max_new_tokens (int): Maximum number of new tokens to generate.
        force_min_length (bool): If True, force a minimum generation length for each sample.
        min_new_tokens (int): Number of new tokens to force beyond the prompt if force_min_length is True.

    Returns:
        answers (list of str): Decoded answers for each question.
        semantic_entropies (list of float): Semantic entropy for each answer.
    """
    # Create input texts
    input_texts = [f"Question: {q} Answer:" for q in questions]

    # Batch tokenize with padding
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=60).to(device)
    input_ids = inputs['input_ids']  # [batch_size, seq_length]
    prompt_lengths = inputs['attention_mask'].sum(dim=1)  # [batch_size]
    batch_size = input_ids.shape[0]

    # Set generation parameters; disable forced min_length in batch mode if not desired
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "return_dict_in_generate": True,
        "early_stopping": True,
        "no_repeat_ngram_size": 2
    }
    if force_min_length:
        generation_kwargs["min_length"] = input_ids.shape[1] + min_new_tokens

    # Generate outputs for the batch
    with torch.no_grad():
        output = model.generate(**inputs, **generation_kwargs)

    generated_ids = output.sequences  # [batch_size, total_sequence_length]
    generated_seq_length = generated_ids.shape[1]
    print(f"Generated sequence length: {generated_seq_length}")

    # Decode full generated texts for each sample
    full_texts = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(batch_size)]
    answers = [text.split("Answer:")[-1].strip() for text in full_texts]

    # Run a forward pass on the full generated sequences to obtain hidden states
    with torch.no_grad():
        outputs = model(generated_ids, output_hidden_states=True)
    last_hidden = outputs.hidden_states[-1]  # shape: [batch_size, total_seq_length, hidden_dim]

    semantic_entropies = []
    # Compute semantic entropy for each sample
    for i in range(batch_size):
        prompt_len = int(prompt_lengths[i].item())
        generated_hidden = last_hidden[i, prompt_len:, :]  # hidden states for tokens after the prompt
        num_tokens = generated_hidden.shape[0]

        if num_tokens == 0:
            semantic_entropies.append(0.0)
            continue

        # Convert to numpy and cluster
        hidden_np = generated_hidden.cpu().numpy()
        k = min(n_clusters, num_tokens)
        if k == 0:
            semantic_entropies.append(0.0)
            continue
        kmeans = KMeans(n_clusters=k, random_state=0)
        cluster_labels = kmeans.fit_predict(hidden_np)
        counts = np.bincount(cluster_labels, minlength=k)
        probs = counts / np.sum(counts)
        epsilon = 1e-10
        semantic_entropy = -np.sum(probs * np.log(probs + epsilon))
        semantic_entropies.append(semantic_entropy)

    return answers, semantic_entropies

# Load GPT-2 model and tokenizer (using your custom model if applicable)
model_name = "./gpt2-qa-best-loss-cml"
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Example usage with batch size of 2:
questions = [
    "What is the capital of France?",
    "Who is the artist behind the Mona Lisa?"
]
# For batch mode, we disable forced min_length to let each sample generate naturally.
answers, sem_entropies = generate_answers_with_semantic_entropy(questions, n_clusters=5,
                                                                  max_new_tokens=50, force_min_length=False)
for i, q in enumerate(questions):
    print(f"Q: {q}\nA: {answers[i]}\nSemantic Entropy: {sem_entropies[i]:.4f}\n")


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated sequence length: 27
Q: What is the capital of France?
A: The capital is Paris.
Semantic Entropy: 0.9089

Q: Who is the artist behind the Mona Lisa?
A: The Monma Lisa is Leonardo da Vinci's painting.
Semantic Entropy: 1.4791

