# LLaMA Embeddings for Manifold Distance Analysis

This notebook extracts embeddings from a LLaMA model for the hallucination detection project.
We'll analyze the embedding space to identify manifolds and measure distances.


In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import umap
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_distances

# Pick the compute device: use the GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


## Load Sample Data


In [None]:
# Load sample prompts
# JSON files are UTF-8 encoded; pin the encoding so non-ASCII prompts are read
# the same way regardless of the platform's default locale encoding.
with open('../data/sample_prompts.json', 'r', encoding='utf-8') as f:
    prompts = json.load(f)

# Echo the prompts with 1-based numbering as a quick sanity check
print(f"Loaded {len(prompts)} sample prompts:")
for i, prompt in enumerate(prompts, start=1):
    print(f"{i}. {prompt}")


## Load LLaMA Model and Tokenizer

We'll use a smaller LLaMA model for faster processing. You can change this to a larger model if needed.


In [None]:
# Load LLaMA model and tokenizer
# Using a smaller model for faster processing - you can change this to larger models
model_name = "meta-llama/Llama-2-7b-hf"  # or "meta-llama/Llama-2-13b-hf" for larger model

print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# LLaMA tokenizers ship without a padding token, and any tokenizer call that
# requests padding raises a ValueError when none is set. Fall back to the
# end-of-sequence token so padded tokenization works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move model to device
model = model.to(device)
# Inference mode: disables dropout and other training-only behaviour
model.eval()

print(f"Model loaded successfully. Hidden size: {model.config.hidden_size}")


## Extract Embeddings

We'll extract one embedding per prompt by averaging the model's last hidden states over the prompt's non-padding tokens.


In [None]:
def extract_embeddings(texts, model, tokenizer, device, max_length=512):
    """Extract one mean-pooled embedding per text.

    Each text is tokenized and passed through the model on its own. The
    model's last hidden states are then averaged over the positions that the
    attention mask marks as real tokens.

    Args:
        texts: Iterable of strings to embed.
        model: Model whose output exposes ``last_hidden_state`` and whose
            ``config`` exposes ``hidden_size``; it should already live on
            ``device``.
        tokenizer: Tokenizer matching ``model``.
        device: Torch device the tokenized inputs are moved to.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated.

    Returns:
        A float32 ``numpy.ndarray`` with one row per text and
        ``hidden_size`` columns. Empty input yields an array with zero rows.
    """
    embeddings = []

    with torch.no_grad():
        for text in tqdm(texts, desc="Extracting embeddings"):
            # Each text is tokenized alone, so padding would be a no-op.
            # Requesting it anyway raises a ValueError for tokenizers that
            # have no pad token (the default for LLaMA), so it is omitted.
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get model outputs
            outputs = model(**inputs)

            # Pool in float32: half-precision sums over many tokens can
            # overflow, and bfloat16 tensors cannot be converted to NumPy.
            last_hidden_states = outputs.last_hidden_state.float()
            mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)

            # Zero out masked positions, then divide by the number of real
            # tokens; the clamp guards against division by zero if a
            # sequence has no unmasked tokens.
            token_counts = mask.sum(dim=1).clamp(min=1)
            mean_embedding = (last_hidden_states * mask).sum(dim=1) / token_counts

            embeddings.append(mean_embedding.cpu().numpy())

    if not embeddings:
        # np.vstack rejects an empty list; return a well-formed empty matrix
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.vstack(embeddings)

# Embed every loaded prompt and report the shape of the resulting matrix
print("Extracting embeddings...")
embeddings = extract_embeddings(
    texts=prompts,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f"Embeddings shape: {embeddings.shape}")
