# CELL 1: Setup for LLaMA 2





In [1]:

from google.colab import drive
import sys
import torch

drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/RAG_Research/src')

# Install required packages for LLaMA
!pip install -q transformers>=4.33.0
!pip install -q accelerate
!pip install -q bitsandbytes  # For quantization
!pip install -q sentencepiece  # For LLaMA tokenizer

from colab_utils import ColabUtils
utils = ColabUtils()

# Check if we have enough resources for LLaMA
info = utils.get_runtime_info()
if info['gpu_total'] < 15:  # Less than 15GB GPU
    print("⚠️ Using 8-bit quantization for LLaMA due to memory constraints")
    USE_8BIT = True
else:
    print("✅ Sufficient GPU memory for full precision")
    USE_8BIT = False

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m133.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━

# CELL 2: Load LLaMA 2 Model\

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

class LLaMAManager:
    """LLaMA 2 model management for Colab"""

    def __init__(self, model_size="7b"):
        self.model_size = model_size
        self.model_name = f"meta-llama/Llama-2-{model_size}-chat-hf"
        self.model = None
        self.tokenizer = None

    def load_model(self, use_8bit=True):
        """Load LLaMA model with optimization"""
        print(f"📥 Loading LLaMA 2 {self.model_size}...")

        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token

            # Configure quantization if needed
            if use_8bit:
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    bnb_8bit_compute_dtype=torch.float16
                )
                model_kwargs = {
                    "quantization_config": quantization_config,
                    "device_map": "auto"
                }
            else:
                model_kwargs = {
                    "torch_dtype": torch.float16,
                    "device_map": "auto"
                }

            # Load model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )

            print("✅ LLaMA 2 loaded successfully")
            return True

        except Exception as e:
            print(f"❌ Error loading LLaMA 2: {e}")
            return False

    def generate_response(self, prompt, max_new_tokens=256):
        """Generate response with LLaMA 2"""
        if not self.model or not self.tokenizer:
            return "Model not loaded"

        # Format prompt for chat model
        system_message = "You are a helpful assistant. Answer the question based on the provided context."
        formatted_prompt = f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{prompt} [/INST]"

        try:
            inputs = self.tokenizer.encode(formatted_prompt, return_tensors="pt")

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=0.1,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract only the response part
            response = response.split("[/INST]")[-1].strip()

            return response

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            return "Error generating response"

# Load LLaMA 2 7B
llama_manager = LLaMAManager("7b")
llama_success = llama_manager.load_model(use_8bit=USE_8BIT)


📥 Loading LLaMA 2 7b...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


❌ Error loading LLaMA 2: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-6859baa2-6a86ba1d1ebee2f575a6d393;43ccbeed-9b46-4f7d-945e-b3d2674a2ac8)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.


In [3]:
# Install required dependencies for RAG pipeline
import subprocess
import sys

def install_package(package_name):
    """Install a package using pip"""
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        print(f"✅ Successfully installed {package_name}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package_name}: {e}")
        return False

# Required packages for RAG with LLaMA
required_packages = [
    "chromadb",           # Vector database
    "sentence-transformers",  # For embeddings
    "python-dotenv",      # For .env file loading
    "datasets",           # For data handling
    "accelerate",         # For model optimization
    "bitsandbytes",       # For quantization
    "langchain",          # Optional: for RAG utilities
    "faiss-cpu"           # Alternative vector database
]

print("🔧 Installing required packages...")
print("=" * 50)

failed_packages = []
for package in required_packages:
    print(f"\n📦 Installing {package}...")
    if not install_package(package):
        failed_packages.append(package)

print("\n" + "=" * 50)
if failed_packages:
    print(f"❌ Failed to install: {', '.join(failed_packages)}")
    print("Try installing them manually:")
    for pkg in failed_packages:
        print(f"!pip install {pkg}")
else:
    print("✅ All packages installed successfully!")

print("\n🔄 Restarting runtime is recommended after installation...")
print("Go to Runtime -> Restart Runtime in Colab menu")

🔧 Installing required packages...

📦 Installing chromadb...
✅ Successfully installed chromadb

📦 Installing sentence-transformers...
✅ Successfully installed sentence-transformers

📦 Installing python-dotenv...
✅ Successfully installed python-dotenv

📦 Installing datasets...
✅ Successfully installed datasets

📦 Installing accelerate...
✅ Successfully installed accelerate

📦 Installing bitsandbytes...
✅ Successfully installed bitsandbytes

📦 Installing langchain...
✅ Successfully installed langchain

📦 Installing faiss-cpu...
✅ Successfully installed faiss-cpu

✅ All packages installed successfully!

🔄 Restarting runtime is recommended after installation...
Go to Runtime -> Restart Runtime in Colab menu


In [4]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
import torch

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ .env file loaded successfully")
except ImportError:
    # Install python-dotenv if not available
    os.system('pip install python-dotenv')
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ Installed python-dotenv and loaded .env file")

class LLaMAManager:
    """LLaMA 2 model management for Colab with authentication"""

    def __init__(self, model_size="7b"):
        self.model_size = model_size
        self.model_name = f"meta-llama/Llama-2-{model_size}-chat-hf"
        self.model = None
        self.tokenizer = None

    def authenticate(self):
        """Authenticate with Hugging Face"""
        try:
            # Get token from environment variable (loaded from .env)
            token = os.getenv('HUGGINGFACE_TOKEN')
            if not token:
                print("❌ HUGGINGFACE_TOKEN not found in environment variables")
                print("Make sure your .env file contains: HUGGINGFACE_TOKEN=hf_your_token_here")
                print("Current environment variables related to HF:")
                hf_vars = {k: v[:10] + "..." if len(v) > 10 else v
                          for k, v in os.environ.items() if 'HF' in k.upper() or 'HUGGING' in k.upper()}
                print(hf_vars)
                return False

            # Login to Hugging Face
            login(token=token)
            print("✅ Successfully authenticated with Hugging Face")
            return True

        except Exception as e:
            print(f"❌ Authentication failed: {e}")
            return False

    def load_model(self, use_8bit=True):
        """Load LLaMA model with optimization"""
        print(f"📥 Loading LLaMA 2 {self.model_size}...")

        # First authenticate
        if not self.authenticate():
            return False

        try:
            # Load tokenizer with authentication
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_auth_token=True  # Use the authenticated token
            )
            self.tokenizer.pad_token = self.tokenizer.eos_token

            # Configure quantization if needed
            if use_8bit:
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    bnb_8bit_compute_dtype=torch.float16
                )
                model_kwargs = {
                    "quantization_config": quantization_config,
                    "device_map": "auto",
                    "use_auth_token": True
                }
            else:
                model_kwargs = {
                    "torch_dtype": torch.float16,
                    "device_map": "auto",
                    "use_auth_token": True
                }

            # Load model with authentication
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )

            print("✅ LLaMA 2 loaded successfully")
            return True

        except Exception as e:
            print(f"❌ Error loading LLaMA 2: {e}")
            print("Make sure you have:")
            print("1. Accepted the LLaMA 2 license at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf")
            print("2. Set your HUGGINGFACE_TOKEN correctly")
            return False

    def generate_response(self, prompt, max_new_tokens=256, temperature=0.7):
        """Generate response with LLaMA 2"""
        if not self.model or not self.tokenizer:
            return "Model not loaded"

        # Format prompt for chat model
        system_message = "You are a helpful assistant. Answer the question based on the provided context."
        formatted_prompt = f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{prompt} [/INST]"

        try:
            inputs = self.tokenizer.encode(formatted_prompt, return_tensors="pt")

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract only the response part
            response = response.split("[/INST]")[-1].strip()

            return response

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            return "Error generating response"

# Check if .env file exists and show its contents (for debugging)
if os.path.exists('.env'):
    print("📁 .env file found")
    with open('.env', 'r') as f:
        lines = f.readlines()
        print(f"📝 .env contains {len(lines)} line(s)")
        # Show variable names (not values for security)
        for line in lines:
            if '=' in line and not line.strip().startswith('#'):
                var_name = line.split('=')[0].strip()
                print(f"   - {var_name}")
else:
    print("❌ .env file not found in current directory")
    print(f"Current directory: {os.getcwd()}")
    print("Available files:", [f for f in os.listdir('.') if f.startswith('.')])

# Load LLaMA 2 7B
USE_8BIT = True  # Set to False if you have enough VRAM
llama_manager = LLaMAManager("7b")
llama_success = llama_manager.load_model(use_8bit=USE_8BIT)

# Test the model if loading was successful
if llama_success:
    test_response = llama_manager.generate_response("What is artificial intelligence?")
    print(f"\n🤖 Test Response: {test_response}")
else:
    print("❌ Model loading failed. Please check authentication and repository access.")

✅ .env file loaded successfully
📁 .env file found
📝 .env contains 2 line(s)
   - HUGGINGFACE_TOKEN
   - OPENROUTER_API_KEY
📥 Loading LLaMA 2 7b...
✅ Successfully authenticated with Hugging Face




tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✅ LLaMA 2 loaded successfully
❌ Error generating response: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

🤖 Test Response: Error generating response




In [5]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ .env file loaded successfully")
except ImportError:
    os.system('pip install python-dotenv')
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ Installed python-dotenv and loaded .env file")

class LLaMAManager:
    """LLaMA 2 model management with device fixes"""

    def __init__(self, model_size="7b"):
        self.model_size = model_size
        self.model_name = f"meta-llama/Llama-2-{model_size}-chat-hf"
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🔧 Using device: {self.device}")

    def authenticate(self):
        """Authenticate with Hugging Face"""
        try:
            token = os.getenv('HUGGINGFACE_TOKEN')
            if not token:
                print("❌ HUGGINGFACE_TOKEN not found in environment variables")
                return False

            login(token=token)
            print("✅ Successfully authenticated with Hugging Face")
            return True

        except Exception as e:
            print(f"❌ Authentication failed: {e}")
            return False

    def load_model(self, use_8bit=True):
        """Load LLaMA model with proper device management"""
        print(f"📥 Loading LLaMA 2 {self.model_size}...")

        if not self.authenticate():
            return False

        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_auth_token=True
            )
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print("✅ Tokenizer loaded")

            # Configure model loading based on device and memory
            if use_8bit and torch.cuda.is_available():
                # 8-bit quantization for GPU
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    bnb_8bit_compute_dtype=torch.float16
                )
                model_kwargs = {
                    "quantization_config": quantization_config,
                    "device_map": "auto",
                    "use_auth_token": True,
                    "torch_dtype": torch.float16,
                    "low_cpu_mem_usage": True
                }
            elif torch.cuda.is_available():
                # GPU without quantization
                model_kwargs = {
                    "torch_dtype": torch.float16,
                    "device_map": "auto",
                    "use_auth_token": True,
                    "low_cpu_mem_usage": True
                }
            else:
                # CPU fallback
                model_kwargs = {
                    "torch_dtype": torch.float32,
                    "device_map": {"": "cpu"},
                    "use_auth_token": True,
                    "low_cpu_mem_usage": True
                }

            # Load model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )

            print("✅ LLaMA 2 model loaded successfully")

            # Print device info
            if hasattr(self.model, 'hf_device_map'):
                print(f"📍 Device map: {self.model.hf_device_map}")

            return True

        except Exception as e:
            print(f"❌ Error loading LLaMA 2: {e}")
            return False

    def generate_response(self, prompt, max_new_tokens=256, temperature=0.7):
        """Generate response with proper device handling"""
        if not self.model or not self.tokenizer:
            return "Model not loaded"

        # Format prompt for chat model
        system_message = "You are a helpful assistant. Answer the question based on the provided context."
        formatted_prompt = f"<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{prompt} [/INST]"

        try:
            # Tokenize input
            inputs = self.tokenizer.encode(formatted_prompt, return_tensors="pt")

            # Move inputs to the same device as model
            if torch.cuda.is_available() and hasattr(self.model, 'device'):
                # For models with device_map, find the device of the first parameter
                if hasattr(self.model, 'hf_device_map'):
                    first_device = list(self.model.hf_device_map.values())[0]
                    inputs = inputs.to(first_device)
                else:
                    inputs = inputs.to(self.device)

            # Generate with proper settings
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    attention_mask=torch.ones_like(inputs)  # Explicit attention mask
                )

            # Decode response
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = response.split("[/INST]")[-1].strip()

            return response

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            print(f"Error type: {type(e).__name__}")

            # Try fallback with simpler generation
            try:
                print("🔄 Trying fallback generation...")
                inputs = self.tokenizer.encode(prompt, return_tensors="pt")

                # Simple generation without advanced parameters
                with torch.no_grad():
                    outputs = self.model.generate(
                        inputs,
                        max_length=inputs.shape[1] + max_new_tokens,
                        pad_token_id=self.tokenizer.eos_token_id
                    )

                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                return response[len(prompt):].strip()

            except Exception as e2:
                print(f"❌ Fallback also failed: {e2}")
                return f"Error generating response: {str(e)}"

# Test the fixed model
print("🚀 Testing LLaMA Manager with device fixes...")

# Load model
USE_8BIT = True
llama_manager = LLaMAManager("7b")
llama_success = llama_manager.load_model(use_8bit=USE_8BIT)

if llama_success:
    print("\n🧪 Testing model generation...")

    # Simple test
    test_prompt = "What is the capital of France?"
    response = llama_manager.generate_response(test_prompt, max_new_tokens=50)
    print(f"✅ Test successful!")
    print(f"Question: {test_prompt}")
    print(f"Response: {response}")

    # Memory cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("🧹 GPU cache cleared")

else:
    print("❌ Model loading failed")

✅ .env file loaded successfully
🚀 Testing LLaMA Manager with device fixes...
🔧 Using device: cuda
📥 Loading LLaMA 2 7b...
✅ Successfully authenticated with Hugging Face




✅ Tokenizer loaded




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ LLaMA 2 model loaded successfully
📍 Device map: {'': 0}

🧪 Testing model generation...
✅ Test successful!
Question: What is the capital of France?
Response: Ah, a simple question, yet one that is near and dear to my heart! *adjusts monocle* The capital of France, my dear, is none other than the majestic Paris! ������
🧹 GPU cache cleared


# CELL 3: LLaMA-specific RAG Pipeline

In [6]:

class LLaMARAGPipeline:
    """RAG pipeline optimized for LLaMA 2"""

    def __init__(self, vector_db, llama_manager):
        self.vector_db = vector_db
        self.llama_manager = llama_manager

    def create_llama_prompt(self, question, context_docs):
        """Create properly formatted prompt for LLaMA"""
        if context_docs and context_docs['documents']:
            context = "\n\n".join([
                f"Document {i+1}: {doc}"
                for i, doc in enumerate(context_docs['documents'])
            ])
        else:
            context = "No relevant context found."

        prompt = f"""Context Information:
{context}

Question: {question}

Please answer the question based on the provided context. If the context doesn't contain enough information, say so clearly."""

        return prompt

    def evaluate_answer(self, question, generated_answer, expected_answer=None):
        """Evaluate the quality of generated answer"""
        metrics = {}

        # Basic metrics
        metrics['answer_length'] = len(generated_answer.split())
        metrics['contains_question_keywords'] = any(
            word.lower() in generated_answer.lower()
            for word in question.split()
            if len(word) > 3
        )

        if expected_answer:
            # Simple overlap metric
            expected_words = set(expected_answer.lower().split())
            generated_words = set(generated_answer.lower().split())

            if len(expected_words) > 0:
                overlap = len(expected_words.intersection(generated_words))
                metrics['word_overlap'] = overlap / len(expected_words)
            else:
                metrics['word_overlap'] = 0

        return metrics

    def run_evaluation(self, test_questions, save_results=True):
        """Run complete evaluation on test questions"""
        results = []

        for i, question_data in enumerate(test_questions):
            print(f"\n🔍 Evaluating {i+1}/{len(test_questions)}: {question_data['question'][:50]}...")

            # Retrieve documents
            retrieved = self.vector_db.search(question_data['question'], n_results=3)

            # Create prompt
            prompt = self.create_llama_prompt(question_data['question'], retrieved)

            # Generate answer
            answer = self.llama_manager.generate_response(prompt, max_new_tokens=200)

            # Evaluate
            metrics = self.evaluate_answer(
                question_data['question'],
                answer,
                question_data.get('answer')
            )

            result = {
                'question': question_data['question'],
                'expected_answer': question_data.get('answer', ''),
                'generated_answer': answer,
                'retrieved_docs': retrieved['documents'] if retrieved else [],
                'metrics': metrics,
                'timestamp': time.time()
            }

            results.append(result)

            # Print summary
            print(f"   Generated: {answer[:100]}...")
            print(f"   Metrics: {metrics}")

            # Save progress periodically
            if (i + 1) % 5 == 0 and save_results:
                utils.save_to_drive(results, f"results/llama2_results_partial_{i+1}.json")
                utils.clear_gpu_memory()  # Clear memory

        if save_results:
            utils.save_to_drive(results, "results/llama2_complete_results.json")

        return results

# Test with LLaMA if loaded successfully
if llama_success:
    # Load vector database from previous notebook
    from chromadb import PersistentClient

    client = PersistentClient(path="/content/drive/MyDrive/RAG_Research/data/embeddings")
    try:
        collection = client.get_collection("rag_test")
        vector_db_simple = type('VectorDB', (), {
            'search': lambda self, query, n_results=5: collection.query(
                query_texts=[query], n_results=n_results
            )
        })()

        # Create test questions
        test_questions = [
            {"question": "What is the capital of France?", "answer": "Paris"},
            {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"}
        ]

        # Run LLaMA evaluation
        llama_rag = LLaMARAGPipeline(vector_db_simple, llama_manager)
        llama_results = llama_rag.run_evaluation(test_questions)

        print("✅ LLaMA 2 evaluation completed!")

    except Exception as e:
        print(f"❌ Error setting up evaluation: {e}")
else:
    print("❌ LLaMA 2 not loaded. Please check the model loading cell.")


🔍 Evaluating 1/2: What is the capital of France?...


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:19<00:00, 4.36MiB/s]


❌ Error setting up evaluation: name 'time' is not defined


In [7]:
import os
import time
import torch
from typing import List, Dict, Any

# Import required libraries
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from huggingface_hub import login
    from dotenv import load_dotenv
    import chromadb
    from chromadb.config import Settings
    from sentence_transformers import SentenceTransformer
    print("✅ All imports successful")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Installing missing packages...")
    os.system("pip install transformers huggingface_hub python-dotenv chromadb sentence-transformers")

# Load environment variables
load_dotenv()

class LLaMAManager:
    """Working LLaMA Manager"""

    def __init__(self, model_size="7b"):
        self.model_size = model_size
        self.model_name = f"meta-llama/Llama-2-{model_size}-chat-hf"
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.is_loaded = False

    def authenticate(self):
        """Authenticate with Hugging Face"""
        try:
            token = os.getenv('HUGGINGFACE_TOKEN')
            if not token:
                print("❌ HUGGINGFACE_TOKEN not found")
                return False
            login(token=token)
            print("✅ Authenticated with Hugging Face")
            return True
        except Exception as e:
            print(f"❌ Authentication failed: {e}")
            return False

    def load_model(self, use_cpu=False):
        """Load model with proper device management"""
        if not self.authenticate():
            return False

        try:
            print(f"📥 Loading LLaMA 2 {self.model_size}...")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, use_auth_token=True
            )
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print("✅ Tokenizer loaded")

            # Load model
            if use_cpu or not torch.cuda.is_available():
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float32,
                    device_map="cpu",
                    use_auth_token=True,
                    low_cpu_mem_usage=True
                )
                self.device = "cpu"
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.float16,
                    device_map={"": 0},
                    use_auth_token=True,
                    low_cpu_mem_usage=True
                )

            self.is_loaded = True
            print(f"✅ Model loaded successfully on {self.device}")
            return True

        except Exception as e:
            print(f"❌ Model loading failed: {e}")
            self.is_loaded = False
            return False

    def generate_response(self, prompt, max_new_tokens=200):
        """Generate response"""
        if not self.is_loaded or not self.model:
            return "Model not loaded"

        try:
            # Simple prompt format
            full_prompt = f"Human: {prompt}\nAssistant:"

            inputs = self.tokenizer.encode(full_prompt, return_tensors="pt")
            inputs = inputs.to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            if "Assistant:" in full_response:
                response = full_response.split("Assistant:")[-1].strip()
            else:
                response = full_response[len(full_prompt):].strip()

            return response

        except Exception as e:
            print(f"❌ Generation error: {e}")
            return f"Generation failed: {str(e)}"

class SimpleVectorDB:
    """Simple vector database for RAG"""

    def __init__(self):
        try:
            self.client = chromadb.Client(Settings(anonymized_telemetry=False))
            # Reset collection if it exists
            try:
                self.client.delete_collection("rag_documents")
            except:
                pass
            self.collection = self.client.create_collection("rag_documents")
            self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
            self.is_ready = True
            print("✅ Vector database initialized")
        except Exception as e:
            print(f"❌ Vector DB init failed: {e}")
            self.is_ready = False

    def add_documents(self, documents: List[str]):
        """Add documents to vector database"""
        if not self.is_ready:
            return False

        try:
            embeddings = self.encoder.encode(documents).tolist()
            ids = [f"doc_{i}" for i in range(len(documents))]

            self.collection.add(
                embeddings=embeddings,
                documents=documents,
                ids=ids
            )
            print(f"✅ Added {len(documents)} documents")
            return True
        except Exception as e:
            print(f"❌ Error adding documents: {e}")
            return False

    def search(self, query: str, n_results: int = 3):
        """Search for relevant documents"""
        if not self.is_ready:
            return {'documents': []}

        try:
            query_embedding = self.encoder.encode([query]).tolist()

            results = self.collection.query(
                query_embeddings=query_embedding,
                n_results=n_results
            )

            return {
                'documents': results['documents'][0] if results['documents'] else [],
                'distances': results['distances'][0] if results['distances'] else []
            }

        except Exception as e:
            print(f"❌ Search error: {e}")
            return {'documents': []}

class LLaMARAGPipeline:
    """RAG pipeline optimized for LLaMA 2"""

    def __init__(self, vector_db, llama_manager):
        self.vector_db = vector_db
        self.llama_manager = llama_manager

    def create_llama_prompt(self, question, context_docs):
        """Create properly formatted prompt for LLaMA"""
        if context_docs and context_docs.get('documents'):
            context = "\n\n".join([
                f"Document {i+1}: {doc}"
                for i, doc in enumerate(context_docs['documents'])
            ])
        else:
            context = "No relevant context found."

        prompt = f"""Context Information:
{context}

Question: {question}

Please answer the question based on the provided context. If the context doesn't contain enough information, say so clearly."""

        return prompt

    def evaluate_answer(self, question, generated_answer, expected_answer=None):
        """Evaluate the quality of generated answer"""
        metrics = {}

        # Basic metrics
        metrics['answer_length'] = len(generated_answer.split())
        metrics['contains_question_keywords'] = any(
            word.lower() in generated_answer.lower()
            for word in question.split()
            if len(word) > 3
        )

        if expected_answer:
            # Simple overlap metric
            expected_words = set(expected_answer.lower().split())
            generated_words = set(generated_answer.lower().split())

            if len(expected_words) > 0:
                overlap = len(expected_words.intersection(generated_words))
                metrics['word_overlap'] = overlap / len(expected_words)
            else:
                metrics['word_overlap'] = 0

        return metrics

    def run_evaluation(self, test_questions, save_results=False):
        """Run complete evaluation on test questions"""
        if not self.llama_manager.is_loaded:
            print("❌ LLaMA model not loaded")
            return []

        results = []

        for i, question_data in enumerate(test_questions):
            print(f"\n🔍 Evaluating {i+1}/{len(test_questions)}: {question_data['question'][:50]}...")

            # Retrieve documents
            retrieved = self.vector_db.search(question_data['question'], n_results=3)

            # Create prompt
            prompt = self.create_llama_prompt(question_data['question'], retrieved)

            # Generate answer
            start_time = time.time()
            answer = self.llama_manager.generate_response(prompt, max_new_tokens=200)
            generation_time = time.time() - start_time

            # Evaluate
            metrics = self.evaluate_answer(
                question_data['question'],
                answer,
                question_data.get('answer')
            )

            result = {
                'question': question_data['question'],
                'expected_answer': question_data.get('answer', ''),
                'generated_answer': answer,
                'retrieved_docs': retrieved.get('documents', []),
                'metrics': metrics,
                'generation_time': generation_time,
                'timestamp': time.time()
            }

            results.append(result)

            # Print summary
            print(f"   Generated: {answer[:100]}...")
            print(f"   Expected: {question_data.get('answer', 'N/A')}")
            print(f"   Time: {generation_time:.2f}s")
            print(f"   Metrics: {metrics}")

            # Clear GPU memory periodically
            if torch.cuda.is_available() and (i + 1) % 2 == 0:
                torch.cuda.empty_cache()

        return results

# Main execution function
def main():
    """Run the complete RAG evaluation"""
    print("🚀 Starting Complete RAG Evaluation")
    print("=" * 60)

    # Sample documents for testing
    documents = [
        "Paris is the capital and largest city of France. It is located in the north-central part of the country.",
        "William Shakespeare was an English playwright, poet, and actor. He wrote Romeo and Juliet, one of his most famous tragedies.",
        "The Eiffel Tower is a famous landmark in Paris, France. It was built in 1889 and stands 324 meters tall.",
        "Romeo and Juliet is a tragedy written by William Shakespeare early in his career about two young star-crossed lovers.",
        "France is a country in Western Europe. Its capital is Paris and it has a population of about 67 million people.",
        "Shakespeare wrote many plays including Hamlet, Macbeth, and King Lear in addition to Romeo and Juliet."
    ]

    # Test questions
    test_questions = [
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare"},
        {"question": "What is the Eiffel Tower?", "answer": "A famous landmark in Paris"}
    ]

    # Initialize vector database
    print("\n📚 Setting up vector database...")
    vector_db = SimpleVectorDB()

    if not vector_db.is_ready:
        print("❌ Vector database failed to initialize")
        return

    # Add documents
    vector_db.add_documents(documents)

    # Initialize LLaMA model
    print("\n🤖 Loading LLaMA model...")
    llama_manager = LLaMAManager("7b")

    # Load model (try GPU first, fallback to CPU)
    llama_success = llama_manager.load_model(use_cpu=False)

    if not llama_success:
        print("⚠️ GPU loading failed, trying CPU...")
        llama_success = llama_manager.load_model(use_cpu=True)

    if not llama_success:
        print("❌ Failed to load LLaMA model")
        return

    # Create RAG pipeline
    print("\n🔧 Setting up RAG pipeline...")
    rag_pipeline = LLaMARAGPipeline(vector_db, llama_manager)

    # Run evaluation
    print("\n🔍 Running RAG evaluation...")
    print("=" * 60)

    results = rag_pipeline.run_evaluation(test_questions)

    # Print final results
    print("\n" + "=" * 60)
    print("📊 FINAL EVALUATION RESULTS")
    print("=" * 60)

    for i, result in enumerate(results):
        print(f"\n--- Question {i+1} ---")
        print(f"Q: {result['question']}")
        print(f"Expected: {result['expected_answer']}")
        print(f"Generated: {result['generated_answer']}")
        print(f"Documents used: {len(result['retrieved_docs'])}")
        print(f"Word overlap: {result['metrics'].get('word_overlap', 0):.2f}")
        print(f"Generation time: {result['generation_time']:.2f}s")

    print(f"\n✅ Evaluation completed! Processed {len(results)} questions.")

    # Clean up
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("🧹 GPU memory cleared")

    return results

# Run the complete evaluation
if __name__ == "__main__":
    results = main()

✅ All imports successful
🚀 Starting Complete RAG Evaluation

📚 Setting up vector database...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Vector database initialized
✅ Added 6 documents

🤖 Loading LLaMA model...
✅ Authenticated with Hugging Face
📥 Loading LLaMA 2 7b...




✅ Tokenizer loaded




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ Model loaded successfully on cuda

🔧 Setting up RAG pipeline...

🔍 Running RAG evaluation...

🔍 Evaluating 1/3: What is the capital of France?...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Generated: Based on the provided context, the capital of France is Paris....
   Expected: Paris
   Time: 0.83s
   Metrics: {'answer_length': 11, 'contains_question_keywords': True, 'word_overlap': 0.0}

🔍 Evaluating 2/3: Who wrote Romeo and Juliet?...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


   Generated: Based on the provided context, the answer to your question is William Shakespeare....
   Expected: William Shakespeare
   Time: 0.95s
   Metrics: {'answer_length': 13, 'contains_question_keywords': False, 'word_overlap': 0.5}

🔍 Evaluating 3/3: What is the Eiffel Tower?...
   Generated: The Eiffel Tower is a famous landmark in Paris, France. It was built in 1889 and stands 324 meters t...
   Expected: A famous landmark in Paris
   Time: 1.99s
   Metrics: {'answer_length': 20, 'contains_question_keywords': True, 'word_overlap': 0.8}

📊 FINAL EVALUATION RESULTS

--- Question 1 ---
Q: What is the capital of France?
Expected: Paris
Generated: Based on the provided context, the capital of France is Paris.
Documents used: 3
Word overlap: 0.00
Generation time: 0.83s

--- Question 2 ---
Q: Who wrote Romeo and Juliet?
Expected: William Shakespeare
Generated: Based on the provided context, the answer to your question is William Shakespeare.
Documents used: 3
Word overlap: 0.50
Gene