# LLM (Large Language Model) Testing Notebook

This notebook tests **Qwen2.5-7B-Instruct** with different quantization levels:
- 4-bit quantization (BitsAndBytes NF4)
- 8-bit quantization
- No quantization (FP16; a loader is defined, but no cell in this notebook runs it)

Focus:
- Spanish language understanding and generation
- GPU usage monitoring
- Performance comparison
- Response quality evaluation

In [None]:
# Check if running in Colab
try:
    import google.colab
    IN_COLAB = True
    print("✓ Running in Google Colab")
except:
    IN_COLAB = False
    print("✓ Running locally")

if IN_COLAB:
    print("\n" + "="*70)
    print("  GOOGLE COLAB SETUP")
    print("="*70)
    
    # Clone repository
    print("\n[1/3] Cloning repository...")
    !git clone https://github.com/YOUR_USERNAME/utec-voice-assistant.git
    
    # Change to repo directory
    import os
    os.chdir('utec-voice-assistant')
    print("✓ Repository cloned")
    
    # Install dependencies
    print("\n[2/3] Installing dependencies...")
    !pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    !pip install -q -r requirements.txt
    print("✓ Dependencies installed")
    
    # Verify GPU
    print("\n[3/3] Verifying GPU access...")
    import torch
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    
    print("\n" + "="*70)
    print("  SETUP COMPLETE!")
    print("="*70)
else:
    print("Skipping Colab setup (running locally)")

# 🚀 Google Colab Setup

**Run this section if using Google Colab. Skip if running locally.**

## 1. Setup and Imports

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
import pandas as pd
import warnings
# Suppress all Python warnings to keep the cell output short.
warnings.filterwarnings('ignore')

# Report the PyTorch build and, when a CUDA device is visible, its details.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # A single print call; sep="\n" puts each fact on its own line.
    print(
        f"CUDA version: {torch.version.cuda}",
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB",
        f"Current GPU Memory Usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB",
        sep="\n",
    )

## 2. GPU Monitoring Utilities

In [None]:
def get_gpu_memory():
    """Return current CUDA memory counters, converted to GB.

    Returns:
        A dict with keys ``"allocated"``, ``"reserved"`` and
        ``"max_allocated"``. All three are 0 when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return {"allocated": 0, "reserved": 0, "max_allocated": 0}

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    peak_gb = torch.cuda.max_memory_allocated() / bytes_per_gb
    return {
        "allocated": allocated_gb,
        "reserved": reserved_gb,
        "max_allocated": peak_gb,
    }

def print_gpu_stats(label=""):
    """Print a framed GPU memory report and return the underlying numbers.

    Args:
        label: Text appended to the report title (e.g. ``"(4-bit)"``).

    Returns:
        The dict produced by ``get_gpu_memory()``.
    """
    rule = "=" * 60
    stats = get_gpu_memory()

    print(f"\n{rule}")
    print(f"GPU Memory Stats {label}")
    print(rule)
    print(f"Allocated: {stats['allocated']:.3f} GB")
    print(f"Reserved:  {stats['reserved']:.3f} GB")
    print(f"Peak:      {stats['max_allocated']:.3f} GB")
    print(rule)

    return stats

def clear_gpu_memory():
    """Collect garbage, release cached CUDA memory and reset peak statistics.

    Intended to be called after ``del model`` and before loading another model.
    When CUDA is unavailable only the garbage collection runs; the confirmation
    message is printed in either case.
    """
    import gc  # local import keeps this cell independent of the import cell

    # Run the cycle collector first: objects that were `del`-ed but are still
    # part of reference cycles keep their tensors alive, and empty_cache() can
    # only hand back memory that no live tensor occupies.
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # Restart peak tracking so later measurements are not polluted by
        # whatever was loaded before.
        torch.cuda.reset_peak_memory_stats()
    print("✓ GPU cache cleared")

## 3. Spanish Test Questions

In [None]:
# Spanish evaluation prompts, each paired with the topic it is meant to probe.
_QUESTIONS_BY_TOPIC = (
    ("General Knowledge", "¿Cuál es la capital de Perú?"),
    ("Technology", "Explícame qué es la inteligencia artificial en términos simples."),
    ("Mathematics", "¿Cómo se calcula el área de un círculo?"),
    ("History", "¿Quién fue José de San Martín y cuál fue su importancia en la historia de Sudamérica?"),
    ("Science", "¿Por qué el cielo es azul?"),
    ("Practical advice", "Dame tres consejos para aprender programación de manera efectiva."),
    ("Culture", "¿Cuáles son algunos platos típicos de la gastronomía peruana?"),
    ("Problem solving", "Si tengo 100 dólares y quiero ahorrar 20% cada mes, ¿cuánto habré ahorrado en 6 meses?"),
    ("Creative", "Escribe un haiku sobre la tecnología."),
    ("Complex reasoning", "¿Cuáles son las ventajas y desventajas de la energía solar?"),
)

# Flat list of prompts, in the order above; this is what the test cells iterate.
SPANISH_TEST_QUESTIONS = [prompt for _topic, prompt in _QUESTIONS_BY_TOPIC]

# System message sent with every question (asks for concise answers in Spanish).
SYSTEM_PROMPT = (
    "Eres un asistente de voz útil y amigable. "
    "Responde de manera concisa, clara y en español."
)

print(f"Loaded {len(SPANISH_TEST_QUESTIONS)} test questions")

## 4. Model Loading Functions

In [None]:
# Hugging Face Hub identifier that every loader below uses by default.
DEFAULT_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"


def _load_llm(label, description, model_id, **model_kwargs):
    """Load a causal LM and its tokenizer, then print GPU memory statistics.

    Shared implementation behind the three public loaders so the model ID,
    tokenizer setup and reporting live in one place.

    Args:
        label: Short precision tag used in status output, e.g. ``"4-bit"``.
        description: Precision description printed while loading.
        model_id: Hugging Face Hub model identifier.
        **model_kwargs: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained`` (a quantization config
            or a dtype).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    # Print the name without the organisation prefix, so the default model is
    # reported as "Qwen2.5-7B-Instruct".
    short_name = model_id.split("/")[-1]
    print(f"\nLoading {short_name} with {description}...")
    clear_gpu_memory()

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; kept as in the original -- confirm it is
    # still required for this model.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        **model_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # generate_response() pads its inputs, so fall back to EOS when the
    # tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"✓ {label} model loaded")
    print_gpu_stats(f"({label})")

    return model, tokenizer


def load_llm_4bit(model_id=DEFAULT_MODEL_ID):
    """Load the model with 4-bit NF4 BitsAndBytes quantization.

    Args:
        model_id: Hugging Face Hub model identifier
            (defaults to Qwen2.5-7B-Instruct).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    return _load_llm(
        "4-bit",
        "4-bit quantization",
        model_id,
        quantization_config=quantization_config,
    )


def load_llm_8bit(model_id=DEFAULT_MODEL_ID):
    """Load the model with 8-bit BitsAndBytes quantization.

    Args:
        model_id: Hugging Face Hub model identifier
            (defaults to Qwen2.5-7B-Instruct).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    return _load_llm(
        "8-bit",
        "8-bit quantization",
        model_id,
        quantization_config=quantization_config,
    )


def load_llm_fp16(model_id=DEFAULT_MODEL_ID):
    """Load the model in FP16 without quantization.

    Args:
        model_id: Hugging Face Hub model identifier
            (defaults to Qwen2.5-7B-Instruct).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    return _load_llm(
        "FP16",
        "FP16 (no quantization)",
        model_id,
        torch_dtype=torch.float16,
    )

## 5. Generation Function

In [None]:
def generate_response(model, tokenizer, prompt, system_prompt=SYSTEM_PROMPT, max_new_tokens=512):
    """Generate one sampled chat response and measure its cost.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer with a chat template.
        prompt: User message.
        system_prompt: System message placed before the user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict with keys:
            ``response``: decoded text of the newly generated tokens, stripped.
            ``inference_time``: seconds spent inside ``model.generate``.
            ``num_tokens``: number of newly generated tokens.
            ``tokens_per_second``: ``num_tokens / inference_time`` (0 if the
                elapsed time is 0).
            ``gpu_memory_allocated``: allocated GPU memory after generation, GB.
            ``gpu_memory_peak``: peak allocated GPU memory since the reset done
                just before generation, GB.
    """
    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]

    # Render the conversation to text, ending with the assistant turn opener.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # NOTE(review): truncating at 2048 tokens would presumably cut the END of
    # the templated prompt (including the generation prompt). Harmless for the
    # short questions used here -- confirm before feeding long inputs.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=2048)
    inputs = inputs.to(model.device)

    # Reset peak tracking so "gpu_memory_peak" reflects this call only, and
    # drain pending GPU work so it is not billed to this generation.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    # perf_counter() is a monotonic, high-resolution clock; time.time() can
    # jump when the system clock is adjusted.
    start_time = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # CUDA work is queued asynchronously; wait for it before stopping the clock.
    if use_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Keep only the tokens produced after the prompt.
    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Calculate metrics
    inference_time = end_time - start_time
    num_tokens = len(generated_ids)
    tokens_per_second = num_tokens / inference_time if inference_time > 0 else 0

    # GPU stats
    gpu_stats = get_gpu_memory()

    return {
        "response": response,
        "inference_time": inference_time,
        "num_tokens": num_tokens,
        "tokens_per_second": tokens_per_second,
        "gpu_memory_allocated": gpu_stats["allocated"],
        "gpu_memory_peak": gpu_stats["max_allocated"]
    }

## 6. Test with 4-bit Quantization (Recommended)

In [None]:
# Load the 4-bit (NF4) model and its tokenizer; the test cell below and the
# interactive and cleanup cells refer to these two names.
model_4bit, tokenizer_4bit = load_llm_4bit()

In [None]:
# Run every Spanish question through the 4-bit model and keep one record
# (question + metrics) per answer in `results_4bit`.
results_4bit = []
banner = "=" * 70

print("\n" + banner)
print("Testing 4-bit Model with Spanish Questions")
print(banner)

total_questions = len(SPANISH_TEST_QUESTIONS)
for idx, question in enumerate(SPANISH_TEST_QUESTIONS, start=1):
    print(f"\n[{idx}/{total_questions}] Question: {question}")

    metrics = generate_response(model_4bit, tokenizer_4bit, question, max_new_tokens=200)

    print(f"\nResponse: {metrics['response']}")
    print(f"\n⏱️  Time: {metrics['inference_time']:.2f}s")
    print(f"🚀 Tokens/s: {metrics['tokens_per_second']:.2f}")
    print(f"💾 GPU Memory: {metrics['gpu_memory_allocated']:.3f} GB")

    # Question first, then every metric returned by generate_response().
    record = {"question": question}
    record.update(metrics)
    results_4bit.append(record)

    print("-" * 70)

print("\n✓ 4-bit testing complete!")

## 7. Test with 8-bit Quantization (Optional)

In [None]:
# Free memory from the 4-bit model. Guarded (same pattern as the cleanup cell)
# so this cell can be re-run, or run without section 6, without a NameError.
if 'model_4bit' in locals():
    del model_4bit, tokenizer_4bit

# On a re-run an 8-bit model is already loaded; drop it so the reload below
# does not load a second copy while the first is still referenced.
if 'model_8bit' in locals():
    del model_8bit, tokenizer_8bit

clear_gpu_memory()

# Load 8-bit model
model_8bit, tokenizer_8bit = load_llm_8bit()

In [None]:
# Run the first five Spanish questions through the 8-bit model and keep one
# record (question + metrics) per answer in `results_8bit`.
results_8bit = []
test_questions_subset = SPANISH_TEST_QUESTIONS[:5]  # Test first 5 questions
banner = "=" * 70

print("\n" + banner)
print("Testing 8-bit Model with Spanish Questions")
print(banner)

subset_size = len(test_questions_subset)
for idx, question in enumerate(test_questions_subset, start=1):
    print(f"\n[{idx}/{subset_size}] Question: {question}")

    metrics = generate_response(model_8bit, tokenizer_8bit, question, max_new_tokens=200)

    print(f"\nResponse: {metrics['response']}")
    print(f"\n⏱️  Time: {metrics['inference_time']:.2f}s")
    print(f"🚀 Tokens/s: {metrics['tokens_per_second']:.2f}")
    print(f"💾 GPU Memory: {metrics['gpu_memory_allocated']:.3f} GB")

    # Question first, then every metric returned by generate_response().
    record = {"question": question}
    record.update(metrics)
    results_8bit.append(record)

    print("-" * 70)

print("\n✓ 8-bit testing complete!")

## 8. Performance Analysis

In [None]:
# Analyze 4-bit performance: one DataFrame row per answered question.
df_4bit = pd.DataFrame(results_4bit)

print("\n" + "="*70)
print("4-BIT MODEL PERFORMANCE SUMMARY")
print("="*70)

print(f"\nAverage Inference Time: {df_4bit['inference_time'].mean():.2f}s")
print(f"Average Tokens/Second: {df_4bit['tokens_per_second'].mean():.2f}")
print(f"Average GPU Memory: {df_4bit['gpu_memory_allocated'].mean():.3f} GB")
print(f"Peak GPU Memory: {df_4bit['gpu_memory_peak'].max():.3f} GB")
print(f"Average Response Length: {df_4bit['num_tokens'].mean():.0f} tokens")

print("\n" + "="*70)
print("Response Quality (Sample)")
print("="*70)

# Show up to the first three answers; slicing already copes with shorter lists.
for entry in results_4bit[:3]:
    answer = entry['response']
    # Add the ellipsis only when the answer was actually cut at 200 characters,
    # so complete short answers are not shown as if they were truncated.
    preview = answer if len(answer) <= 200 else answer[:200] + "..."
    print(f"\nQ: {entry['question']}")
    print(f"A: {preview}")
    print("-" * 70)

## 9. Comparison (if 8-bit was tested)

In [None]:
# `results_8bit` exists only when the optional 8-bit section was run. Look it
# up defensively so that skipping that section reaches the `else` branch
# instead of raising NameError. Relies on `df_4bit` from the analysis cell.
if globals().get('results_8bit'):
    df_8bit = pd.DataFrame(results_8bit)

    print("\n" + "="*70)
    print("QUANTIZATION COMPARISON")
    print("="*70)

    # One row per metric, one column per quantization level.
    comparison = pd.DataFrame({
        "Metric": [
            "Avg Inference Time (s)",
            "Avg Tokens/Second",
            "Avg GPU Memory (GB)",
            "Peak GPU Memory (GB)"
        ],
        "4-bit": [
            df_4bit['inference_time'].mean(),
            df_4bit['tokens_per_second'].mean(),
            df_4bit['gpu_memory_allocated'].mean(),
            df_4bit['gpu_memory_peak'].max()
        ],
        "8-bit": [
            df_8bit['inference_time'].mean(),
            df_8bit['tokens_per_second'].mean(),
            df_8bit['gpu_memory_allocated'].mean(),
            df_8bit['gpu_memory_peak'].max()
        ]
    })

    print("\n", comparison.to_string(index=False))

    print("\n" + "="*70)
    print("RECOMMENDATION")
    print("="*70)
    print("\nFor voice assistant with <12GB VRAM:")
    print("✅ Use 4-bit quantization")
    print(f"   - Lower memory footprint (~{df_4bit['gpu_memory_peak'].max():.1f}GB)")
    print(f"   - Good performance (~{df_4bit['tokens_per_second'].mean():.1f} tokens/s)")
    print("   - Leaves room for ASR and TTS models")
else:
    print("\n8-bit comparison skipped.")

## 10. Interactive Testing

In [None]:
# Interactive chat (using 4-bit model)
# Reload if needed
if 'model_4bit' not in locals():
    # The 8-bit model from section 7 may still be loaded; release it first so
    # the two models do not have to fit on the GPU at the same time.
    if 'model_8bit' in locals():
        del model_8bit, tokenizer_8bit
    model_4bit, tokenizer_4bit = load_llm_4bit()

# Custom question
custom_question = "¿Cuáles son las mejores prácticas para crear un asistente de voz?"

print(f"\nUser: {custom_question}")
result = generate_response(model_4bit, tokenizer_4bit, custom_question, max_new_tokens=300)

print(f"\nAssistant: {result['response']}")
print(f"\n⏱️  Time: {result['inference_time']:.2f}s")
print(f"🚀 Tokens/s: {result['tokens_per_second']:.2f}")
print(f"💾 GPU Memory: {result['gpu_memory_allocated']:.3f} GB")

## 11. Cleanup

In [None]:
# Drop whichever model/tokenizer pairs are still defined, then clear the CUDA
# cache and print the resulting memory statistics.
if 'model_8bit' in globals():
    del model_8bit, tokenizer_8bit

if 'model_4bit' in globals():
    del model_4bit, tokenizer_4bit

clear_gpu_memory()
print_gpu_stats("(After cleanup)")

## Summary

This notebook tested Qwen2.5-7B-Instruct for Spanish language understanding:

**Key Findings:**
- 4-bit quantization provides an excellent balance of quality and memory efficiency
- Spanish language support is robust across various question types
- Memory usage fits within the 12 GB constraint when combined with ASR and TTS
- Inference speed is suitable for real-time voice assistant applications

**Recommendation:** Use 4-bit quantization for the voice assistant project.