# Dual Model Chat Interface
This notebook provides interfaces to both:
1. **DeepSeek-V2-Lite-Chat** (PyTorch/Transformers) - Unquantized model loaded in bfloat16
2. **L3.1-MOE-13.7B** (GGUF/llama.cpp) - Quantized model via API

Use this for comparing model responses and testing different approaches.

## Setup and Imports

In [None]:
import torch
import json
import requests
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd

# Ask PyTorch to release any cached GPU memory before the models are loaded,
# then report what hardware is visible to this kernel.
torch.cuda.empty_cache()
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # No index is passed to get_device_name(), while the memory figure is read
    # explicitly from device 0.
    print(f"GPU: {torch.cuda.get_device_name()}")
    # total_memory is divided by 1024**3, so the value is in GiB even though
    # the label says "GB".
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

## Load Sample Data

In [None]:
# Load alpaca sample data
def load_sample_data(path='/home/lmx/EchoPersona/alpaca_sample.jsonl'):
    """Load instruction samples from a JSON Lines file.

    Args:
        path: Location of the ``.jsonl`` file. Defaults to the Alpaca sample
            file this notebook was written against.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line at EOF) are skipped rather
        # than handed to json.loads, which would raise on an empty string.
        return [json.loads(line) for line in f if line.strip()]

# Load the whole sample file once; later cells (batch_test) index into this list.
sample_data = load_sample_data()
print(f"Loaded {len(sample_data)} samples")

# Preview the first three records. Each record is accessed with the keys
# 'instruction', 'input' and 'output'.
for i, sample in enumerate(sample_data[:3]):
    print(f"\n--- Sample {i+1} ---")
    print(f"Instruction: {sample['instruction']}")
    # 'input' is only printed when it is non-empty.
    if sample['input']:
        print(f"Input: {sample['input']}")
    # Only the first 100 characters of the reference answer are shown.
    print(f"Expected Output: {sample['output'][:100]}...")

## Model 1: DeepSeek-V2-Lite-Chat (Transformers)

In [None]:
# Load the DeepSeek tokenizer and model from a local directory.
# The resulting globals (deepseek_tokenizer, deepseek_model) are used by
# chat_with_deepseek() and released by cleanup_memory().
model_dir = "/home/lmx/EchoPersona/models/DeepSeek-V2-Lite-Chat"

print("Loading DeepSeek tokenizer and model...")
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint to run — confirm the model directory comes from a trusted source.
deepseek_tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# Weights are loaded as bfloat16; device_map="auto" leaves device placement
# to the library.
deepseek_model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

print("✅ DeepSeek model loaded successfully")

In [None]:
def chat_with_deepseek(instruction, input_text="", max_tokens=512, temperature=0.8):
    """Generate a reply from the locally loaded DeepSeek model.

    Relies on the notebook globals ``deepseek_tokenizer`` and
    ``deepseek_model`` having been created by the model-loading cell.

    Args:
        instruction: The user's request.
        input_text: Optional extra context; when non-empty it is appended to
            the instruction under an ``Input:`` label.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to ``generate``.

    Returns:
        The generated continuation as a string, with special tokens removed
        and surrounding whitespace stripped. The prompt is not included.
    """
    # NOTE(review): the prompt uses literal <|user|>/<|assistant|> markers;
    # confirm these match the chat template this checkpoint expects.
    if input_text:
        prompt = f"<|user|>{instruction}\n\nInput: {input_text}<|assistant|>"
    else:
        prompt = f"<|user|>{instruction}<|assistant|>"

    inputs = deepseek_tokenizer(prompt, return_tensors="pt").to(deepseek_model.device)
    # Remember how many tokens the prompt occupies so the reply can be cut out
    # by token position rather than by character count.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = deepseek_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            repetition_penalty=1.1,
            eos_token_id=deepseek_tokenizer.eos_token_id,
            pad_token_id=deepseek_tokenizer.eos_token_id
        )

    # The output sequence starts with the prompt tokens. Slicing them off
    # before decoding avoids the previous approach of trimming len(prompt)
    # characters from the decoded text, which breaks whenever decoding with
    # skip_special_tokens=True does not reproduce the prompt verbatim.
    generated_tokens = outputs[0][prompt_length:]
    return deepseek_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Smoke-test the DeepSeek path with a single instruction.
# test_instruction is reused by the MOE smoke-test cell further down.
test_instruction = "Give three tips for staying healthy."
deepseek_response = chat_with_deepseek(test_instruction)
print(f"DeepSeek Response:\n{deepseek_response}")

## Model 2: L3.1-MOE-13.7B (via API Server)

In [None]:
def chat_with_moe_api(instruction, input_text="", max_tokens=256, temperature=0.8, api_url="http://localhost:7860"):
    """Request a completion from the MOE model's HTTP server.

    Args:
        instruction: The user's request.
        input_text: Optional extra context; when non-empty it is added to the
            prompt under an ``Input:`` label.
        max_tokens: Value sent as ``max_tokens`` in the request payload.
        temperature: Value sent as ``temperature`` in the request payload.
        api_url: Base URL of the server; a trailing slash is tolerated.

    Returns:
        The stripped completion text on success. On any failure a
        human-readable string starting with ``"API Error:"`` or
        ``"Connection Error:"`` is returned instead of raising.
    """
    if input_text:
        prompt = f"Instruction: {instruction}\n\nInput: {input_text}\n\nResponse:"
    else:
        prompt = f"Instruction: {instruction}\n\nResponse:"

    payload = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False
    }

    # Only the network call sits in this try, so that a malformed body is not
    # misreported as a connection problem.
    try:
        response = requests.post(f"{api_url.rstrip('/')}/v1/completions", json=payload, timeout=30)
    except requests.exceptions.RequestException as e:
        return f"Connection Error: {e}\n\nMake sure the MOE server is running:\npython /home/lmx/EchoPersona/moe/server.py"

    if response.status_code != 200:
        return f"API Error: {response.status_code} - {response.text}"

    try:
        data = response.json()
    except ValueError:
        return f"API Error: invalid JSON in response - {response.text}"

    # Primary shape: {"text": "..."}. As a fallback, also accept the
    # OpenAI-style {"choices": [{"text": "..."}]} layout.
    text = None
    if isinstance(data, dict):
        text = data.get("text")
        if text is None:
            choices = data.get("choices") or []
            if choices and isinstance(choices[0], dict):
                text = choices[0].get("text")

    if not isinstance(text, str):
        return f"API Error: unexpected response format - {response.text}"
    return text.strip()

# Test API connection
def check_moe_server(api_url="http://localhost:7860"):
    """Report whether the MOE server answers its health endpoint.

    Sends ``GET {api_url}/healthz`` and prints a short status report.

    Args:
        api_url: Base URL of the server, matching the ``api_url`` parameter
            of ``chat_with_moe_api``; a trailing slash is tolerated.

    Returns:
        True when the endpoint answers with status 200 and a JSON object,
        False otherwise. Never raises for network or parsing failures.
    """
    try:
        response = requests.get(f"{api_url.rstrip('/')}/healthz", timeout=5)
    except requests.exceptions.RequestException:
        print("❌ MOE Server not responding")
        print("Start it with: python /home/lmx/EchoPersona/moe/server.py")
        return False

    if response.status_code != 200:
        print(f"❌ Server responded with {response.status_code}")
        return False

    try:
        info = response.json()
    except ValueError:
        info = None
    if not isinstance(info, dict):
        print("❌ Server returned an unreadable health response")
        return False

    print("✅ MOE Server is running")
    # Missing fields are shown as 'unknown' instead of raising KeyError.
    print(f"Model: {info.get('model_path', 'unknown')}")
    print(f"Context: {info.get('n_ctx', 'unknown')}")
    return True

# Probe the server once and keep the result; compare_models() reads this flag,
# so re-run this cell after starting or stopping the server.
server_running = check_moe_server()

In [None]:
# Smoke-test the MOE path with the same instruction used for DeepSeek,
# but only if the earlier health check succeeded.
if server_running:
    moe_response = chat_with_moe_api(test_instruction)
    print(f"MOE Response:\n{moe_response}")
else:
    print("Skipping MOE test - server not running")

## Compare Both Models

In [None]:
def compare_models(instruction, input_text="", max_tokens=256):
    """Print the answers of both models to one prompt, one after the other.

    The DeepSeek answer is always generated. The MOE answer is requested only
    when the notebook-level ``server_running`` flag is truthy; otherwise a
    hint on how to start the server is printed. Nothing is returned.

    Args:
        instruction: The request sent to both models.
        input_text: Optional extra context forwarded to both models.
        max_tokens: Generation limit forwarded to both models.
    """
    section_break = "\n" + "=" * 80
    underline = "-" * 40

    # Echo the request.
    print(f"🔥 Instruction: {instruction}")
    if input_text:
        print(f"📝 Input: {input_text}")
    print(section_break)

    # Local Transformers model.
    print("\n🤖 DeepSeek-V2-Lite-Chat (PyTorch):")
    print(underline)
    print(chat_with_deepseek(instruction, input_text, max_tokens))

    print(section_break)

    # Remote GGUF model behind the HTTP API.
    print("\n🚀 L3.1-MOE-13.7B (GGUF):")
    print(underline)
    if not server_running:
        print("❌ Server not running. Start with: python /home/lmx/EchoPersona/moe/server.py")
    else:
        print(chat_with_moe_api(instruction, input_text, max_tokens))

    print(section_break)

# Run one side-by-side comparison with the default max_tokens and no extra input.
compare_models("Explain the concept of machine learning in simple terms.")

## Batch Testing with Sample Data

In [None]:
# Test multiple samples
def batch_test(num_samples=3):
    """Run the side-by-side comparison on the first few loaded samples.

    Reads from the notebook-level ``sample_data`` list and prints the results
    via ``compare_models``; nothing is returned.

    Args:
        num_samples: Maximum number of samples to process. Values larger than
            the number of loaded samples are capped; zero or negative values
            process nothing.
    """
    # max(..., 0) keeps a negative count from turning into a "drop from the
    # end" slice, so it still means "no samples".
    for number, sample in enumerate(sample_data[:max(num_samples, 0)], start=1):
        print(f"\n{'='*20} SAMPLE {number} {'='*20}")
        # Records without an 'input' field (or with a null one) are treated
        # as having no extra context instead of raising KeyError.
        compare_models(sample['instruction'], sample.get('input') or '', max_tokens=200)
        print("\n" + "="*60 + "\n")

# Run the batch comparison on the first 2 loaded samples.
batch_test(2)  # Test first 2 samples

## Interactive Chat

In [None]:
# Interactive chat function
def interactive_chat():
    """Run a prompt loop that sends each typed line to both models.

    Each non-empty line is passed to ``compare_models`` with a 300-token
    limit. The loop ends when the user types ``quit``, ``exit`` or ``q``
    (case-insensitive, surrounding whitespace ignored), or when input ends
    or is interrupted. Nothing is returned.
    """
    print("🤖 Interactive Dual Model Chat")
    print("Type 'quit' to exit\n")

    while True:
        try:
            user_input = input("\n👤 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            print()
            break

        # An empty line would only make both models answer an empty prompt.
        if not user_input:
            continue
        if user_input.lower() in ('quit', 'exit', 'q'):
            break

        compare_models(user_input, max_tokens=300)

# Uncomment to start interactive chat
# interactive_chat()

## Memory Management

In [None]:
# Clean up GPU memory
def cleanup_memory():
    """Drop the DeepSeek globals and ask PyTorch to release cached GPU memory.

    Removes ``deepseek_model`` and ``deepseek_tokenizer`` from the module
    namespace if they exist, runs the garbage collector, and calls
    ``torch.cuda.empty_cache()``. Safe to call repeatedly or before the
    model was ever loaded. Nothing is returned.
    """
    import gc

    # Remove each name independently: a missing model must not prevent the
    # tokenizer from being released, and no unrelated exception is swallowed.
    namespace = globals()
    for name in ("deepseek_model", "deepseek_tokenizer"):
        namespace.pop(name, None)

    # Collect first so the tensors are actually freed before the cache is
    # emptied.
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ Memory cleaned up")

# Uncomment to clean up memory
# cleanup_memory()