# Dual Model Chat Interface
This notebook provides interfaces to both:
1. **DeepSeek-V2-Lite-Chat** (PyTorch/Transformers) - Unquantized model loaded in bfloat16
2. **L3.1-MOE-13.7B** (GGUF/llama.cpp) - Quantized model loaded in-process via llama-cpp-python

Use this for comparing model responses and testing different approaches.

## Setup and Imports

In [1]:
import torch
import json
import requests
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd

# Release cached CUDA allocator memory, then report which GPU (if any) is visible.
torch.cuda.empty_cache()
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Name is reported for the current device; total memory is read from device 0.
    print(f"GPU: {torch.cuda.get_device_name()}")
    device_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {device_props.total_memory / 1024**3:.1f} GB")

CUDA available: True
GPU: NVIDIA GeForce RTX 5080
Memory: 15.9 GB


## Load Sample Data

In [2]:
def load_sample_data(path='/home/lmx/EchoPersona/alpaca_sample.jsonl'):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file. Defaults to the sample file
            this notebook was written against, so ``load_sample_data()``
            keeps working unchanged.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSONL files often end with a blank line (or contain stray ones);
    # json.loads("") would raise, so whitespace-only lines are skipped.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

sample_data = load_sample_data()
print(f"Loaded {len(sample_data)} samples")

# Preview the first three records; the optional input is shown only when non-empty,
# and the expected output is cut to its first 100 characters.
for number, record in enumerate(sample_data[:3], start=1):
    print(f"\n--- Sample {number} ---")
    print(f"Instruction: {record['instruction']}")
    if record['input']:
        print(f"Input: {record['input']}")
    output_preview = record['output'][:100]
    print(f"Expected Output: {output_preview}...")

Loaded 1000 samples

--- Sample 1 ---
Instruction: Give three tips for staying healthy.
Expected Output: 1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and...

--- Sample 2 ---
Instruction: What are the three primary colors?
Expected Output: The three primary colors are red, blue, and yellow. These colors are called primary because they can...

--- Sample 3 ---
Instruction: Describe the structure of an atom.
Expected Output: An atom is the basic building block of all matter and is made up of three types of particles: proton...


## Model 1: DeepSeek-V2-Lite-Chat (Transformers)

In [3]:
# Load the DeepSeek tokenizer and model from the local checkpoint directory.
model_dir = "/home/lmx/EchoPersona/models/DeepSeek-V2-Lite-Chat"

print("Loading DeepSeek tokenizer and model...")
deepseek_tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# Weights are requested in bfloat16 with automatic device placement; the recorded
# output of this cell shows some parameters ended up offloaded to the CPU.
_deepseek_load_options = {
    "trust_remote_code": True,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
deepseek_model = AutoModelForCausalLM.from_pretrained(model_dir, **_deepseek_load_options)

print("DeepSeek model loaded successfully")

Loading DeepSeek tokenizer and model...




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


DeepSeek model loaded successfully


In [None]:
def chat_with_deepseek(instruction, input_text="", max_tokens=512, temperature=0.8):
    """Generate a reply from the DeepSeek model for one instruction.

    Args:
        instruction: The user request.
        input_text: Optional extra context; when non-empty it is appended to
            the instruction under an "Input:" label.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. A value of 0 or less (or None)
            switches to greedy decoding instead of sampling.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        leading and trailing whitespace.
    """
    # NOTE(review): these role markers are hand-written; confirm they match the
    # model's own chat template (see tokenizer.apply_chat_template).
    if input_text:
        prompt = f"<|user|>{instruction}\n\nInput: {input_text}<|assistant|>"
    else:
        prompt = f"<|user|>{instruction}<|assistant|>"

    inputs = deepseek_tokenizer(prompt, return_tensors="pt").to(deepseek_model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    generation_options = {
        "max_new_tokens": max_tokens,
        "repetition_penalty": 1.1,
        "eos_token_id": deepseek_tokenizer.eos_token_id,
        "pad_token_id": deepseek_tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_options.update(do_sample=True, temperature=temperature, top_p=0.95)
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to deterministic decoding instead of raising.
        generation_options["do_sample"] = False

    with torch.no_grad():
        outputs = deepseek_model.generate(**inputs, **generation_options)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # token count rather than by len(prompt): decoding with
    # skip_special_tokens=True can change the decoded prefix, so a
    # character-based slice may clip the reply or leave prompt text in it.
    generated_tokens = outputs[0][prompt_token_count:]
    response = deepseek_tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    return response

# Smoke-test prompt; the MOE test cell reuses `test_instruction`.
test_instruction = "Explain quantum computing in simple terms"
deepseek_response = chat_with_deepseek(instruction=test_instruction)
print(f"DeepSeek Response:\n{deepseek_response}")

DeepSeek Response:
Love can be complex and subjective, as it means different things to different people. However, here's a general definition:
* Love is a deep, emotional connection or attachment towards someone or something that inspires fond feelings and care.* Love often involves strong affection, compassion, and kindness towards another person or entity.* It can also involve the desire for an individual to protect, support, and feel connected with the one who is loved.* In many cultures around the world, love plays a central role in relationships between partners, family members, friends, and even communities.


## Model 2: L3.1-MOE-13.7B (Direct GGUF Access via llama-cpp-python)

In [None]:
def chat_with_moe_direct(instruction, input_text="", max_tokens=256, temperature=0.8):
    """Generate a reply from the MOE GGUF model through llama-cpp-python.

    The model is loaded on the first call and cached on the function object
    as ``chat_with_moe_direct.llm``; later calls reuse that instance.

    Args:
        instruction: The user request.
        input_text: Optional extra context, added under an "Input:" label.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature passed to the completion call.

    Returns:
        The stripped completion text. Failures are not raised: the function
        returns a string starting with "Error loading MOE model:" when the
        import or model load fails, or "Error generating MOE response:" when
        the completion call fails.
    """
    model_path = "/home/lmx/EchoPersona/models/L3.1-MOE-13.7B/moe13b-q4ks.gguf"

    if not hasattr(chat_with_moe_direct, 'llm'):
        try:
            # Imported lazily so the rest of the notebook still works when
            # llama-cpp-python is missing, and only when a load is needed.
            from llama_cpp import Llama

            print("Loading MOE model directly...")
            chat_with_moe_direct.llm = Llama(
                model_path=model_path,
                n_ctx=4096,
                n_gpu_layers=-1,
                n_batch=512,
                verbose=False,
            )
            print("MOE model loaded successfully")
        except Exception as e:
            return f"Error loading MOE model: {e}"

    if input_text:
        prompt = f"Instruction: {instruction}\n\nInput: {input_text}\n\nResponse:"
    else:
        prompt = f"Instruction: {instruction}\n\nResponse:"

    try:
        response = chat_with_moe_direct.llm.create_completion(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            stream=False,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # Reported separately from load failures so the message is accurate.
        return f"Error generating MOE response: {e}"

def check_moe_server():
    """Compatibility shim: announce direct model access and report no server.

    Returns:
        Always ``False``; no server is contacted.
    """
    server_available = False
    print("Using direct MOE model access (bypassing server)")
    return server_available

# Always False here (check_moe_server never contacts a server); presumably kept
# for compatibility with an earlier server-based flow. No cell visible in this
# notebook reads the flag.
server_running = check_moe_server()

❌ MOE Server not responding
Start it with: python /home/lmx/EchoPersona/moe/server.py


In [None]:
# The MOE model is accessed in-process, so this smoke test runs unconditionally
# (the previous `if True:` wrapper had an unreachable else branch).
moe_response = chat_with_moe_direct(test_instruction)
print(f"MOE Response:\n{moe_response}")

## Compare Both Models

In [None]:
def compare_models(instruction, input_text="", max_tokens=256):
    """Print the query followed by each model's answer, separated by rules.

    Args:
        instruction: The prompt sent to both models.
        input_text: Optional extra context; echoed and forwarded when non-empty.
        max_tokens: Generation limit forwarded to both chat helpers.
    """
    heavy_rule = "\n" + "=" * 80
    light_rule = "-" * 40

    # Header: the query, the optional input, then a divider.
    header_lines = [f"🔥 Query: {instruction}"]
    if input_text:
        header_lines.append(f"📝 Input: {input_text}")
    header_lines.append(heavy_rule)
    print(*header_lines, sep="\n")

    # Each banner is printed before its model call so it appears while generation runs.
    print("\n🤖 DeepSeek-V2-Lite-Chat (PyTorch):", light_rule, sep="\n")
    print(chat_with_deepseek(instruction, input_text, max_tokens), heavy_rule, sep="\n")

    print("\n🚀 L3.1-MOE-13.7B (GGUF):", light_rule, sep="\n")
    print(chat_with_moe_direct(instruction, input_text, max_tokens), heavy_rule, sep="\n")

# Demo: one side-by-side comparison with the default token limit.
compare_models(instruction="What are the key differences between Python and JavaScript?")

## Batch Testing with Sample Data

In [None]:
# Test multiple samples
def batch_test(num_samples=3):
    """Run the side-by-side comparison on the leading entries of ``sample_data``.

    Args:
        num_samples: How many samples to run; capped at the number loaded.
    """
    banner_bar = '=' * 20
    limit = min(num_samples, len(sample_data))
    for position in range(1, limit + 1):
        record = sample_data[position - 1]
        print(f"\n{banner_bar} SAMPLE {position} {banner_bar}")
        compare_models(record['instruction'], record['input'], max_tokens=200)
        print("\n" + "=" * 60 + "\n")

# Run the batch comparison on the first 2 samples only.
batch_test(num_samples=2)

## Interactive Chat

In [None]:
def interactive_chat():
    """Read prompts from the user in a loop and compare both models on each.

    The loop ends when the user types ``quit``, ``exit`` or ``q`` (case and
    surrounding whitespace are ignored), or when input ends or is interrupted.
    Blank entries are skipped rather than sent to the models.
    """
    print("🤖 Dual Model Chat - Type 'quit' to exit\n")

    while True:
        try:
            user_input = input("\n👤 You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel: leave the loop cleanly
            # instead of surfacing a traceback.
            print()
            break

        if not user_input:
            continue
        if user_input.lower() in ('quit', 'exit', 'q'):
            break

        compare_models(user_input, max_tokens=300)

# interactive_chat()

## Memory Management

In [None]:
# Clean up GPU memory
def cleanup_memory():
    """Drop the DeepSeek model/tokenizer globals and release cached GPU memory.

    Each global is removed independently, so a missing ``deepseek_model`` no
    longer prevents ``deepseek_tokenizer`` from being removed. Calling this
    when neither exists is a no-op apart from the collection and the message.
    """
    import gc

    module_globals = globals()
    for name in ("deepseek_model", "deepseek_tokenizer"):
        # pop with a default replaces the old bare `except: pass`, which
        # swallowed every error, not only the "name not defined" case.
        module_globals.pop(name, None)

    # Collect first so unreferenced objects are gone before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ Memory cleaned up")

# Uncomment to clean up memory
# cleanup_memory()