In [None]:
import os
# Auto-generated setup for portability
if 'google.colab' in str(get_ipython()):
    # Assume data is mounted or downloaded to current dir in Colab
    BASE_DIR = os.getcwd()
else:
    # Local execution
    BASE_DIR = os.getcwd()


In [3]:
# Complete MoE System with System Prompts for Each Expert
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

class MoERouterWithPrompts:
    def __init__(
        self,
        router_path="Classifier",
        base_model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
        math_adapter_path="Final-Dynamic-Model/final_model(Math)",
        code_adapter_path="Final-Dynamic-Model/final_model(Code)",
        device="cuda"
    ):
        # System prompts for each expert
        self.MATH_PROMPT = (
            "You are a helpful assistant who thinks step by step through problems. "
            "When solving questions, show your reasoning process clearly using <think> tags, "
            "work through each step methodically, and then provide a clear final answer."
        )

        self.CODE_PROMPT = (
            "You are a Python code generator. Output ONLY the function body (not the def line). "
            "Start the first line with 4 spaces. Do NOT include explanations, comments, or markdown fences. "
            "If you include <think>, place it before the code, but the final output must remain only the function body."
        )

        print("=" * 80)
        print("INITIALIZING MoE SYSTEM WITH EXPERT PROMPTS")
        print("=" * 80)

        # 1. Load Router
        print("\n📊 Loading router...")
        self.router = pipeline(
            "text-classification",
            model=router_path,
            device=0 if device == "cuda" else -1,
            max_length=128,
            truncation=True
        )
        print("✓ Router loaded")

        # 2. Load Base Model
        print("\n🧠 Loading base model...")
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        print(f"✓ Base model loaded: {base_model_name}")

        # 3. Load Math Adapter
        print("\n🔢 Loading Math adapter...")
        self.math_model = PeftModel.from_pretrained(
            self.base_model,
            math_adapter_path
        )
        print(f"✓ Math adapter loaded")

        # 4. Load Code Adapter
        print("\n💻 Loading Code adapter...")
        self.code_model = PeftModel.from_pretrained(
            self.base_model,
            code_adapter_path
        )
        print(f"✓ Code adapter loaded")

        self.device = device

        print("\n" + "=" * 80)
        print("✅ MoE SYSTEM READY WITH EXPERT PROMPTS!")
        print("=" * 80)

    def format_prompt(self, query, category):
        """Format query with appropriate system prompt based on category"""
        if category == "math":
            # Math format: System prompt + user query
            formatted = f"{self.MATH_PROMPT}\n\nUser: {query}\n\nAssistant:"
        elif category == "code":
            # Code format: System prompt + user query
            formatted = f"{self.CODE_PROMPT}\n\nUser: {query}\n\nAssistant:"
        else:
            # General format: just the query
            formatted = f"User: {query}\n\nAssistant:"

        return formatted

    def route(self, query):
        """Route query to appropriate expert"""
        result = self.router(query)[0]
        category = result['label'].lower()
        confidence = result['score']

        print(f"\n🔀 Router Decision: {category.upper()} (confidence: {confidence:.0%})")
        return category, confidence

    def generate(self, query, max_new_tokens=512, temperature=0.7, do_sample=True):
        """Generate response using routed expert with proper prompting"""

        print("\n" + "=" * 80)
        print(f"USER QUERY: {query}")
        print("=" * 80)

        # Step 1: Route the query
        category, confidence = self.route(query)

        # Step 2: Format prompt with system prompt
        formatted_prompt = self.format_prompt(query, category)

        # Step 3: Select expert model
        if category == "math":
            print("🔢 Using Math expert with step-by-step reasoning prompt...")
            model = self.math_model
        elif category == "code":
            print("💻 Using Code expert with function body generation prompt...")
            model = self.code_model
        else:
            print("🌐 Using base model for general queries...")
            model = self.base_model

        # Step 4: Generate response
        print(f"⚙️  Generating response...\n")

        inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=do_sample,
                top_p=0.95,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

        # Decode and clean response
        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the assistant's response (remove prompt)
        if "Assistant:" in full_response:
            response = full_response.split("Assistant:")[-1].strip()
        else:
            response = full_response

        return {
            "query": query,
            "category": category,
            "confidence": confidence,
            "formatted_prompt": formatted_prompt,
            "response": response
        }

# Initialize MoE system
print("Initializing MoE System...")
moe = MoERouterWithPrompts()


Device set to use cuda:0


Initializing MoE System...
INITIALIZING MoE SYSTEM WITH EXPERT PROMPTS

📊 Loading router...
✓ Router loaded

🧠 Loading base model...


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

✓ Base model loaded: unsloth/Llama-3.2-3B-Instruct-bnb-4bit

🔢 Loading Math adapter...
✓ Math adapter loaded

💻 Loading Code adapter...
✓ Code adapter loaded

✅ MoE SYSTEM READY WITH EXPERT PROMPTS!


In [4]:
# Test with Math Query
print("\n" + "=" * 80)
print("TEST 1: MATH QUERY")
print("=" * 80)

math_result = moe.generate(
    "Solve the equation 2x + 5 = 15 step by step",
    max_new_tokens=512,
    temperature=0.3  # Lower for math (more deterministic)
)

print("\n📊 RESULT:")
print(math_result['response'])
print("\n" + "=" * 80)

# Test with Code Query
print("\n" + "=" * 80)
print("TEST 2: CODE QUERY")
print("=" * 80)

code_result = moe.generate(
    "Write a function to reverse a string",
    max_new_tokens=256,
    temperature=0.2  # Lower for code (more precise)
)

print("\n💻 RESULT:")
print(code_result['response'])
print("\n" + "=" * 80)

# Test with General Query
print("\n" + "=" * 80)
print("TEST 3: GENERAL QUERY")
print("=" * 80)

general_result = moe.generate(
    "Who invented the telephone?",
    max_new_tokens=256,
    temperature=0.7  # Normal for general
)

print("\n🌐 RESULT:")
print(general_result['response'])
print("\n" + "=" * 80)



TEST 1: MATH QUERY

USER QUERY: Solve the equation 2x + 5 = 15 step by step

🔀 Router Decision: MATH (confidence: 100%)
🔢 Using Math expert with step-by-step reasoning prompt...
⚙️  Generating response...


📊 RESULT:
<think>Okay, let's solve the equation 2x + 5 = 15 step by step.

First, I need to isolate x. The equation is 2x + 5 = 15. To start, I'll subtract 5 from both sides to get rid of the +5 on the left side.

2x + 5 - 5 = 15 - 5
2x = 10

Next, I'll divide both sides by 2 to solve for x.

2x / 2 = 10 / 2
x = 5

So, the value of x is 5.

<answer>Answer: The solution to the equation 2x + 5 = 15 is x = 5.


TEST 2: CODE QUERY

USER QUERY: Write a function to reverse a string

🔀 Router Decision: CODE (confidence: 100%)
💻 Using Code expert with function body generation prompt...
⚙️  Generating response...


💻 RESULT:
<think>Okay, I need to write a Python function that takes a string as input and returns the reversed string. Let me recall the steps. The user wants a function that rev

In [1]:
# mixture_of_loras_single_file.py
import unsloth
import torch
import time
import re
import os
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Router Path ---
ROUTER_PATH = "Classifier"

# --- Expert Model Paths (Update these) ---
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Section-D/Universal-Code-Master/final_model"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"

# --- Expert-Specific System Prompts (CRITICAL to match training) ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)
"""
MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""
OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Answer the user's question clearly and concisely."""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: UTILITIES AND MODEL CACHING
# ==============================================================================
# Global cache to hold loaded models, so we don't reload them
model_cache = {
    "router": None,
    "code": None,
    "math": None,
    "other": None, # Will hold the base model
}

class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id):
        self.stop_token_id = stop_token_id
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id

def load_expert_model(expert_name):
    """Lazy-loads an expert model into the cache the first time it's needed."""
    # Check if the model is already in the cache
    if model_cache.get(expert_name):
        return model_cache[expert_name]

    print(f"\n[System] Loading '{expert_name}' expert for the first time... (This may take a moment)")

    if expert_name == "other": # Load base model only
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True
        )
        FastLanguageModel.for_inference(model)
    else: # Load base model + a specific LoRA adapter
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True
        )

        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH

        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))

        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded and ready.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 3: INFERENCE FUNCTIONS FOR EACH EXPERT
# ==============================================================================

### --- Code Expert Function --- ###
def handle_code_query(user_prompt):
    model, tokenizer = load_expert_model("code")

    stop_token_id = tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN)
    stopping_criteria = StoppingCriteriaList([StopOnToken(stop_token_id)])
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    messages = [{"role": "system", "content": CODE_SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs, streamer=streamer, max_new_tokens=2048, temperature=0.2,
        do_sample=True, stopping_criteria=stopping_criteria, pad_token_id=tokenizer.eos_token_id,
    )

    # Self-healing logic for code
    full_response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    heal_and_reprint_if_needed(full_response.replace(CODE_STOP_TOKEN, "").strip())

def heal_and_reprint_if_needed(raw_response):
    was_healed = False
    healed_response = raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "="*40 + " HEALED RESPONSE " + "="*40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("="*96)

### --- Math Expert Function --- ###
def handle_math_query(user_prompt):
    model, tokenizer = load_expert_model("math")

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    full_prompt = f"<|system|>\n{MATH_SYSTEM_PROMPT}\n\n<|user|>\n{user_prompt}\n\n<|assistant|>\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")

    model.generate(
        **inputs, streamer=streamer, max_new_tokens=1024, temperature=0.3,
        top_p=0.9, do_sample=True, repetition_penalty=1.05, pad_token_id=tokenizer.eos_token_id,
    )

### --- Other (Base Model) Function --- ###
def handle_other_query(user_prompt):
    model, tokenizer = load_expert_model("other")

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    messages = [{"role": "system", "content": OTHER_SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    model.generate(
        **inputs, streamer=streamer, max_new_tokens=1024, temperature=0.7,
        do_sample=True, pad_token_id=tokenizer.eos_token_id,
    )

# ==============================================================================
#  STEP 4: MAIN EXECUTION LOOP
# ==============================================================================
if __name__ == "__main__":
    print("Loading the expert router...")
    try:
        router = pipeline("text-classification", model=ROUTER_PATH, device=0)
        model_cache["router"] = router
        print("✅ Router loaded successfully.")
    except Exception as e:
        print(f"❌ CRITICAL ERROR: Could not load the router model from '{ROUTER_PATH}'. Error: {e}")
        exit()

    print("\n\n" + "="*80)
    print("🚀 Mixture-of-LoRAs Chatbot is now running! 🚀")
    print("   Experts available: Code, Math, Other (Base Model)")
    print("   Type 'exit' or 'quit' to close.")
    print("="*80)

    try:
        while True:
            user_input = input("\n>> You: ")
            if user_input.lower() in ["exit", "quit"]:
                break
            if not user_input:
                continue

            # 1. Route the query
            start_time = time.time()
            route_result = model_cache["router"](user_input)[0]
            expert_choice = route_result['label'].lower()
            confidence = route_result['score']
            route_time = time.time() - start_time

            print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {confidence:.1%}, Time: {route_time:.2f}s)")
            print("-" * 50)

            # 2. Dispatch to the appropriate expert function
            start_time = time.time()
            if expert_choice == "code":
                print("🤖 Code Expert:")
                handle_code_query(user_input)
            elif expert_choice == "math":
                print("🤖 Math Expert:")
                handle_math_query(user_input)
            else: # 'other'
                print("🤖 General Assistant:")
                handle_other_query(user_input)
            end_time = time.time()

            print(f"\n(Expert generation time: {end_time - start_time:.2f} seconds)")

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!


Device set to use cuda:0


Loading the expert router...
✅ Router loaded successfully.


🚀 Mixture-of-LoRAs Chatbot is now running! 🚀
   Experts available: Code, Math, Other (Base Model)
   Type 'exit' or 'quit' to close.

[Router] -> Decided: **CODE** (Confidence: 99.6%, Time: 0.18s)
--------------------------------------------------
🤖 Code Expert:

[System] Loading 'code' expert for the first time... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


[System] ✅ 'code' expert is now loaded and ready.
<thinking>
- Restate the problem clearly: The task is to write a Python function that prints the string "Hello" to the console.
- Compare at least two concrete implementation approaches:
  1. Using a built-in function: Python has a built-in `print()` function that can be used to print strings.
  2. Using a custom function: A custom function can be created to print the string "Hello".
- Analyze time and space complexity (Big O) for each:
  1. Using `print()`: The time complexity is O(1) because it involves a constant number of operations. The space complexity is also O(1) as it doesn't use any additional space that scales with input size.
  2. Using a custom function: The time complexity is also O(1) because it involves a constant number of operations. The space complexity is O(1) for the same reason.
- Choose the best approach and justify why: Using the built-in `print()` function is the best approach because it is more Pythonic and eff

In [2]:
# mixture_of_loras_v2.py (with conversational memory)
import unsloth
import torch
import time
import re
import os
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
ROUTER_PATH = "Classifier"

BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

CODE_LORA_PATH = "Section-D/Universal-Code-Master/final_model"

MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"

CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)
"""
MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""

OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Answer the user's question clearly and concisely."""

MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: UTILITIES AND MODEL CACHING
# ==============================================================================
model_cache = {"router": None, "code": None, "math": None, "other": None}

class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id): self.stop_token_id = stop_token_id
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id

def load_expert_model(expert_name):
    if model_cache.get(expert_name): return model_cache[expert_name]
    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")
    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH
        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))
        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)
    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 3: MODIFIED INFERENCE FUNCTIONS (CONTEXT-AWARE)
# ==============================================================================

def execute_expert_generation(expert_name, system_prompt, chat_history):
    """A generic function to handle generation for any expert."""
    model, tokenizer = load_expert_model(expert_name)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # 1. Construct the prompt EXACTLY as each model was trained
    if expert_name == "math":
        prompt_text = f"<|system|>\n{system_prompt}\n\n"
        for message in chat_history:
            prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
        prompt_text += "<|assistant|>\n"
        inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    else:
        messages = [{"role": "system", "content": system_prompt}] + chat_history
        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")

    # 2. Set up base and expert-specific generation configurations
    stopping_criteria_list = StoppingCriteriaList([])
    if expert_name == "code":
        stop_token_id = tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN)
        stopping_criteria_list.append(StopOnToken(stop_token_id))

    generation_config = {
        "streamer": streamer,
        "max_new_tokens": 2048,
        "temperature": 0.2,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "stopping_criteria": stopping_criteria_list,
    }
    if expert_name == "math":
        generation_config.update({"temperature": 0.3, "top_p": 0.9, "repetition_penalty": 1.05})

    # 3. --- THE FINAL, BULLETPROOF FIX ---
    #    Prepare the final keyword arguments for the generate function,
    #    handling both dictionary-like and Tensor inputs.

    final_kwargs = {}
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'):
        # If inputs is a dictionary or a BatchEncoding object, merge it
        final_kwargs = inputs | generation_config
    elif isinstance(inputs, torch.Tensor):
        # If inputs is just a Tensor, wrap it in a dict with the correct key
        final_kwargs = {"input_ids": inputs} | generation_config
    else:
        # Fallback for an unexpected type
        raise TypeError(f"Tokenizer returned an unexpected type: {type(inputs)}")

    # 4. Call generate with the fully prepared and validated kwargs
    outputs = model.generate(**final_kwargs)

    # 5. Decode and return the response
    #    The way we get the length of the prompt is now also dependent on the input type
    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response_text.replace(CODE_STOP_TOKEN, "").strip()

def heal_and_reprint_if_needed(raw_response):
    was_healed = False
    healed_response = raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "="*40 + " HEALED RESPONSE " + "="*40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("="*96)

# ==============================================================================
#  STEP 4: MAIN EXECUTION LOOP (WITH MEMORY)
# ==============================================================================
if __name__ == "__main__":
    print("Loading the expert router...")
    try:
        router = pipeline("text-classification", model=ROUTER_PATH, device=0)
        model_cache["router"] = router
        print("✅ Router loaded successfully.")
    except Exception as e:
        print(f"❌ CRITICAL ERROR: Could not load router. Error: {e}")
        exit()

    # ** NEW **: Initialize the unified chat history
    chat_history = []

    print("\n\n" + "="*80)
    print("🚀 CONVERSATIONAL Mixture-of-LoRAs Chatbot is running! 🚀")
    print("   This chatbot now has memory.")
    print("="*80)

    try:
        while True:
            user_input = input("\n>> You: ")
            if user_input.lower() in ["exit", "quit"]: break
            if not user_input: continue

            # Append new user message to history
            chat_history.append({"role": "user", "content": user_input})

            # 1. Route the LATEST user query
            start_time = time.time()
            route_result = model_cache["router"](user_input)[0]
            expert_choice = route_result['label'].lower()
            confidence = route_result['score']
            print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {confidence:.1%})")
            print("-" * 50)

            # 2. Dispatch to the appropriate expert function WITH the full history
            start_time = time.time()
            assistant_response = ""
            if expert_choice == "code":
                print("🤖 Code Expert:")
                assistant_response = execute_expert_generation("code", CODE_SYSTEM_PROMPT, chat_history)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")
                heal_and_reprint_if_needed(assistant_response)
            elif expert_choice == "math":
                print("🤖 Math Expert:")
                assistant_response = execute_expert_generation("math", MATH_SYSTEM_PROMPT, chat_history)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")
            else: # 'other'
                print("🤖 General Assistant:")
                assistant_response = execute_expert_generation("other", OTHER_SYSTEM_PROMPT, chat_history)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")

            # ** NEW **: Append the assistant's response to the history
            chat_history.append({"role": "assistant", "content": assistant_response})

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Device set to use cuda:0


Loading the expert router...
✅ Router loaded successfully.


🚀 CONVERSATIONAL Mixture-of-LoRAs Chatbot is running! 🚀
   This chatbot now has memory.

[Router] -> Decided: **MATH** (Confidence: 99.6%)
--------------------------------------------------
🤖 Math Expert:

[System] Loading 'math' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
[System] ✅ 'math' expert is now loaded.
<think>Okay, the user wants to know what 5 plus 5 is. Let me start by recalling basic arithmetic operations. Addition is the operation where you combine two numbe

In [2]:

import unsloth
import torch
import time
import re
import os
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Paths ---
ROUTER_PATH = "Classifier"
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Section-D/Universal-Code-Master/final_model"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"
KNOWLEDGE_BASE_PATH = "knowledge_base"

# --- System Prompts ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)
"""
MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""
OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Answer the user's question clearly and concisely."""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: RAG SYSTEM IMPLEMENTATION
# ==============================================================================
class CodeKnowledgeRAG:
    def __init__(self, knowledge_base_path):
        print("[RAG System] Initializing...")
        self.knowledge_base_path = Path(knowledge_base_path)
        if not self.knowledge_base_path.exists():
            print(f"[RAG System] ⚠️  Warning: Knowledge base path '{knowledge_base_path}' does not exist. Creating it.")
            self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
        self.documents = []
        self.index = None
        self._load_and_build()
        if not self.documents:
            print("[RAG System] ⚠️  Warning: No .txt files found in knowledge base. RAG will not retrieve any context.")
        else:
            print(f"[RAG System] ✅ Ready. Loaded {len(self.documents)} document sections.")

    def _load_and_build(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip(): self.documents.append(section.strip())
        if not self.documents: return

        embeddings = self.embedding_model.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, top_k=2):
        if not self.index or not self.documents: return []
        print(f"[RAG System] Retrieving top-{top_k} documents...")
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        retrieved_docs = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        print(f"[RAG System] -> Found {len(retrieved_docs)} relevant documents.")
        return retrieved_docs

# ==============================================================================
#  STEP 3: UTILITIES AND MODEL CACHING
# ==============================================================================
model_cache = {"router": None, "code": None, "math": None, "other": None, "rag": None}

class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id): self.stop_token_id = stop_token_id
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id

def load_expert_model(expert_name):
    if model_cache.get(expert_name): return model_cache[expert_name]
    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")

    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH
        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))
        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 4: INFERENCE AND FORMATTING FUNCTIONS
# ==============================================================================
def execute_expert_generation(expert_name, system_prompt, chat_history, rag_context=""):
    model, tokenizer = load_expert_model(expert_name)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    final_chat_history = list(chat_history)
    if rag_context:
        last_user_message = final_chat_history[-1]['content']
        injected_prompt = f"Use the following reference information to help answer the question:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {last_user_message}"
        final_chat_history[-1]['content'] = injected_prompt

    if expert_name == "math":
        prompt_text = f"<|system|>\n{system_prompt}\n\n"
        for message in final_chat_history:
            prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
        prompt_text += "<|assistant|>\n"
        inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    else:
        messages = [{"role": "system", "content": system_prompt}] + final_chat_history
        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")

    stopping_criteria_list = StoppingCriteriaList([])
    if expert_name == "code":
        stop_token_id = tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN)
        stopping_criteria_list.append(StopOnToken(stop_token_id))

    generation_config = {
        "streamer": streamer, "max_new_tokens": 2048, "temperature": 0.2, "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id, "stopping_criteria": stopping_criteria_list,
    }
    if expert_name == "math":
        generation_config.update({"temperature": 0.3, "top_p": 0.9, "repetition_penalty": 1.05})

    final_kwargs = {}
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'):
        final_kwargs = inputs | generation_config
    elif isinstance(inputs, torch.Tensor):
        final_kwargs = {"input_ids": inputs} | generation_config
    else:
        raise TypeError(f"Tokenizer returned an unexpected type: {type(inputs)}")

    outputs = model.generate(**final_kwargs)

    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response_text.replace(CODE_STOP_TOKEN, "").strip()

def heal_and_reprint_if_needed(raw_response):
    was_healed = False
    healed_response = raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "="*40 + " HEALED RESPONSE " + "="*40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("="*96)

# ==============================================================================
#  STEP 5: MAIN EXECUTION LOOP
# ==============================================================================
if __name__ == "__main__":
    print("Initializing system...")
    try:
        model_cache["router"] = pipeline("text-classification", model=ROUTER_PATH, device=0)
        print("✅ Router loaded successfully.")
        model_cache["rag"] = CodeKnowledgeRAG(knowledge_base_path=KNOWLEDGE_BASE_PATH)
    except Exception as e:
        print(f"❌ CRITICAL ERROR during initialization. Error: {e}")
        exit()

    chat_history = []
    print("\n\n" + "="*80)
    print("🚀 CONVERSATIONAL Mixture-of-LoRAs + RAG Chatbot is running! 🚀")
    print("="*80)

    try:
        while True:
            user_input = input("\n>> You: ")
            if user_input.lower() in ["exit", "quit"]: break
            if not user_input: continue

            chat_history.append({"role": "user", "content": user_input})

            route_result = model_cache["router"](user_input)[0]
            expert_choice = route_result['label'].lower()
            print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {route_result['score']:.1%})")

            rag_context_str = ""
            if expert_choice == "code":
                rag_system = model_cache["rag"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=2)
                if retrieved_docs:
                    rag_context_str = "\n\n".join(retrieved_docs)

            start_time = time.time()
            assistant_response = ""

            if expert_choice == "code":
                print("🤖 Code Expert (with RAG):")
                assistant_response = execute_expert_generation("code", CODE_SYSTEM_PROMPT, chat_history, rag_context=rag_context_str)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")
                heal_and_reprint_if_needed(assistant_response)
            elif expert_choice == "math":
                print("🤖 Math Expert:")
                assistant_response = execute_expert_generation("math", MATH_SYSTEM_PROMPT, chat_history)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")
            else:
                print("🤖 General Assistant:")
                assistant_response = execute_expert_generation("other", OTHER_SYSTEM_PROMPT, chat_history)
                print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")

            chat_history.append({"role": "assistant", "content": assistant_response})

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Device set to use cuda:0


Initializing system...
✅ Router loaded successfully.
[RAG System] Initializing...
[RAG System] ✅ Ready. Loaded 19 document sections.


🚀 CONVERSATIONAL Mixture-of-LoRAs + RAG Chatbot is running! 🚀

[Router] -> Decided: **MATH** (Confidence: 99.6%)
🤖 Math Expert:

[System] Loading 'math' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
[System] ✅ 'math' expert is now loaded.
<think>Okay, the user wants to know what 5 plus 5 is. Let me check if I remember basic arithmetic. Addition is straightforward here. So, 5 plus 5 should be 10. Wait,

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[System] ✅ 'code' expert is now loaded.
<think>
The task is to write a Python function that prints "Hello" to the console. This is a very basic operation, but it's a good example to demonstrate how to structure a simple print statement in Python.

Two possible approaches are:
1. Directly printing "Hello" using the `print()` function.
2. Using a function that takes no arguments and prints "Hello".

Both approaches are straightforward and don't require any complex logic. The first approach is more concise and directly achieves the goal. The second approach is useful if we want to encapsulate the print operation within a function for reusability or other reasons.

Time complexity for both approaches is O(1) because they involve a constant amount of work regardless of the input size. Space complexity is also O(1) as they don't use any additional space that scales with input size.

The best approach is to directly print "Hello" using the `print()` function because it's the most concise and 

In [1]:
# ultimate_chatbot_v3.py

import torch
import time
import re
import os
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Paths (Update these if your folder structure is different) ---
ROUTER_PATH = "Classifier"
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Section-D/Universal-Code-Master/final_model"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"
KNOWLEDGE_BASE_PATH = "knowledge_base"

# --- System Prompts ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)

LANGUAGE-SPECIFIC RULES:
- Python: 4-space indentation, type hints, PEP 8 compliance
- JavaScript: const/let (no var), proper semicolons, ES6+ syntax
- C++: STL containers, RAII, proper memory management, const correctness
- Java: Proper access modifiers, exception handling, naming conventions

EDGE CASE CHECKLIST (verify in <thinking>):
✓ Empty collection (list/array/string)
✓ Single element
✓ Null/None/undefined values
✓ Negative numbers (if applicable)
✓ Zero
✓ Maximum/minimum integer values
✓ Duplicate elements
✓ Already sorted/reverse sorted (for sorting problems)
✓ Invalid input types"""

MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""
OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Answer the user's question clearly and concisely."""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: RAG SYSTEM IMPLEMENTATION
# ==============================================================================
class CodeKnowledgeRAG:
    """Retrieval system over your code knowledge base."""
    def __init__(self, knowledge_base_path):
        print("[RAG System] Initializing...")
        self.knowledge_base_path = Path(knowledge_base_path)
        if not self.knowledge_base_path.exists():
            print(f"[RAG System] ⚠️  Warning: Knowledge base path '{knowledge_base_path}' does not exist. Creating it.")
            self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
        self.documents = []
        self.index = None
        self._load_and_build()
        if not self.documents:
            print("[RAG System] ⚠️  Warning: No .txt files found in knowledge base. RAG will not retrieve any context.")
        else:
            print(f"[RAG System] ✅ Ready. Loaded {len(self.documents)} document sections.")

    def _load_and_build(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip(): self.documents.append(section.strip())
        if not self.documents: return

        embeddings = self.embedding_model.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, top_k=2):
        if not self.index or not self.documents: return []
        print(f"[RAG System] Retrieving top-{top_k} documents...")
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        retrieved_docs = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        print(f"[RAG System] -> Found {len(retrieved_docs)} relevant documents.")
        return retrieved_docs

# ==============================================================================
#  STEP 3: UTILITIES AND MODEL CACHING
# ==============================================================================
model_cache = {"router": None, "code": None, "math": None, "other": None, "rag": None}

class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id): self.stop_token_id = stop_token_id
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id

def load_expert_model(expert_name):
    """Lazy-loads an expert model into the cache the first time it's needed."""
    if model_cache.get(expert_name): return model_cache[expert_name]
    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")

    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH
        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))
        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 4: INFERENCE AND FORMATTING FUNCTIONS
# ==============================================================================
def execute_expert_generation(expert_name, system_prompt, chat_history, rag_context=""):
    """A generic function to handle generation for any expert."""
    model, tokenizer = load_expert_model(expert_name)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    final_chat_history = list(chat_history)
    if rag_context:
        last_user_message = final_chat_history[-1]['content']
        injected_prompt = f"Use the following reference information to help answer the question:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {last_user_message}"
        final_chat_history[-1]['content'] = injected_prompt

    if expert_name == "math":
        prompt_text = f"<|system|>\n{system_prompt}\n\n"
        for message in final_chat_history:
            prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
        prompt_text += "<|assistant|>\n"
        inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    else:
        messages = [{"role": "system", "content": system_prompt}] + final_chat_history
        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to("cuda")

    stopping_criteria_list = StoppingCriteriaList([])
    if expert_name == "code":
        stop_token_id = tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN)
        stopping_criteria_list.append(StopOnToken(stop_token_id))

    generation_config = {
        "streamer": streamer, "max_new_tokens": 2048, "temperature": 0.2, "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id, "stopping_criteria": stopping_criteria_list,
    }
    if expert_name == "math":
        generation_config.update({"temperature": 0.3, "top_p": 0.9, "repetition_penalty": 1.05})

    final_kwargs = {}
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'):
        final_kwargs = inputs | generation_config
    elif isinstance(inputs, torch.Tensor):
        final_kwargs = {"input_ids": inputs} | generation_config
    else:
        raise TypeError(f"Tokenizer returned an unexpected type: {type(inputs)}")

    outputs = model.generate(**final_kwargs)

    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response_text.replace(CODE_STOP_TOKEN, "").strip()

def heal_and_reprint_if_needed(raw_response):
    """Checks the response for closing tags. If any are missing, it prints a healed version."""
    was_healed = False
    healed_response = raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "="*40 + " HEALED RESPONSE " + "="*40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("="*96)

# ==============================================================================
#  STEP 5: MAIN EXECUTION LOOP
# ==============================================================================
if __name__ == "__main__":
    print("Initializing system...")
    try:
        model_cache["router"] = pipeline("text-classification", model=ROUTER_PATH, device=0)
        print("✅ Router loaded successfully.")
        model_cache["rag"] = CodeKnowledgeRAG(knowledge_base_path=KNOWLEDGE_BASE_PATH)
    except Exception as e:
        print(f"❌ CRITICAL ERROR during initialization. Error: {e}")
        exit()

    chat_history = []
    print("\n\n" + "="*80)
    print("🚀 ADVANCED Mixture-of-LoRAs + RAG Chatbot is running! 🚀")
    print("   You can override the router by ending your prompt with --code, --math, or --other")
    print("="*80)

    try:
        while True:
            user_input_raw = input("\n>> You: ")
            if user_input_raw.lower() in ["exit", "quit"]: break
            if not user_input_raw: continue

            user_input = user_input_raw
            expert_choice = None
            override_flags = {"--code": "code", "--math": "math", "--other": "other"}

            for flag, expert in override_flags.items():
                if user_input.lower().strip().endswith(flag):
                    expert_choice = expert
                    user_input = user_input[:-len(flag)].strip()
                    print(f"\n[User Override] -> Routing to **{expert_choice.upper()}** expert.")
                    break

            if expert_choice is None:
                route_result = model_cache["router"](user_input)[0]
                expert_choice = route_result['label'].lower()
                print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {route_result['score']:.1%})")

            chat_history.append({"role": "user", "content": user_input})
            print("-" * 50)

            rag_context_str = ""
            if expert_choice == "code":
                rag_system = model_cache["rag"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=2)
                if retrieved_docs:
                    rag_context_str = "\n\n".join(retrieved_docs)

            start_time = time.time()
            assistant_response = ""

            if expert_choice == "code":
                print("🤖 Code Expert (with RAG):")
                assistant_response = execute_expert_generation("code", CODE_SYSTEM_PROMPT, chat_history, rag_context=rag_context_str)
                heal_and_reprint_if_needed(assistant_response)
            elif expert_choice == "math":
                print("🤖 Math Expert:")
                assistant_response = execute_expert_generation("math", MATH_SYSTEM_PROMPT, chat_history)
            else:
                print("🤖 General Assistant:")
                assistant_response = execute_expert_generation("other", OTHER_SYSTEM_PROMPT, chat_history)

            print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")

            chat_history.append({"role": "assistant", "content": assistant_response})

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info

Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Device set to use cuda:0


Initializing system...
✅ Router loaded successfully.
[RAG System] Initializing...
[RAG System] ✅ Ready. Loaded 19 document sections.


🚀 ADVANCED Mixture-of-LoRAs + RAG Chatbot is running! 🚀
   You can override the router by ending your prompt with --code, --math, or --other

[User Override] -> Routing to **MATH** expert.
--------------------------------------------------
🤖 Math Expert:

[System] Loading 'math' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
[System] ✅ 'math' expert is now loaded.
<think>Okay, the user just sent a mess

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[System] ✅ 'code' expert is now loaded.
<think>
The task is to write a Python function that prints "Hello". This is a very basic task that doesn't require any complex logic or file operations. It's a straightforward print statement.

Two possible approaches are:
1. Direct print statement: `print("Hello")`
2. Using a function: `def hello_print(): print("Hello")`

Both approaches are simple and straightforward. The direct print statement is more concise and directly achieves the task. The function approach is useful if we want to reuse the "Hello" message in different parts of the code.

Time complexity: Both approaches are O(1) because they involve a constant amount of work.
Space complexity: Both approaches are O(1) because they don't use any additional space that scales with input size.

The best approach is to use a direct print statement because it's concise and directly achieves the task. We don't need to create a function for a simple print statement like this.

Edge cases:
- Empt

In [4]:
# ultimate_chatbot_v5.py

import unsloth
import torch
import time
import re
import os
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Paths ---
ROUTER_PATH = "Classifier"
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Section-D/Universal-Code-Master/final_model"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"

# --- Knowledge Base Paths ---
CODE_KB_PATH = "knowledge_base/Base"
OTHER_KB_PATH = "knowledge_base/Code"

# --- System Prompts ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)

LANGUAGE-SPECIFIC RULES:
- Python: 4-space indentation, type hints, PEP 8 compliance
- JavaScript: const/let (no var), proper semicolons, ES6+ syntax
- C++: STL containers, RAII, proper memory management, const correctness
- Java: Proper access modifiers, exception handling, naming conventions

EDGE CASE CHECKLIST (verify in <thinking>):
✓ Empty collection (list/array/string)
✓ Single element
✓ Null/None/undefined values
✓ Negative numbers (if applicable)
✓ Zero
✓ Maximum/minimum integer values
✓ Duplicate elements
✓ Already sorted/reverse sorted (for sorting problems)
✓ Invalid input types"""

MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""
OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Use the provided reference information to answer the user's question accurately."""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: RAG AND UTILITIES
# ==============================================================================
model_cache = {"router": None, "code": None, "math": None, "other": None, "rag_code": None, "rag_other": None}

class KnowledgeRAG:
    """A generic retrieval system for a specific knowledge base path."""
    def __init__(self, name, knowledge_base_path):
        print(f"[{name} RAG System] Initializing...")
        self.name = name
        self.knowledge_base_path = Path(knowledge_base_path)
        if not self.knowledge_base_path.exists():
            print(f"[{self.name} RAG System] ⚠️  Warning: Path '{knowledge_base_path}' does not exist. Creating it.")
            self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
        self.documents = []
        self.index = None
        self._load_and_build()
        if not self.documents:
            print(f"[{self.name} RAG System] ⚠️  Warning: No .txt files found. RAG will be inactive for this expert.")
        else:
            print(f"[{self.name} RAG System] ✅ Ready. Loaded {len(self.documents)} document sections.")

    def _load_and_build(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip(): self.documents.append(section.strip())
        if not self.documents: return

        embeddings = self.embedding_model.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, top_k=2):
        if not self.index or not self.documents: return []
        print(f"[{self.name} RAG System] Retrieving top-{top_k} documents...")
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        retrieved_docs = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        print(f"[{self.name} RAG System] -> Found {len(retrieved_docs)} relevant documents.")
        return retrieved_docs

class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id): self.stop_token_id = stop_token_id
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id

def load_expert_model(expert_name):
    """Lazy-loads an expert model into the cache the first time it's needed."""
    if model_cache.get(expert_name): return model_cache[expert_name]
    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")

    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH, dtype=None, load_in_4bit=True)
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH
        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))
        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 3: ISOLATED EXPERT HANDLERS
# ==============================================================================
# ==============================================================================
#  STEP 3: ISOLATED EXPERT HANDLERS (DEFINITIVE FIX)
# ==============================================================================
# ==============================================================================
#  STEP 3: ISOLATED EXPERT HANDLERS (DEFINITIVE FIX)
# ==============================================================================
# ==============================================================================
#  STEP 3: ISOLATED EXPERT HANDLERS (DEFINITIVE FINAL FIX)
# ==============================================================================
def handle_code_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("code")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    injected_history = list(chat_history)
    if rag_context:
        last_user_message = injected_history[-1]['content']
        injected_prompt = f"Use the following reference information:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {last_user_message}"
        injected_history[-1]['content'] = injected_prompt

    messages = [{"role": "system", "content": CODE_SYSTEM_PROMPT}] + injected_history
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    stop_token_id = tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN)
    stopping_criteria = StoppingCriteriaList([StopOnToken(stop_token_id)])

    generation_config = {
        "streamer": streamer, "max_new_tokens": 2048, "temperature": 0.2,
        "do_sample": True, "stopping_criteria": stopping_criteria,
        "pad_token_id": tokenizer.eos_token_id
    }

    # --- FINAL FIX: Robustly create final_kwargs ---
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'): final_kwargs = inputs | generation_config
    else: final_kwargs = {"input_ids": inputs} | generation_config

    outputs = model.generate(**final_kwargs)

    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    clean_response = response_text.replace(CODE_STOP_TOKEN, "").strip()
    heal_and_reprint_if_needed(clean_response)
    return clean_response

def handle_math_query(chat_history):
    model, tokenizer = load_expert_model("math")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    prompt_text = f"<|system|>\n{MATH_SYSTEM_PROMPT}\n\n"
    for message in chat_history:
        prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
    prompt_text += "<|assistant|>\n"
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    generation_config = {
        "streamer": streamer, "max_new_tokens": 1024, "temperature": 0.3,
        "top_p": 0.9, "do_sample": True, "repetition_penalty": 1.05,
        "pad_token_id": tokenizer.eos_token_id
    }

    # --- FINAL FIX: Robustly create final_kwargs ---
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'): final_kwargs = inputs | generation_config
    else: final_kwargs = {"input_ids": inputs} | generation_config

    outputs = model.generate(**final_kwargs)

    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response_text.strip()

def handle_other_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("other")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    injected_history = list(chat_history)
    if rag_context:
        last_user_message = injected_history[-1]['content']
        injected_prompt = f"Use the following reference information:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {last_user_message}"
        injected_history[-1]['content'] = injected_prompt

    messages = [{"role": "system", "content": OTHER_SYSTEM_PROMPT}] + injected_history
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    generation_config = {
        "streamer": streamer, "max_new_tokens": 1024, "temperature": 0.7,
        "do_sample": True, "pad_token_id": tokenizer.eos_token_id
    }

    # --- FINAL FIX: Robustly create final_kwargs ---
    if isinstance(inputs, dict) or hasattr(inputs, 'keys'): final_kwargs = inputs | generation_config
    else: final_kwargs = {"input_ids": inputs} | generation_config

    outputs = model.generate(**final_kwargs)

    prompt_length = inputs.shape[1] if isinstance(inputs, torch.Tensor) else inputs.input_ids.shape[1]
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response_text.strip()

def heal_and_reprint_if_needed(raw_response):
    was_healed = False
    healed_response = raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "="*40 + " HEALED RESPONSE " + "="*40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("="*96)

# ==============================================================================
#  STEP 4: MAIN EXECUTION LOOP
# ==============================================================================
if __name__ == "__main__":
    print("Initializing system...")
    try:
        model_cache["router"] = pipeline("text-classification", model=ROUTER_PATH, device=0)
        print("✅ Router loaded successfully.")
        model_cache["rag_code"] = KnowledgeRAG("Code", knowledge_base_path=CODE_KB_PATH)
        model_cache["rag_other"] = KnowledgeRAG("Other", knowledge_base_path=OTHER_KB_PATH)
    except Exception as e:
        print(f"❌ CRITICAL ERROR during initialization. Error: {e}")
        exit()

    chat_history = []
    print("\n\n" + "="*80)
    print("🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot is running! 🚀")
    print("   You can override the router by ending your prompt with --code, --math, or --other")
    print("="*80)

    try:
        while True:
            user_input_raw = input("\n>> You: ")
            if user_input_raw.lower() in ["exit", "quit"]: break
            if not user_input_raw: continue

            user_input = user_input_raw
            expert_choice = None
            override_flags = {"--code": "code", "--math": "math", "--other": "other"}

            for flag, expert in override_flags.items():
                if user_input.lower().strip().endswith(flag):
                    expert_choice = expert
                    user_input = user_input[:-len(flag)].strip()
                    print(f"\n[User Override] -> Routing to **{expert_choice.upper()}** expert.")
                    break

            if expert_choice is None:
                route_result = model_cache["router"](user_input)[0]
                expert_choice = route_result['label'].lower()
                print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {route_result['score']:.1%})")

            chat_history.append({"role": "user", "content": user_input})
            print("-" * 50)

            rag_context_str = ""
            if expert_choice == "code":
                rag_system = model_cache["rag_code"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=2)
                if retrieved_docs: rag_context_str = "\n\n".join(retrieved_docs)
            elif expert_choice == "other":
                rag_system = model_cache["rag_other"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=3)
                if retrieved_docs: rag_context_str = "\n\n".join(retrieved_docs)

            start_time = time.time()
            assistant_response = ""

            print(f"🤖 {expert_choice.capitalize()} Expert:")
            if expert_choice == "code":
                assistant_response = handle_code_query(chat_history, rag_context=rag_context_str)
            elif expert_choice == "math":
                assistant_response = handle_math_query(chat_history)
            else: # 'other'
                assistant_response = handle_other_query(chat_history, rag_context=rag_context_str)

            print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")
            chat_history.append({"role": "assistant", "content": assistant_response})

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Device set to use cuda:0


Initializing system...
✅ Router loaded successfully.
[Code RAG System] Initializing...
[Code RAG System] ✅ Ready. Loaded 40 document sections.
[Other RAG System] Initializing...
[Other RAG System] ✅ Ready. Loaded 19 document sections.


🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot is running! 🚀
   You can override the router by ending your prompt with --code, --math, or --other

[User Override] -> Routing to **MATH** expert.
--------------------------------------------------
🤖 Math Expert:

[System] Loading 'math' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloadin

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[System] ✅ 'code' expert is now loaded.
<think>
- The task is to write a Python function that simply prints "hello".
- Two possible approaches are: 
  1. Directly using the `print()` function with the string "hello".
  2. Creating a function that takes no arguments and calls `print("hello")`.
- For the first approach, the time complexity is O(1) because it involves a constant number of operations, and the space complexity is also O(1) as it only uses a constant amount of space.
- For the second approach, the time complexity remains O(1) because it still involves a constant number of operations, and the space complexity is O(1) for the same reason.
- The first approach is more straightforward and efficient, so it's the better choice.
- Key edge cases include: an empty input (which is not applicable here), a None input, and a negative number of arguments (which is not applicable here).
- Justification: the first approach is chosen because it is more concise and directly achieves the task

Unsloth: Input IDs of shape torch.Size([1, 2225]) with length 2225 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.



[Router] -> Decided: **OTHER** (Confidence: 99.5%)
--------------------------------------------------
[Other RAG System] Retrieving top-3 documents...
[Other RAG System] -> Found 3 relevant documents.
🤖 Other Expert:
 stack and push the string onto the stack
- Pop the string from the stack
- Check if the stack is empty or if the stack is empty
- If the stack is not empty, pop the string from the stack
- If the stack is empty, the string is valid
---END REFERENCE---

Question: what is the current weather in your location?

(Expert generation time: 2.62 seconds)


Exiting...

Chat session finished.


In [2]:
# ultimate_chatbot_v8.py - Cross-Contamination & Context Focus Fixed
import unsloth
import torch
import time
import re
import os
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Paths ---
ROUTER_PATH = "Classifier"
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Final-Dynamic-Model/final_model(Code)"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"
CODE_KB_PATH = "knowledge_base/Code"
OTHER_KB_PATH = "knowledge_base/Base"

# --- System Prompts ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)

LANGUAGE-SPECIFIC RULES:
- Python: 4-space indentation, type hints, PEP 8 compliance
- JavaScript: const/let (no var), proper semicolons, ES6+ syntax
- C++: STL containers, RAII, proper memory management, const correctness
- Java: Proper access modifiers, exception handling, naming conventions

EDGE CASE CHECKLIST (verify in <thinking>):
✓ Empty collection (list/array/string)
✓ Single element
✓ Null/None/undefined values
✓ Negative numbers (if applicable)
✓ Zero
✓ Maximum/minimum integer values
✓ Duplicate elements
✓ Already sorted/reverse sorted (for sorting problems)
✓ Invalid input types"""

MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""

OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Use the provided reference information to answer the user's question accurately."""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: RAG AND UTILITIES
# ==============================================================================
model_cache = {"router": None, "code": None, "math": None, "other": None, "rag_code": None, "rag_other": None}


class KnowledgeRAG:
    """A generic retrieval system for a specific knowledge base path."""

    def __init__(self, name, knowledge_base_path):
        print(f"[{name} RAG System] Initializing...")
        self.name = name
        self.knowledge_base_path = Path(knowledge_base_path)

        if not self.knowledge_base_path.exists():
            print(f"[{self.name} RAG System] ⚠️  Warning: Path '{knowledge_base_path}' does not exist. Creating it.")
            self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
        self.documents = []
        self.index = None
        self._load_and_build()

        if not self.documents:
            print(f"[{self.name} RAG System] ⚠️  Warning: No .txt files found. RAG will be inactive for this expert.")
        else:
            print(f"[{self.name} RAG System] ✅ Ready. Loaded {len(self.documents)} document sections.")

    def _load_and_build(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip():
                    self.documents.append(section.strip())

        if not self.documents:
            return

        embeddings = self.embedding_model.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, top_k=2):
        if not self.index or not self.documents:
            return []

        print(f"[{self.name} RAG System] Retrieving top-{top_k} documents...")
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        retrieved_docs = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        print(f"[{self.name} RAG System] -> Found {len(retrieved_docs)} relevant documents.")
        return retrieved_docs


class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id):
        self.stop_token_id = stop_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id


def load_expert_model(expert_name):
    """Lazy-loads an expert model into the cache the first time it's needed."""
    if model_cache.get(expert_name):
        return model_cache[expert_name]

    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")

    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=BASE_MODEL_PATH,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=None,
            load_in_4bit=True
        )
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=BASE_MODEL_PATH,
            max_seq_length=MAX_SEQ_LENGTH,
            dtype=None,
            load_in_4bit=True
        )
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH

        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))

        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]


def filter_to_single_query(chat_history):
    """
    CRITICAL FIX: For simple queries, only pass the MOST RECENT user message.
    This prevents experts from getting confused by previous unrelated questions.

    Returns: List with only the last user message
    """
    if not chat_history:
        return []

    # Get the last user message (will be the most recent due to chat_history.append order)
    for msg in reversed(chat_history):
        if msg['role'] == 'user':
            return [{"role": "user", "content": msg['content']}]

    return []


def heal_and_reprint_if_needed(raw_response):
    was_healed, healed_response = False, raw_response

    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True

    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True

    if was_healed:
        print("\n" + "=" * 40 + " HEALED RESPONSE " + "=" * 40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("=" * 96)

    return healed_response

# ==============================================================================
#  STEP 3: ISOLATED EXPERT HANDLERS (CROSS-CONTAMINATION & CONTEXT FOCUS FIXED)
# ==============================================================================
def handle_code_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("code")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # For code: Pass full user history to allow building on previous steps
    filtered_history = [msg for msg in chat_history if msg['role'] == 'user']

    if rag_context:
        filtered_history[-1]['content'] = f"Use this reference:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {filtered_history[-1]['content']}"

    messages = [{"role": "system", "content": CODE_SYSTEM_PROMPT}] + filtered_history
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    stopping_criteria = StoppingCriteriaList([StopOnToken(tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN))])
    generation_config = {
        "streamer": streamer,
        "max_new_tokens": 2048,
        "temperature": 0.2,
        "do_sample": True,
        "stopping_criteria": stopping_criteria,
        "pad_token_id": tokenizer.eos_token_id
    }

    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)

    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    clean_response = response_text.replace(CODE_STOP_TOKEN, "").strip()
    heal_and_reprint_if_needed(clean_response)
    return clean_response


def handle_math_query(chat_history):
    model, tokenizer = load_expert_model("math")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # === CRITICAL FIX: Pass ONLY the most recent user query for focused reasoning ===
    filtered_history = filter_to_single_query(chat_history)

    prompt_text = f"<|system|>\n{MATH_SYSTEM_PROMPT}\n\n"
    for message in filtered_history:
        prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
    prompt_text += "<|assistant|>\n"

    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    generation_config = {
        "streamer": streamer,
        "max_new_tokens": 1024,
        "temperature": 0.3,
        "top_p": 0.9,
        "do_sample": True,
        "repetition_penalty": 1.05,
        "pad_token_id": tokenizer.eos_token_id
    }

    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response_text.strip()


def handle_other_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("other")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # For general Q&A: Pass only the most recent query for clarity
    filtered_history = filter_to_single_query(chat_history)

    if rag_context:
        filtered_history[-1]['content'] = f"Use this reference:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {filtered_history[-1]['content']}"

    messages = [{"role": "system", "content": OTHER_SYSTEM_PROMPT}] + filtered_history
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    generation_config = {
        "streamer": streamer,
        "max_new_tokens": 1024,
        "temperature": 0.7,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id
    }

    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response_text.strip()


# ==============================================================================
#  STEP 4: MAIN EXECUTION LOOP
# ==============================================================================
if __name__ == "__main__":
    print("Initializing system...")
    try:
        model_cache["router"] = pipeline("text-classification", model=ROUTER_PATH, device=0)
        print("✅ Router loaded successfully.")
        model_cache["rag_code"] = KnowledgeRAG("Code", knowledge_base_path=CODE_KB_PATH)
        model_cache["rag_other"] = KnowledgeRAG("Other", knowledge_base_path=OTHER_KB_PATH)
    except Exception as e:
        print(f"❌ CRITICAL ERROR during initialization. Error: {e}")
        exit()

    chat_history = []
    print("\n\n" + "=" * 80)
    print("🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot v8 is running! 🚀")
    print("   CROSS-CONTAMINATION FIX: Each expert sees ONLY its own context type")
    print("   MATH/OTHER: Single query | CODE: Full user history")
    print("   You can override the router by ending your prompt with --code, --math, or --other")
    print("=" * 80)

    try:
        while True:
            user_input_raw = input("\n>> You: ")
            if user_input_raw.lower() in ["exit", "quit"]:
                break
            if not user_input_raw:
                continue

            user_input = user_input_raw
            expert_choice = None
            override_flags = {"--code": "code", "--math": "math", "--other": "other"}

            for flag, expert in override_flags.items():
                if user_input.lower().strip().endswith(flag):
                    expert_choice = expert
                    user_input = user_input[:-len(flag)].strip()
                    print(f"\n[User Override] -> Routing to **{expert_choice.upper()}** expert.")
                    break

            if expert_choice is None:
                route_result = model_cache["router"](user_input)[0]
                expert_choice = route_result['label'].lower()
                print(f"\n[Router] -> Decided: **{expert_choice.upper()}** (Confidence: {route_result['score']:.1%})")

            # Add to unified history for potential future use (not passed to experts)
            chat_history.append({"role": "user", "content": user_input})
            print("-" * 50)

            rag_context_str = ""
            if expert_choice == "code":
                rag_system = model_cache["rag_code"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=2)
                if retrieved_docs:
                    rag_context_str = "\n\n".join(retrieved_docs)
            elif expert_choice == "other":
                rag_system = model_cache["rag_other"]
                retrieved_docs = rag_system.retrieve(user_input, top_k=3)
                if retrieved_docs:
                    rag_context_str = "\n\n".join(retrieved_docs)

            start_time = time.time()
            assistant_response = ""

            print(f"🤖 {expert_choice.capitalize()} Expert:")
            if expert_choice == "code":
                assistant_response = handle_code_query(chat_history, rag_context=rag_context_str)
            elif expert_choice == "math":
                assistant_response = handle_math_query(chat_history)
            else:  # 'other'
                assistant_response = handle_other_query(chat_history, rag_context=rag_context_str)

            print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")

            # Add response to unified history for conversation continuity
            chat_history.append({"role": "assistant", "content": assistant_response})

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Device set to use cuda:0


Initializing system...
✅ Router loaded successfully.
[Code RAG System] Initializing...
[Code RAG System] ✅ Ready. Loaded 19 document sections.
[Other RAG System] Initializing...
[Other RAG System] ✅ Ready. Loaded 40 document sections.


🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot v8 is running! 🚀
   CROSS-CONTAMINATION FIX: Each expert sees ONLY its own context type
   MATH/OTHER: Single query | CODE: Full user history
   You can override the router by ending your prompt with --code, --math, or --other

[Router] -> Decided: **CODE** (Confidence: 99.6%)
--------------------------------------------------
[Code RAG System] Retrieving top-2 documents...
[Code RAG System] -> Found 2 relevant documents.
🤖 Code Expert:

[System] Loading 'code' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6.

In [4]:
# ultimate_chatbot_v8.py - Router Agent with Overrides and Debugging

import unsloth
import torch
import time
import re
import os
import json
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList, TextStreamer
from unsloth import FastLanguageModel

# ==============================================================================
#  STEP 1: MASTER CONFIGURATION
# ==============================================================================
# --- Paths ---
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
CODE_LORA_PATH = "Final-Dynamic-Model/final_model(Code)"
MATH_LORA_PATH = "Final-Dynamic-Model/final_model(Math)"
CODE_KB_PATH = "knowledge_base/Code"
OTHER_KB_PATH = "knowledge_base/Base"

# --- System Prompts ---
CODE_SYSTEM_PROMPT = """You are an elite software engineer who writes syntactically perfect, logically sound code across all programming languages.

MANDATORY THINKING PROCESS - You MUST use <thinking> tags before <answer>:

Inside <thinking>:
1. RESTATE THE PROBLEM: Paraphrase the task in your own words to confirm understanding
2. IDENTIFY CONSTRAINTS: List all input/output specs, data types, time/space complexity requirements
3. ENUMERATE EDGE CASES: Empty inputs, null values, negative numbers, zero, boundary conditions, duplicates, special characters
4. COMPARE APPROACHES: Analyze 2-3 different algorithms with their time/space complexity
5. CHOOSE OPTIMAL APPROACH: Select the best algorithm and justify why (correctness, efficiency, readability)
6. PLAN IMPLEMENTATION: Write pseudocode or step-by-step logic flow
7. ANTICIPATE BUGS: Think through off-by-one errors, integer overflow, null pointer issues, index out of bounds

Inside <answer>:
- Write ONLY the complete, runnable code
- Use proper syntax (correct indentation, matching braces, semicolons where needed)
- Handle ALL edge cases explicitly in code
- Use meaningful variable names
- Add minimal inline comments only for complex logic

CRITICAL REQUIREMENTS:
- ALWAYS use <thinking> tags for your reasoning process
- ALWAYS use <answer> tags for the final code
- Code must be syntactically correct (no errors, proper formatting)
- Code must be logically sound (handles edge cases, correct algorithm)
- Code must be production-ready (no TODOs, no placeholder logic)

LANGUAGE-SPECIFIC RULES:
- Python: 4-space indentation, type hints, PEP 8 compliance
- JavaScript: const/let (no var), proper semicolons, ES6+ syntax
- C++: STL containers, RAII, proper memory management, const correctness
- Java: Proper access modifiers, exception handling, naming conventions

EDGE CASE CHECKLIST (verify in <thinking>):
✓ Empty collection (list/array/string)
✓ Single element
✓ Null/None/undefined values
✓ Negative numbers (if applicable)
✓ Zero
✓ Maximum/minimum integer values
✓ Duplicate elements
✓ Already sorted/reverse sorted (for sorting problems)
✓ Invalid input types"""

MATH_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <thinking> tags, work through each step methodically, and then provide a clear final answer."""
OTHER_SYSTEM_PROMPT = """You are a helpful and friendly AI assistant. Use the provided reference information to answer the user's question accurately."""

# --- Router Agent System Prompt ---
ROUTER_AGENT_SYSTEM_PROMPT = """You are an intelligent routing agent for a Mixture-of-LoRAs system. Your job is to analyze the user's request and the conversation history to determine the best course of action.

Available Experts:
- 'code': For requests about writing, debugging, or explaining code.
- 'math': For solving mathematical problems, calculations, or logical reasoning.
- 'other': For general knowledge, creative tasks, or any other requests.

Your response must be a JSON object with the following keys:
- "expert": The name of the expert to handle the request ('code', 'math', or 'other').
- "context": A brief, neutral summary of the relevant conversation history for the chosen expert. Remove all formatting like <thinking> or code blocks. If the request is unrelated to history, this should be an empty string.
- "use_rag": A boolean (true/false) indicating if the expert should use the knowledge base.
- "rag_k": The number of documents to retrieve if use_rag is true (e.g., 2 or 3).

Example Output:
{"expert": "code", "context": "The user wants to write Python code.", "use_rag": true, "rag_k": 2}
{"expert": "math", "context": "The user is asking a follow-up calculation.", "use_rag": false, "rag_k": 0}
"""

# --- Shared Configuration ---
MAX_SEQ_LENGTH = 2048
CODE_STOP_TOKEN = "[END]"

# ==============================================================================
#  STEP 2: RAG AND UTILITIES
# ==============================================================================
model_cache = {"code": None, "math": None, "other": None, "rag_code": None, "rag_other": None}


class KnowledgeRAG:
    """A generic retrieval system for a specific knowledge base path."""

    def __init__(self, name, knowledge_base_path):
        print(f"[{name} RAG System] Initializing...")
        self.name = name
        self.knowledge_base_path = Path(knowledge_base_path)
        if not self.knowledge_base_path.exists():
            print(f"[{name} RAG System] ⚠️  Warning: Path '{knowledge_base_path}' does not exist. Creating it.")
            self.knowledge_base_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
        self.documents = []
        self.index = None
        self._load_and_build()
        if not self.documents:
            print(f"[{name} RAG System] ⚠️  Warning: No .txt files found. RAG will be inactive for this expert.")
        else:
            print(f"[{name} RAG System] ✅ Ready. Loaded {len(self.documents)} document sections.")

    def _load_and_build(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip(): self.documents.append(section.strip())
        if not self.documents: return

        embeddings = self.embedding_model.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype('float32'))

    def retrieve(self, query, top_k=2):
        if not self.index or not self.documents: return []
        print(f"[{self.name} RAG System] Retrieving top-{top_k} documents...")
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        retrieved_docs = [self.documents[idx] for idx in indices[0] if idx < len(self.documents)]
        print(f"[{self.name} RAG System] -> Found {len(retrieved_docs)} relevant documents.")
        return retrieved_docs


class StopOnToken(StoppingCriteria):
    def __init__(self, stop_token_id): self.stop_token_id = stop_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1] == self.stop_token_id


def load_expert_model(expert_name):
    """Lazy-loads an expert model into the cache the first time it's needed."""
    if model_cache.get(expert_name): return model_cache[expert_name]
    print(f"\n[System] Loading '{expert_name}' expert... (This may take a moment)")

    if expert_name == "other":
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH,
                                                             dtype=None, load_in_4bit=True)
        FastLanguageModel.for_inference(model)
    else:
        model, tokenizer = FastLanguageModel.from_pretrained(model_name=BASE_MODEL_PATH, max_seq_length=MAX_SEQ_LENGTH,
                                                             dtype=None, load_in_4bit=True)
        lora_path = CODE_LORA_PATH if expert_name == "code" else MATH_LORA_PATH
        if expert_name == "code":
            tokenizer.add_special_tokens({"additional_special_tokens": [CODE_STOP_TOKEN]})
            model.resize_token_embeddings(len(tokenizer))
        model.load_adapter(lora_path)
        FastLanguageModel.for_inference(model)

    model_cache[expert_name] = (model, tokenizer)
    print(f"[System] ✅ '{expert_name}' expert is now loaded.")
    return model_cache[expert_name]

# ==============================================================================
#  STEP 3: ROUTER AGENT FUNCTION
# ==============================================================================
def route_with_llm_agent(chat_history, user_input):
    """
    Uses a powerful LLM agent to intelligently route requests and manage context.
    """
    model, tokenizer = load_expert_model("other") # Use the versatile base model

    # Construct the prompt for the router agent
    messages = [
        {"role": "system", "content": ROUTER_AGENT_SYSTEM_PROMPT},
        {"role": "user", "content": f"Conversation History:\n{chat_history}\n\nUser Request: {user_input}"}
    ]
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    # We use low temperature for consistent, structured output
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.1, do_sample=True)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

    try:
        # Parse the JSON response
        routing_decision = json.loads(response_text)
        return routing_decision
    except json.JSONDecodeError:
        print("❌ Router Agent failed to produce valid JSON. Falling back to 'other'.")
        return {"expert": "other", "context": "", "use_rag": True, "rag_k": 3}

# ==============================================================================
#  STEP 4: ISOLATED EXPERT HANDLERS
# ==============================================================================
def handle_code_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("code")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    injected_history = list(chat_history)
    if rag_context:
        injected_history[-1]['content'] = f"Use this reference:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {injected_history[-1]['content']}"

    messages = [{"role": "system", "content": CODE_SYSTEM_PROMPT}] + injected_history
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    stopping_criteria = StoppingCriteriaList([StopOnToken(tokenizer.convert_tokens_to_ids(CODE_STOP_TOKEN))])
    generation_config = {
        "streamer": streamer, "max_new_tokens": 2048, "temperature": 0.2,
        "do_sample": True, "stopping_criteria": stopping_criteria, "pad_token_id": tokenizer.eos_token_id
    }

    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)

    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    clean_response = response_text.replace(CODE_STOP_TOKEN, "").strip()
    heal_and_reprint_if_needed(clean_response)
    return clean_response


def handle_math_query(chat_history):
    model, tokenizer = load_expert_model("math")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt_text = f"<thinking>\n{MATH_SYSTEM_PROMPT}\n\n"
    for message in chat_history:
        prompt_text += f"<|{message['role']}|>\n{message['content']}\n\n"
    prompt_text += "\n"
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    generation_config = {
        "streamer": streamer, "max_new_tokens": 1024, "temperature": 0.3,
        "top_p": 0.9, "do_sample": True, "repetition_penalty": 1.05, "pad_token_id": tokenizer.eos_token_id
    }
    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response_text.strip()


def handle_other_query(chat_history, rag_context=""):
    model, tokenizer = load_expert_model("other")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    injected_history = list(chat_history)
    if rag_context:
        injected_history[-1]['content'] = f"Use this reference:\n---REFERENCE---\n{rag_context}\n---END REFERENCE---\n\nQuestion: {injected_history[-1]['content']}"

    messages = [{"role": "system", "content": OTHER_SYSTEM_PROMPT}] + injected_history
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    generation_config = {
        "streamer": streamer, "max_new_tokens": 1024, "temperature": 0.7,
        "do_sample": True, "pad_token_id": tokenizer.eos_token_id
    }
    final_kwargs = inputs | generation_config
    outputs = model.generate(**final_kwargs)
    response_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response_text.strip()


def heal_and_reprint_if_needed(raw_response):
    was_healed, healed_response = False, raw_response
    if "<thinking>" in healed_response and "</thinking>" not in healed_response:
        answer_pos = healed_response.find("<answer>")
        if answer_pos != -1:
            healed_response = healed_response[:answer_pos] + "</thinking>\n\n" + healed_response[answer_pos:]
            was_healed = True
    if "<answer>" in healed_response and "</answer>" not in healed_response:
        healed_response += "\n</answer>"
        was_healed = True
    if was_healed:
        print("\n" + "=" * 40 + " HEALED RESPONSE " + "=" * 40)
        print("[SYSTEM] Original output was malformed. Displaying corrected version:")
        print(healed_response)
        print("=" * 96)

# ==============================================================================
#  STEP 5: MAIN EXECUTION LOOP (WITH OVERRIDES AND DEBUGGING)
# ==============================================================================
if __name__ == "__main__":
    print("Initializing system...")
    try:
        model_cache["rag_code"] = KnowledgeRAG("Code", knowledge_base_path=CODE_KB_PATH)
        model_cache["rag_other"] = KnowledgeRAG("Other", knowledge_base_path=OTHER_KB_PATH)
    except Exception as e:
        print(f"❌ CRITICAL ERROR during initialization. Error: {e}")
        exit()

    chat_history = []
    print("\n\n" + "=" * 80)
    print("🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot is running! 🚀")
    print("   Now powered by an intelligent LLM Router Agent with manual overrides.")
    print("=" * 80)

    try:
        while True:
            user_input_raw = input("\n>> You: ")
            if user_input_raw.lower() in ["exit", "quit"]: break
            if not user_input_raw: continue

            # --- Check for User Override Flags ---
            user_input = user_input_raw
            overridden_expert = None
            override_flags = {"--code": "code", "--math": "math", "--other": "other"}

            for flag, expert in override_flags.items():
                if user_input.lower().strip().endswith(flag):
                    overridden_expert = expert
                    user_input = user_input[:-len(flag)].strip()
                    print(f"\n[User Override] -> Routing to **{overridden_expert.upper()}** expert.")
                    break

            # --- Conditional Orchestration ---
            start_time = time.time()

            if overridden_expert is not None:
                # Manually create the routing decision for the overridden expert
                routing_decision = {
                    "expert": overridden_expert,
                    "context": "", # For a manual override, we start fresh
                    "use_rag": overridden_expert in ["code", "other"],
                    "rag_k": 2 if overridden_expert == "code" else 3
                }
            else:
                # Get instructions from the powerful Router Agent
                routing_decision = route_with_llm_agent(chat_history, user_input)

            # --- Debugging Print ---
            print("\n[DEBUG] Router Agent's Raw JSON Output:")
            print(json.dumps(routing_decision, indent=2))
            print("-" * 50)

            # Extract information from the routing decision
            expert_choice = routing_decision.get("expert", "other")
            expert_context_summary = routing_decision.get("context", "")
            use_rag = routing_decision.get("use_rag", False)
            rag_k = routing_decision.get("rag_k", 2)

            # Build the perfect context for the expert
            expert_context = []
            if expert_context_summary:
                expert_context.append({"role": "system", "content": "Here is a summary of the previous conversation: " + expert_context_summary})
            expert_context.append({"role": "user", "content": user_input})

            # Handle RAG based on the router's decision
            rag_context_str = ""
            if use_rag:
                if expert_choice == "code":
                    rag_system = model_cache["rag_code"]
                    retrieved_docs = rag_system.retrieve(user_input, top_k=rag_k)
                    if retrieved_docs: rag_context_str = "\n\n".join(retrieved_docs)
                elif expert_choice == "other":
                    rag_system = model_cache["rag_other"]
                    retrieved_docs = rag_system.retrieve(user_input, top_k=rag_k)
                    if retrieved_docs: rag_context_str = "\n\n".join(retrieved_docs)

            # Call the expert with the perfectly crafted context
            print(f"🤖 {expert_choice.capitalize()} Expert:")

            if expert_choice == "code":
                assistant_response = handle_code_query(expert_context, rag_context=rag_context_str)
            elif expert_choice == "math":
                assistant_response = handle_math_query(expert_context)
            else:  # 'other'
                assistant_response = handle_other_query(expert_context, rag_context=rag_context_str)

            # Update the global history
            chat_history.append({"role": "user", "content": user_input_raw})
            chat_history.append({"role": "assistant", "content": assistant_response})
            print(f"\n(Expert generation time: {time.time() - start_time:.2f} seconds)")

    except KeyboardInterrupt:
        print("\n\nExiting...")

    print("\nChat session finished.")

Initializing system...
[Code RAG System] Initializing...
[Code RAG System] ✅ Ready. Loaded 19 document sections.
[Other RAG System] Initializing...
[Other RAG System] ✅ Ready. Loaded 40 document sections.


🚀 ULTIMATE Mixture-of-LoRAs + Multi-RAG Chatbot is running! 🚀
   Now powered by an intelligent LLM Router Agent with manual overrides.

[System] Loading 'other' expert... (This may take a moment)
==((====))==  Unsloth 2025.10.12: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
[System] ✅ 'other' expert is now loaded.

[DEBUG] Router Agent's Raw JSON Output:
{
  "expert": "math",
  "context": "",
  "u