In [None]:
import os
# Auto-generated setup for portability
if 'google.colab' in str(get_ipython()):
    # Assume data is mounted or downloaded to current dir in Colab
    BASE_DIR = os.getcwd()
else:
    # Local execution
    BASE_DIR = os.getcwd()


In [2]:
#!/usr/bin/env python3
"""
chat_thinking_model_FIXED.py - Working chat interface with patch
"""
import unsloth
# PATCH FIRST - This fixes the num_logits_to_keep error
import transformers.generation.utils as gen_utils

original_prepare = gen_utils.GenerationMixin._prepare_generation_config

def patched_prepare_generation_config(self, generation_config, use_model_defaults, **kwargs):
    config, model_kwargs = original_prepare(self, generation_config, use_model_defaults, **kwargs)
    model_kwargs.pop('num_logits_to_keep', None)  # Remove the problematic parameter
    return config, model_kwargs

gen_utils.GenerationMixin._prepare_generation_config = patched_prepare_generation_config
print("✅ Applied num_logits_to_keep fix")

# NOW your original imports and code
from unsloth import FastLanguageModel
import torch

# Path to your thinking model
THINKING_MODEL_PATH = "Section-C/Elite-Math-Thinking-Merged"

# Same system prompt used in training
THINKING_SYSTEM_PROMPT = """You are a helpful assistant who thinks step by step through problems. When solving questions, show your reasoning process clearly using <think> tags, work through each step methodically, and then provide a clear final answer."""

def load_model():
    """Load the thinking model"""
    print("🧠 Loading Elite Math + Thinking model...")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=THINKING_MODEL_PATH,
        max_seq_length=1536,
        dtype="bfloat16",
        load_in_4bit=True,
        device_map={"": 0},
        trust_remote_code=True,
    )

    model = FastLanguageModel.for_inference(model)
    print("✅ Model ready!")
    return model, tokenizer

def chat():
    """Simple chat interface"""
    print("🎯 ELITE MATH + THINKING MODEL CHAT")
    print("Type 'quit' or 'exit' to stop")
    print("=" * 50)

    model, tokenizer = load_model()

    while True:
        # Get user input
        question = input("\n🤔 You: ").strip()

        if question.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break

        if not question:
            continue

        # Format with thinking system prompt
        full_prompt = f"<|system|>\n{THINKING_SYSTEM_PROMPT}\n\n<|user|>\n{question}\n\n<|assistant|>\n"

        # Generate response
        inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=0.3,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.05,
                pad_token_id=tokenizer.eos_token_id,
            )

        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)

        print(f"\n🤖 Model: {response}")
        print("-" * 50)

if __name__ == "__main__":
    try:
        chat()
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"❌ Error: {e}")


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
✅ Applied num_logits_to_keep fix
🎯 ELITE MATH + THINKING MODEL CHAT
Type 'quit' or 'exit' to stop
🧠 Loading Elite Math + Thinking model...
Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.10 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


✅ Model ready!

🤖 Model: <think>Okay, the user just sent a greeting message. Let me check if I need to respond. Since the greeting is a common way to start a conversation, I should probably reply with something friendly but not too long. Maybe a simple "Hello" or a more personalized greeting if possible. Wait, the user didn't specify any context or topic. So I should keep it general. Hmm, maybe a question to keep the conversation going? Like "How's your day?" or "What's new?" That way, I'm engaging without being too pushy. Yeah, that's a good approach. Also, I don't want to overdo it with too many words. Keep it short and sweet. Let me check the tone. The user started with "Hi," so maybe a casual tone. Okay, let me draft a response. "Hi! How's your day going?" That's friendly and open-ended. No need to add anything else. Yep, that's perfect.</think>
<answer>Hello! How's your day going?</answer>
--------------------------------------------------

👋 Goodbye!


In [5]:
import torch
from unsloth import FastLanguageModel
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

class CodeKnowledgeRAG:
    """Retrieval system over your knowledge base."""
    def __init__(self, knowledge_base_path="./knowledge_base"):
        self.knowledge_base_path = Path(knowledge_base_path)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents = []
        self.document_embeddings = None
        self.index = None
        self._load_knowledge_base()
        self._build_index()

    def _load_knowledge_base(self):
        kb_files = sorted(self.knowledge_base_path.glob("*.txt"))
        for file_path in kb_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            sections = content.split('\n\n')
            for section in sections:
                if section.strip():
                    self.documents.append({'source': file_path.name, 'content': section.strip()})

    def _build_index(self):
        doc_texts = [doc['content'] for doc in self.documents]
        self.document_embeddings = self.embedding_model.encode(doc_texts, show_progress_bar=False, convert_to_numpy=True)
        dim = self.document_embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(self.document_embeddings.astype('float32'))

    def retrieve_relevant_context(self, query, top_k=3):
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True).astype('float32')
        distances, indices = self.index.search(query_embedding, top_k)
        relevant_docs = []
        for idx in indices[0]:
            if idx < len(self.documents):
                relevant_docs.append(self.documents[idx]['content'])
        return relevant_docs


class RAGChatAssistant:
    """Chat assistant with RAG and enforced stepwise reasoning."""
    def __init__(self, model_path, knowledge_base_path="./knowledge_base"):
        print("Initializing RAG Chat Assistant...")
        self.rag = CodeKnowledgeRAG(knowledge_base_path)
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_path,
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True,
            device_map="auto",
        )
        FastLanguageModel.for_inference(self.model)

        # Strong system prompt encouraging stepwise thinking with <think> tags
        self.system_prompt = (
            "You are a helpful assistant who thinks step by step through problems. "
            "Always show your reasoning clearly using <think> tags, work through each step "
            "methodically, and then provide a clear final answer. When you incorporate any reference information, "
            "explain how it relates inside <think> tags."
        )

    def chat(self):
        print("Chat Started. Type 'exit' to quit.")
        chat_history = []

        while True:
            user_input = input("User: ").strip()
            if user_input.lower() in {"exit", "quit"}:
                print("Goodbye!")
                break

            # Detect if query is code-related to activate RAG retrieval
            code_keywords = ["code", "python", "list", "dict", "algorithm", "function", "set", "file", "json", "search"]
            use_rag = any(kw in user_input.lower() for kw in code_keywords)

            context = ""
            if use_rag:
                relevant_docs = self.rag.retrieve_relevant_context(user_input, top_k=2)
                if relevant_docs:
                    context = "\n\n".join(f"Reference:\n{doc}" for doc in relevant_docs)

            # Build chat messages as list of dicts preserving history
            messages = [{"role": "system", "content": self.system_prompt}]
            for (u, a) in chat_history:
                messages.append({"role": "user", "content": u})
                messages.append({"role": "assistant", "content": a})
            prompt_text = context + "\n\n" + user_input if context else user_input
            messages.append({"role": "user", "content": prompt_text})

            # Tokenize prompt for model
            prompt = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(self.model.device)

            # Generate response
            with torch.inference_mode():
                output_ids = self.model.generate(
                    input_ids=prompt,
                    max_new_tokens=1536,
                    temperature=0.2,
                    top_p=0.95,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.pad_token_id,
                )
            response_text = self.tokenizer.decode(output_ids[0][prompt.shape[1]:], skip_special_tokens=True).strip()

            # Enforce <think> tags if missing for reasoning queries
            reasoning_keywords = ["how", "why", "explain", "code", "python", "algorithm", "calculate"]
            if "<think>" not in response_text and any(kw in user_input.lower() for kw in reasoning_keywords):
                response_text = f"<think>{response_text}</think>"

            print(f"Assistant: {response_text}")

            chat_history.append((user_input, response_text))


if __name__ == "__main__":
    MODEL_PATH = "Final-Dynamic-Model/final_model(Math)"
    KNOWLEDGE_BASE_PATH = "./knowledge_base"

    assistant = RAGChatAssistant(MODEL_PATH, KNOWLEDGE_BASE_PATH)
    assistant.chat()


Initializing RAG Chat Assistant...
==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Chat Started. Type 'exit' to quit.
Assistant: <think>Okay, let's tackle this problem. The task is to find the shortest transformation sequence from a start word to an end word, where only one letter can be changed at a time, and each transformed word must exist in a given word list. All words should have the same length.

First, I need to understand the approach. The problem mentions using a stack to track matching pairs and nested structures. But here, we're dealing with a single-letter c

KeyboardInterrupt: Interrupted by user

In [15]:
import torch
from unsloth import FastLanguageModel

model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(model_name, load_in_4bit=True)
model = model.for_inference()

chat_history = [
    {"role": "system", "content": "You are a helpful assistant."}
]

def chat_step(user_input):
    chat_history.append({"role": "user", "content": user_input})

    prompt_text = tokenizer.apply_chat_template(chat_history, tokenize=False)
    tokens_str = tokenizer.tokenize(prompt_text)  # returns tokens as strings like ['▁Hello', ...]
    input_ids = tokenizer.convert_tokens_to_ids(tokens_str)  # convert to list of int token ids

    input_tensor = torch.tensor([input_ids], device=model.device)  # batch dim added

    output_tokens = model.generate(
        input_tensor,
        max_new_tokens=1536,
        temperature=0.2,
        min_p=0.1,
    )

    output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    chat_history.append({"role": "assistant", "content": output_text.strip()})
    return output_text.strip()

user_message = """Write a Python function called 'longest_valid_parentheses' that finds the length of the longest valid (well-formed) parentheses substring in a given string containing only '(' and ')' characters.

A valid parentheses string is one where:
- Every opening parenthesis '(' has a matching closing parenthesis ')'
- The parentheses are properly nested

Example inputs and expected outputs:
- longest_valid_parentheses("(()") should return 2
- longest_valid_parentheses(")()())") should return 4
- longest_valid_parentheses("") should return 0
- longest_valid_parentheses("()(()") should return 2
- longest_valid_parentheses("(()())") should return 6

Provide a complete, working implementation with clear logic. The solution should be efficient (O(n) time complexity preferred). Test your function with all the examples above and explain your approach.

"""
response = chat_step(user_message)
print("Assistant:", response)


==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 3060. Num GPUs = 1. Max memory: 11.629 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Assistant: system

Cutting Knowledge Date: December 2023
Today Date: 02 Oct 2025

You are a helpful assistant.user

Write a Python function called 'longest_valid_parentheses' that finds the length of the longest valid (well-formed) parentheses substring in a given string containing only '(' and ')' characters.

A valid parentheses string is one where:
- Every opening parenthesis '(' has a matching closing parenthesis ')'
- The parentheses are properly nested

Example inputs and expected outputs:
- longest_valid_parentheses(