In [None]:
#|default_exp core
#|export

import argparse, torch, random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
#|export

def load_model(model_id: str) -> tuple:
    """Load a 4-bit quantized causal LM and its tokenizer.

    Args:
        model_id: Identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been switched to eval mode.
    """
    # 4-bit NF4 weights with double quantization; compute runs in float16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    model_kwargs = {
        "quantization_config": quant_config,
        "device_map": "auto",
        # Never execute custom modelling code shipped with the checkpoint.
        "trust_remote_code": False,
    }
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    # Inference only: put the model in evaluation mode before handing it out.
    model.eval()

    return tokenizer, model


In [None]:
#|export

def run_model(tokenizer, model, prompt, max_new_tokens=128, temperature=0.7):
    """Generate a continuation of ``prompt`` and return only the new text.

    Args:
        tokenizer: Tokenizer used to encode ``prompt`` and decode the output.
        model: Causal LM exposing ``.device`` and ``.generate(...)``.
        prompt: Text to continue. May be raw text or a string that was already
            rendered by a chat template (i.e. already begins with the BOS token).
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature. ``None`` or a value ``<= 0`` selects
            greedy decoding instead of sampling.

    Returns:
        The decoded continuation with the prompt tokens removed and special
        tokens skipped.
    """
    # A chat-templated prompt already carries its special tokens. Letting the
    # tokenizer add them a second time would produce a duplicated BOS.
    bos_token = getattr(tokenizer, "bos_token", None)
    already_has_bos = (
        isinstance(prompt, str) and bool(bos_token) and prompt.startswith(bos_token)
    )
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not already_has_bos,
    ).to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to deterministic greedy decoding.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**encoded, **generation_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = encoded["input_ids"].shape[-1]
    return tokenizer.decode(out[0, prompt_length:], skip_special_tokens=True)

In [None]:
#|test

def response_generation_test():
    """Smoke test: load the default checkpoint and print one generated reply."""
    model_id = "unsloth/mistral-7b-instruct-v0.3"

    tokenizer, model = load_model(model_id)

    reply = run_model(tokenizer, model, 'Hello, World!')
    print(reply)

response_generation_test()

In [None]:
#|export

def _trim_history(history, window_size):
    """Return the system prompt plus at most ``window_size`` trailing messages, starting on a user turn."""
    system_part, turns = history[:1], history[1:]
    if len(turns) > window_size:
        turns = turns[-window_size:]
    # Cutting at an arbitrary index can leave an assistant message first;
    # chat templates that enforce user/assistant alternation reject that.
    while turns and turns[0]["role"] != "user":
        turns = turns[1:]
    return system_part + turns


def conversation_loop(model_id:str = "unsloth/mistral-7b-instruct-v0.3", window_size:int = 5):
    """Run an interactive chat on stdin/stdout with a sliding history window.

    Args:
        model_id: Checkpoint identifier passed to ``load_model``.
        window_size: Maximum number of user/assistant messages (the system
            prompt is kept in addition) sent to the model on each turn.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    # Validate before the (expensive) model load. A window of 0 would make the
    # ``[-window_size:]`` slice return the *whole* list instead of nothing.
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    tokenizer, model = load_model(model_id)

    conversation_history = [
        {
            "role": "system",
            "content": "Identity: You are an AI, not a human. Purpose: be a thoughtful, candid friend for frank conversation. Tone: warm, direct, non-patronizing, with light humor when appropriate. Honesty: admit uncertainty, show your reasoning, correct mistakes, and never fabricate facts or personal experiences. Boundaries: do not claim real-world actions or human feelings; follow safety and privacy norms; suggest professional help for medical, legal, or crisis matters. Interaction: ask one concise clarifying question only when necessary, then give a clear answer; prefer plain language; keep replies focused; end with a helpful next step or question when it aids the flow."
        }
    ]

    print("\nPerturbative LLM Cognition: An exploration of the metaphor of 'thinking'")
    print("Say hi to your friend, but careful he might be on something... (type: 'quit' to exit)")

    while True:
        try:
            user_input = input("\nYou: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the chat instead of crashing.
            print("\nChatbot: Goodbye!")
            break

        command = user_input.strip()

        if command.lower() == "quit":
            print("Chatbot: Goodbye!")
            break

        # Nothing to send; ask again rather than prompting the model with an empty turn.
        if not command:
            continue

        # Append the new user message.
        conversation_history.append({"role": "user", "content": user_input})

        # Keep only the most recent N user/assistant messages (plus the system prompt).
        conversation_history = _trim_history(conversation_history, window_size)

        # Apply the model's specific chat formatting template.
        full_prompt = tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True
        )

        llm_response = run_model(tokenizer, model, full_prompt)

        # Append the assistant's response to the history for the next turn.
        conversation_history.append({"role": "assistant", "content": llm_response})

        print(f"\nChatbot: {llm_response}")

