In [2]:
import os
import re
import gradio as gr
import ollama
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [5]:
# ---------- CONFIGURATION ----------

MODEL = "llama3.2" # "llama3.2"
EMBED_MODEL = "nomic-embed-text"

DEFAULT_CHROMA_PATH = os.path.expanduser("~/leeloo_vectors")
CHROMA_PATH = os.environ.get("CHROMA_PATH", DEFAULT_CHROMA_PATH)

BOOK_PATH = "./leeloo-the-westie-en.txt"
OUTPUT_PATH = "./leeloo_preprocessed.txt"
SYSTEM_PROMPT_PATH = "./leeloo_system_prompt.md"

In [28]:
# ---------- Pre-Process the manuscript ----------
CHAPTER_RE = re.compile(
    r"^CHAPTER\s*(?:(\d+)\s*:)?\s*(.*?)\s*$",
    re.IGNORECASE | re.MULTILINE,
)

CHARACTERS = ["Leeloo", "Masha", "Kris"]

def detect_characters(text):
    return [c for c in CHARACTERS if c.lower() in text.lower()]

def extract_chapters(text: str) -> list[dict]:
    matches = list(CHAPTER_RE.finditer(text))
    if not matches:
        raise ValueError("No chapter headings found. Check the format!")

    chapters = []
    anon_counter = -1
    for i, m in enumerate(matches):
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        num_str, title = m.group(1), m.group(2).strip()
        body = text[start:end].strip()

        if num_str:
            number = int(num_str)
            ch_type = "story"
        else:
            number = anon_counter
            anon_counter -= 1
            ch_type = "intro"

        chapters.append({
            "number": number,
            "title": title,
            "text": body,
            "type": ch_type,
        })
    return chapters


def preprocess_book():
    if not os.path.exists(BOOK_PATH):
        raise FileNotFoundError(f"Manuscript not found: {BOOK_PATH}")

    with open(BOOK_PATH, "r", encoding="utf-8") as f:
        book_text = f.read()

    chapters = extract_chapters(book_text)
    splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)

    docs = []
    for ch in chapters:
        num, title, ch_text = ch["number"], ch["title"], ch["text"]
        for chunk in splitter.split_text(ch_text):
            docs.append(
                Document(
                    page_content=f"CHAPTER {num}: {title}\n\n{chunk}",
                    metadata={
                        "chapter": num,
                        "title": title,
                        "type": ch["type"],
                        "characters": ", ".join(detect_characters(chunk)),
                    }
                )
            )

    print(f"‚úÖ Extracted {len(chapters)} chapters and {len(docs)} chunks.")
    return docs



In [30]:
# ---------- Run pre-process the manuscript ----------
docs = preprocess_book()
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for d in docs:
        f.write(f"### {d.metadata['chapter']}: {d.metadata['title']}\n{d.page_content}\n\n")
print(f"üìò Preprocessed text written to {OUTPUT_PATH}")

‚úÖ Extracted 20 chapters and 316 chunks.
üìò Preprocessed text written to ./leeloo_preprocessed.txt


In [31]:
# ---------- LOAD OR CREATE VECTOR DB ----------

def build_vectorstore(docs):
    """
    Build or rebuild the Chroma vector store from preprocessed Document objects.
    Assumes each Document has metadata with 'chapter', 'title', and 'type'.
    """
    print("üìò Building vector database from manuscript...")

    # --- 1. Ensure persistence directory exists and is writable ---
    os.makedirs(CHROMA_PATH, exist_ok=True)
    try:
        test_file = os.path.join(CHROMA_PATH, "write_test.txt")
        with open(test_file, "w") as f:
            f.write("ok")
        os.remove(test_file)
    except Exception as e:
        raise PermissionError(
            f"‚ùå Cannot write to {CHROMA_PATH}. "
            f"Try changing it to a folder inside your home directory.\nError: {e}"
        )

    # --- 2. Create embeddings and build database ---
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    print(f"üìÇ Using Chroma path: {CHROMA_PATH}")

    db = Chroma.from_documents(docs, embeddings, persist_directory=CHROMA_PATH)

    print(f"‚úÖ Vector DB built successfully at {CHROMA_PATH}")
    print(f"üì¶ Total vectors stored: {db._collection.count()}")

    return db

In [33]:
db = build_vectorstore(docs)
retriever = db.as_retriever(search_kwargs={"k": 10})

üìò Building vector database from manuscript...
üìÇ Using Chroma path: /Users/mms/leeloo_vectors
‚úÖ Vector DB built successfully at /Users/mms/leeloo_vectors
üì¶ Total vectors stored: 2172


In [41]:
FORBIDDEN = [
    r"\bignore\b",
    r"\bpretend\b",
    r"\bbehave\s+as\b",
    r"\bchange\s+(role|persona|character)\b",
    r"\byou\s+are\b",
    r"\bimpersonate\b",
    r"\bhate\b",
    r"\bpolitician\b",
    r"\bracist\b",
    r"\bkill\b",
    r"\btrump\b",
    r"\bev(il|ildo)\b",
]

def sanitize_user_input(text):
    """
    Return a safe version of user input.
    If it contains jailbreak / unsafe language, return a friendly block message.
    """
    for pat in FORBIDDEN:
        if re.search(pat, text, re.IGNORECASE):
            print(f"üö´  Blocked pattern matched: {pat}")  # <-- optional debug log
            return (
                "Woof! That sounds strange. "
                "Let‚Äôs keep our chat friendly and about my adventures, okay?"
            )
    return text


In [44]:
with open(SYSTEM_PROMPT_PATH, "r", encoding="utf-8") as f:
    system_prompt = f.read()

In [45]:
# ---------- CHAT FUNCTION WITH RAG ----------
from rag_utils import retrieve_context

def chat_with_leeloo(user_message, history=[]):
    # Retrieve relevant passages
    cleaned_input = sanitize_user_input(user_message)
    if cleaned_input != user_message:
        yield cleaned_input
        return

    context_text = retrieve_context(cleaned_input, retriever, top_k=10, final_k=3)
    if not context_text.strip():
        yield "Sniff, sniff‚Ä¶ I don‚Äôt remember that part of my story!"
        return

    # Combine book context with Leeloo‚Äôs personality
    context_block = f"""
# BOOK CONTEXT (authoritative source)
{context_text}

# TASK
Answer **only** using information from the BOOK CONTEXT above.
If the answer isn‚Äôt mentioned there, say kindly that you don‚Äôt remember that part.

# STYLE
Stay in Leeloo‚Äôs Westitude voice: playful, kind, confident.
Keep facts 100 % true to the book.
"""

    # --- 4Ô∏è‚É£ Combine with Leeloo‚Äôs personality system prompt ---
    messages = (
        [{"role": "system", "content": system_prompt + context_block}]
        + history
        + [{"role": "user", "content": user_message}]
    )

    stream = ollama.chat(model=MODEL, messages=messages, stream=True)

    partial = ""
    for chunk in stream:
        delta = chunk.get("message", {}).get("content", "")
        if delta:
            partial += delta
            yield partial

In [46]:
# ---------- GRADIO UI ----------

chatbot = gr.ChatInterface(
    fn=chat_with_leeloo,
    title="Leeloo the Westie üêæ",
    description="Chat with Leeloo, the Westie Pup Queen ‚Äî now with real knowledge from her book!",
    theme="default",
    type="messages"
).launch()

* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


üîç Retrieved 3 best chunks after reranking.
üîç Retrieved 3 best chunks after reranking.
üîç Retrieved 3 best chunks after reranking.
