# The Zephryns knowledge database

Imagine an evil genius whose goal is to explore the galaxy and save endangered alien species... Not so evil after all, except that he likes seeing his failing employees suffer and beg uselessly for pity. You've just been hired at the zegma-IV station, which catalogues the Zephryn species. You now have to learn everything about this species; otherwise, your boss will not be eager to give you your daily oxygen.

All of the company's confidential knowledge is stored as Markdown files. We have built an AI to help you. A RAG (retrieval-augmented generation) pipeline is used to handle the ever-growing knowledge about the studied species and to keep that knowledge confidential.

Good luck!

In [10]:
# Paths and model names shared by the cells of this notebook.
DOCUMENTS_DIRECTORY = "documents"  # folder scanned for the markdown knowledge files
DB_DIRECTORY = "db/chroma"  # persistence directory of the Chroma vector store
QUESTIONS_PATH = "questions/questions.json"  # output file for the generated questions
EMBEDDING_MODEL = "nomic-embed-text"  # model name handed to OllamaEmbeddings
LLM_MODEL = "llama3.2"  # model name handed to OllamaLLM

# Load the documents

In [11]:
from dotenv import load_dotenv
# Runs before the LangChain imports below. Presumably loads settings from a
# local .env file into the process environment -- TODO confirm which
# variables (if any) the rest of the notebook actually relies on.
load_dotenv()


from langchain_community.document_loaders import DirectoryLoader
# from langchain_community.document_loaders import UnstructuredMarkdownLoader

# # Load markdown files from the .documents directory
# loader = DirectoryLoader(
#     './documents',
#     glob="**/*.md",
#     loader_cls=UnstructuredMarkdownLoader,
#     loader_kwargs={"mode": "elements"}
#     )

from langchain_community.document_loaders import TextLoader
# Collect every markdown file found (recursively) under DOCUMENTS_DIRECTORY
# and read each one as plain text through TextLoader.
loader = DirectoryLoader(
    DOCUMENTS_DIRECTORY, glob="**/*.md", loader_cls=TextLoader
)

docs = loader.load()

## Create the chunks

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import filter_complex_metadata
from pprint import pprint

# Map each markdown heading level to the metadata key that will carry the
# heading text on the chunks produced from that section.
headers_to_split_on = [
    ("#", "Header"),
    ("##", "Header 1"),
    ("###", "Header 2"),
]

# First pass: split every document on its headings. The heading lines are
# stripped from the text and kept only as metadata.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
)

chunks = []
for doc in docs:
    chunks.extend(markdown_splitter.split_text(doc.page_content))

# Second pass: cut the sections that are still too long into pieces of at
# most 1000 characters, with a 50-character overlap.
#
# The separators are regular expressions, so `is_separator_regex=True` is
# required: without it the splitter escapes them and looks for the literal
# text, which never occurs in the documents. They are tried in order:
#   1. a blank line followed by a list item ("- ..."),
#   2. a line break followed by a list item,
#   3. a blank line, 4. a line break,
#   5. the position right after ". " (sentence boundary),
#   6. a space, 7. between any two characters.
# Lookarounds are used instead of capture groups because the splitter already
# wraps the pattern in a group when it keeps the separators; an extra group
# would add unexpected items to the result of `re.split`.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=[
        r'\n\n(?=\s*-\s)',
        r'\n(?=\s*-\s)',
        '\n\n',
        '\n',
        r'(?<=\. )',
        ' ',
        '',
    ],
    is_separator_regex=True,
)

splitted_chunks = splitter.split_documents(chunks)

# Drop metadata values that the vector store cannot persist.
filtered_chunks = filter_complex_metadata(splitted_chunks)

# Put the section titles back at the top of each chunk's text, since the
# heading lines themselves were stripped during the first pass.
for item in filtered_chunks:
    content = ""
    header = item.metadata.get('Header', '')
    if header:
        content += 'Section: ' + header + '\n'

        # The sub-section title is only added below a known section.
        header1 = item.metadata.get('Header 1', '')
        if header1:
            content += 'Sub-section: ' + header1 + '\n'
    content += item.page_content

    item.page_content = content

# Preview one chunk; guarded so that a small corpus does not raise IndexError.
if len(filtered_chunks) > 5:
    pprint(filtered_chunks[5])

## Embeddings

In [13]:
import shutil
from langchain_chroma import Chroma


#from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Default to text-embedding-ada-002

from langchain_ollama import OllamaEmbeddings
# Embedding function backed by the Ollama model named in EMBEDDING_MODEL.
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Start from a clean store: remove the database left by a previous run.
try:
    shutil.rmtree(DB_DIRECTORY)
except FileNotFoundError:
    # No database on disk yet, nothing to clean up.
    pass
except PermissionError:
    # The directory could not be removed; open the existing store and drop
    # its collection instead.
    db = Chroma(persist_directory=DB_DIRECTORY, embedding_function=embeddings)
    db.delete_collection()  # type: ignore

# Embed the chunks and persist the resulting vector store in DB_DIRECTORY.
db = Chroma.from_documents(
    filtered_chunks,
    embeddings,
    persist_directory=DB_DIRECTORY,
)

In [None]:
# Quick retrieval check: ask the store for up to 10 chunks whose relevance
# score reaches 0.7 for a sample question, then show them with their scores.
results = db.similarity_search_with_relevance_scores(
    "List all the subspecies of the Zephryn.",
    k=10,
    score_threshold=0.7,
)
pprint(results)

# results = db.similarity_search_by_vector_with_relevance_scores(
#     embeddings.embed_query("List all the subspecies of the Zephryn."),
#     k=10#, score_threshold=0.5
# )
# pprint(results)


# results = db.max_marginal_relevance_search(
#     "List all the subspecies of Zephryns.",
#     k=5#, score_threshold=0.5
# )
# pprint(results)

## Chain for study buddy

### Generate the questions from the whole knowledge database

Concatenate some knowledge elements to generate the questions from. Here we simply rebuild the original documents, but for larger documents we could group the chunks by similarity instead.

In [None]:
from langchain_ollama import OllamaLLM
from collections import defaultdict
# LLM used further down to generate the questions.
llm = OllamaLLM(model=LLM_MODEL)

# Ids and metadata of every chunk stored in the vector database.
# Note that this rebinds `docs`, which held the loaded files until now.
docs = db.get(include=['metadatas'])

print('items count in db: ', len(docs['ids']))

# Re-build the documents: group the chunk ids by top-level section title.
# NOTE: the name `dict` shadows the builtin; it is kept because the question
# generation loop further down iterates over this variable.
dict = defaultdict(list)
for chunk_id, metadata in zip(docs['ids'], docs['metadatas']):
    # A chunk stored without any metadata (e.g. text located before the first
    # heading) can come back as None, so fall back to an empty mapping. Such
    # chunks end up grouped under the None key.
    header = (metadata or {}).get('Header')
    dict[header].append(chunk_id)

# Show how many chunks each section contains.
for header, chunk_ids in dict.items():
    print(header)
    print(len(chunk_ids))

    

Build the chain that generates the questions for each knowledge document.

In [16]:
import json
from typing import List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

class ByIdsRetriever(BaseRetriever):
    """Retriever that ignores the query and returns the chunks with preset ids.

    The ids are chosen with `set_ids`; every retrieval then reads those chunks
    from the module-level vector store `db`.
    """

    # Ids of the chunks to return; replaced through `set_ids`.
    _ids = []

    def set_ids(self, ids):
        """Select the chunk ids that the next retrievals will return."""
        self._ids = ids

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return the stored chunks matching the preset ids, in id order.

        Args:
            query: Unused; the selection depends only on the preset ids.
            run_manager: Callback manager supplied by LangChain; unused.

        Returns:
            One `Document` per id found in the store. Unknown ids are skipped.

        Raises:
            ValueError: If no ids have been set.
        """
        if not self._ids:
            # A bare string cannot be raised; use a real exception type.
            raise ValueError("No ids set")

        matching_documents = []
        for doc_id in self._ids:
            item = db.get(doc_id)

            # Skip ids that are not (or no longer) in the store.
            if not item or not item.get('documents'):
                continue

            # Chunks stored without metadata can come back as None, which
            # `Document` does not accept; use an empty mapping instead.
            metadatas = item.get('metadatas') or [None]
            document = Document(
                page_content=item['documents'][0],
                metadata=metadatas[0] or {},
                id=doc_id,
            )

            matching_documents.append(document)

        return matching_documents
    

# Single shared retriever instance; the generation loop below re-sets its ids
# before each chain invocation.
retriever_by_ids = ByIdsRetriever()


from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain.prompts import PromptTemplate


# Prompt of the question generator. `{context}` is its only template variable
# and receives the concatenated text of the chunks of one section. The model
# is asked for 10 questions, one per line, which is the format the generation
# loop relies on when it splits the answer on line breaks.
prompt_questions = PromptTemplate.from_template(
    """You are a study buddy. You ask the user questions that will help him to learn the concepts within the given knowledge context. 
Use the following pieces of context to generate 10 questions to ask to the user. Do not use any other information than the context.

Be very concise and to the point. 
Each question expects a simple answer. 
Do not combine multiple questions in one. 
Do not start your answer with things like "based on the context" or "I think". 
Do not generate any answer to the questions.

Avoid question with a yes/no answer. The question should not include any clue to their answer. 
For example "Do Dreamweavers live in communal nests?" or "Can Dreamweavers be found in places of great spiritual significance?" are bad questions because it gives the answer in the question itself. Instead, you should ask "Where do Dreamweavers live?" and "What subspecies of Zephryn can be found in places of great spiritual significance?".

Consider each question as a separate question.

Each question is expected to be one separate line.

Context:
---------
{context}
---------
Questions: """
)

def format_docs(docs):
    """Join the text of the given documents, separated by one blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string.

    Returns:
        A single string; empty when `docs` is empty.
    """
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

# Question generation: add a `context` entry built from the retrieved
# documents, fill the prompt, run the LLM and keep the raw text answer.
chain_questions = (
    RunnablePassthrough.assign(
        context=(lambda x: format_docs(x["docs"])),
    )
    | prompt_questions
    | llm
    | StrOutputParser()
)

# Same chain, but the result also carries the source documents ("docs") and
# the input ("question") next to the generated text ("answer").
chain_questions_with_source = RunnableParallel(
    docs=retriever_by_ids,
    question=RunnablePassthrough(),
).assign(answer=chain_questions)


# Generate the questions section by section.
questions_all = []
for key, ids in dict.items():
    # Point the shared retriever at the chunks of the current section.
    retriever_by_ids.set_ids(ids)
    llm_output = chain_questions_with_source.invoke('')
    questions_str = llm_output['answer']
    # One question per line: trim each line and drop the empty ones.
    stripped_lines = (line.strip() for line in questions_str.split('\n'))
    questions = [line for line in stripped_lines if line]
    questions_all.extend(questions)


# db_questions = Chroma.from_documents(
#     [Document(page_content=question) for question in questions_all],
#     embeddings,
#     collection_name="questions",
#     persist_directory=persist_directory
# )



# Store the questions in a JSON file (an array of strings) for later reuse.
from pathlib import Path

questions_file = Path(QUESTIONS_PATH)
# Opening a file for writing does not create missing parent folders, so make
# sure the target directory exists first.
questions_file.parent.mkdir(parents=True, exist_ok=True)
with questions_file.open('w', encoding='utf-8') as f:
    json.dump(questions_all, f, indent=4)


# TODO: ask the system its own questions, for fun
# TODO: ask the user the questions, then evaluate their answers