# The Zephryn knowledge database

Imagine an evil genius whose goal is to explore the galaxy and save endangered alien species... Not so evil after all, except that he likes seeing his failing employees suffer and beg uselessly for pity. You've just been hired at the zegma-IV station, which catalogs the Zephryn species. You now have to learn everything about this species. Otherwise, your boss will not be eager to give you your daily oxygen.

All of the company's confidential knowledge is stored as Markdown files. We have built an AI to help you. It uses RAG (retrieval-augmented generation) to handle the ever-growing knowledge about the studied species and to keep that knowledge confidential.

Good luck!

In [1]:
# Notebook-wide configuration.

# Filesystem locations (relative paths).
DOCUMENTS_DIRECTORY = "documents"  # folder holding the Markdown knowledge files
DB_DIRECTORY = "db/chroma"  # where the vector store is persisted
QUESTIONS_PATH = "questions/questions.json"  # not used in the cells shown here

# Model identifiers.
EMBEDDING_MODEL = "nomic-embed-text"  # model name handed to the embedding client
LLM_MODEL = "llama3.2"  # not used in the cells shown here

# Load the documents

In [2]:
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment before
# the LangChain integrations are imported.
load_dotenv()


from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Alternative loader, kept for reference: parse every file into Unstructured
# elements instead of reading it as plain text.
# from langchain_community.document_loaders import UnstructuredMarkdownLoader
#
# loader = DirectoryLoader(
#     './documents',
#     glob="**/*.md",
#     loader_cls=UnstructuredMarkdownLoader,
#     loader_kwargs={"mode": "elements"}
#     )

# Read every Markdown file under DOCUMENTS_DIRECTORY (recursively) as raw
# text, so the header syntax is still present for the header-based splitter.
loader = DirectoryLoader(
    DOCUMENTS_DIRECTORY,
    glob="**/*.md",
    loader_cls=TextLoader,
    # Decode explicitly as UTF-8. Without this, TextLoader falls back to the
    # platform's locale encoding, which can fail or corrupt non-ASCII text
    # on systems whose default is not UTF-8.
    loader_kwargs={"encoding": "utf-8"},
)

# One Document per file, with the full file content in `page_content`.
docs = loader.load()

## Create the chunks

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.utils import filter_complex_metadata
from pprint import pprint

# Markdown header levels to split on, and the metadata key each header's
# text is stored under.
headers_to_split_on = [
    ("#", "Header"),
    ("##", "Header 1"),
    ("###", "Header 2"),
]

# First pass: cut each document at its Markdown headers. The header lines are
# removed from the text (strip_headers=True) and kept in the metadata only.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
)

# NOTE(review): split_text() receives only the raw text, so the per-file
# metadata in doc.metadata is presumably not carried over to the chunks --
# confirm whether the source path is needed downstream.
chunks = []
for doc in docs:
    chunks.extend(markdown_splitter.split_text(doc.page_content))

# Second pass: cap the section size. Separators are tried in order, preferring
# to cut before a list item, then at paragraph and line breaks, then after a
# sentence, then at spaces, and finally anywhere.
#
# The separators are regular expressions, so is_separator_regex=True is
# required; by default the splitter escapes them and matches them literally.
# The list-item patterns use a lookahead rather than `^` and a capture group:
# `^` cannot match after a newline without re.MULTILINE, and an extra capture
# group would disturb the splitter's own separator bookkeeping.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=[
        r"\n\n(?=\s*-)",  # blank line followed by a "- " list item
        r"\n(?=\s*-)",    # line break followed by a "- " list item
        r"\n\n",          # paragraph break
        r"\n",            # line break
        r"(?<=\. )",      # just after the end of a sentence
        " ",
        "",
    ],
    is_separator_regex=True,
)

splitted_chunks = splitter.split_documents(chunks)


# Drop metadata values the vector store cannot persist.
filtered_chunks = filter_complex_metadata(splitted_chunks)

# The header text was stripped from the chunk body, so prepend it again as
# plain-text context for the embedding. The sub-section line is only added
# when a top-level section exists; "Header 2" is not added to the prefix.
for item in filtered_chunks:
    content = ""
    header = item.metadata.get("Header", "")
    if header:
        content += "Section: " + header + "\n"

        header1 = item.metadata.get("Header 1", "")
        if header1:
            content += "Sub-section: " + header1 + "\n"
    content += item.page_content

    item.page_content = content

# Show one sample chunk as a sanity check, without crashing on a small corpus.
if len(filtered_chunks) > 5:
    pprint(filtered_chunks[5])
else:
    print(f"Only {len(filtered_chunks)} chunk(s) produced; no chunk at index 5 to display.")

## Embeddings

In [7]:
import shutil

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Alternative embedding backend, kept for reference:
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Default to text-embedding-ada-002

# Embedding client used both to index the chunks and to embed queries.
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Rebuild the store from scratch on every run: remove whatever was persisted
# by a previous run before indexing again.
try:
    shutil.rmtree(DB_DIRECTORY)
except FileNotFoundError:
    # Nothing has been persisted yet -- nothing to clean up.
    pass
except PermissionError:
    # The directory could not be removed, so open the existing store and
    # drop its collection instead.
    db = Chroma(persist_directory=DB_DIRECTORY, embedding_function=embeddings)
    db.delete_collection()  # type: ignore

# Embed every chunk and persist the resulting vector store to DB_DIRECTORY.
db = Chroma.from_documents(
    documents=filtered_chunks,
    embedding=embeddings,
    persist_directory=DB_DIRECTORY,
)

In [None]:
# Retrieval sanity check: request up to 10 chunks for a sample question,
# passing a relevance-score threshold of 0.7, and print (chunk, score) pairs.
results = db.similarity_search_with_relevance_scores(
    "List all the subspecies of the Zephryn.",
    k=10,
    score_threshold=0.7,
)
pprint(results)

# Other retrieval variants that were tried; left disabled.
#
# Search from a pre-computed query embedding:
# results = db.similarity_search_by_vector_with_relevance_scores(
#     embeddings.embed_query("List all the subspecies of the Zephryn."),
#     k=10#, score_threshold=0.5
# )
# pprint(results)
#
# Max-marginal-relevance search:
# results = db.max_marginal_relevance_search(
#     "List all the subspecies of Zephryns.",
#     k=5#, score_threshold=0.5
# )
# pprint(results)