In [6]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings

In [10]:
# Directory passed to Chroma as its persist_directory (and removed by clear_database()).
CHROMA_PATH = "chroma"
# Directory handed to PyPDFDirectoryLoader when loading the source PDFs.
DATA_PATH = "data"

In [5]:
def get_embedding_function():
    """Build the embedding model used for both indexing and querying.

    Returns:
        An ``OllamaEmbeddings`` instance configured with the
        ``nomic-embed-text`` model.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

In [4]:
def load_documents():
    """Load the PDF files found under ``DATA_PATH``.

    Returns:
        Whatever ``PyPDFDirectoryLoader.load()`` returns for that directory
        (presumably a list of ``Document`` objects -- confirm against the
        loader's documentation).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()


def split_documents(documents: list[Document]):
    """Split documents into smaller chunks.

    Args:
        documents: The documents to split.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for a
        splitter configured with ``chunk_size=800`` and ``chunk_overlap=80``,
        where length is measured with ``len``.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


def add_to_chroma(chunks: list[Document]):
    """Store the chunks whose IDs are not yet present in the Chroma database.

    Each chunk is first given an ID via ``calculate_chunk_ids`` (written to
    ``chunk.metadata["id"]``). Chunks whose ID already exists in the store are
    skipped; the rest are added and the store is persisted.

    Args:
        chunks: Chunks to index. Their ``metadata`` is modified in place.
    """
    # Open the vector store persisted at CHROMA_PATH.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # Request no extra fields: only the IDs are needed here.
    stored_items = store.get(include=[])
    known_ids = set(stored_items["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep only chunks whose ID has not been stored yet.
    pending = [
        candidate
        for candidate in tagged_chunks
        if candidate.metadata["id"] not in known_ids
    ]

    if not pending:
        print("No new documents to add")
        return

    print(f"Adding new documents: {len(pending)}")
    pending_ids = [candidate.metadata["id"] for candidate in pending]
    store.add_documents(pending, ids=pending_ids)
    store.persist()


def calculate_chunk_ids(chunks):
    """Assign every chunk an ID of the form ``source:page:chunk_index``.

    ``source`` and ``page`` are read from ``chunk.metadata`` (``None`` is used
    when a key is missing). ``chunk_index`` counts the chunks seen so far for
    the same ``source:page`` pair, starting at 0.

    The counter is kept per page rather than compared with the previous chunk
    only, so IDs stay unique even when chunks of one page are not adjacent in
    ``chunks``. For input where each page's chunks are contiguous, the IDs are
    the same as with a "previous page" comparison.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` mapping.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on every chunk.
    """
    # Next free chunk index for each "source:page" identifier.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks


def clear_database():
    """Delete the directory tree at ``CHROMA_PATH``.

    Does nothing when ``CHROMA_PATH`` does not exist.
    """
    if not os.path.exists(CHROMA_PATH):
        return
    shutil.rmtree(CHROMA_PATH)

# Run the next cell only if the database has not been built yet.

In [None]:
# Check whether the database should be cleared (using the --reset flag).
parser = argparse.ArgumentParser()
parser.add_argument("--reset", action="store_true", help="Reset the database.")
# parse_known_args() leaves unrecognised arguments alone instead of exiting.
# Inside a Jupyter kernel, sys.argv carries the kernel's own launch arguments
# (e.g. "-f <connection file>"), which makes parse_args() abort the cell with
# SystemExit.
args, _unknown_args = parser.parse_known_args()
if args.reset:
    print("Clearing Database")
    clear_database()

# Create (or update) the data store.
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

In [7]:
# Prompt filled in by query_rag(): {context} receives the joined text of the
# retrieved chunks and {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [8]:
def query_rag(query_text: str):
    """Answer a question using context retrieved from the Chroma store.

    The query is matched against the store persisted at ``CHROMA_PATH``, the
    retrieved chunk texts are inserted into ``PROMPT_TEMPLATE``, and the
    resulting prompt is sent to the ``mistral`` Ollama model. The response and
    the ``id`` metadata of the retrieved chunks are printed.

    Args:
        query_text: The question to answer.

    Returns:
        The value returned by the model's ``invoke`` call.
    """
    # Open the persisted vector store.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Ask for k=5 results; each one is unpacked as a (document, score) pair.
    matches = store.similarity_search_with_score(query_text, k=5)
    retrieved_docs = [doc for doc, _score in matches]

    # Build the prompt from the retrieved text and the question.
    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )

    response_text = Ollama(model="mistral").invoke(prompt)

    # Chunks without an "id" entry are reported as None.
    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [11]:
# Example query: query_rag() prints the response and the source chunk IDs,
# and its return value is shown as the cell output.
query_text = "What is the sustainability strategy?"
query_rag(query_text)

Response:  Based on the context provided, the sustainability strategy at ZF includes several key components. These include sustainability strategy and governance, climate and nature, people, supply chain, and product quality and safety. The Sustainability Council, made up of representatives from divisions and functions, is responsible for integrating the strategy into core processes and business activities. Regular reviews are conducted to ensure effectiveness and alignment with customer excellence. Data analytics are used to drive decision making. Compliance with relevant processes and structures is also ensured through an Internal Control System, Corporate Audit, Compliance Management, and Enterprise Risk Management.
Sources: ['data\\ZF_AnnualReport23.pdf:31:0', 'data\\ZF_AnnualReport23.pdf:64:3', 'data\\ZF_AnnualReport23.pdf:73:0', 'data\\ZF_AnnualReport23.pdf:34:1', 'data\\ZF_AnnualReport23.pdf:77:3']


' Based on the context provided, the sustainability strategy at ZF includes several key components. These include sustainability strategy and governance, climate and nature, people, supply chain, and product quality and safety. The Sustainability Council, made up of representatives from divisions and functions, is responsible for integrating the strategy into core processes and business activities. Regular reviews are conducted to ensure effectiveness and alignment with customer excellence. Data analytics are used to drive decision making. Compliance with relevant processes and structures is also ensured through an Internal Control System, Corporate Audit, Compliance Management, and Enterprise Risk Management.'