In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.prompts import ChatPromptTemplate

from langchain.schema import Document 
# from dotenv import load_dotenv 
from langchain_community.chat_models import ChatOpenAI
import os 
import shutil 
import ollama

In [3]:
PDFS_DIR = "pdfs"
def load_documents():
  document_loader = PyPDFDirectoryLoader(PDFS_DIR) 
  return document_loader.load() 


In [4]:
def split_text(documents: list[Document]):

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, 
    chunk_overlap=50,
    length_function=len, 
    add_start_index=True,
  )

  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  return chunks 




In [5]:
CHROMA_PATH = "chromadb"

def save_to_chroma(chunks: list[Document]):
  if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

  db = Chroma.from_documents(
    chunks,
    embeddings.OllamaEmbeddings(model='llama3'),
    persist_directory=CHROMA_PATH
  )

  db.persist()
  print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


In [6]:
def initialize_data_store():
  documents = load_documents() 
  chunks = split_text(documents) 
  save_to_chroma(chunks) 

initialize_data_store()


Split 2 documents into 28 chunks.
Saved 28 chunks to chromadb.


  warn_deprecated(


In [7]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

In [12]:
def query_rag(query_text):

  embedding_function = embeddings.OllamaEmbeddings(model='llama3')

  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
  
  results = db.similarity_search_with_relevance_scores(query_text, k=3)

  if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")


  context_text = "\n\n - -\n\n".join([doc.page_content for doc, _score in results])
 

  prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
  prompt = prompt_template.format(context=context_text, question=query_text)
  

  response = ollama.chat(
    model='llama3',
    messages=[
        {
          'role': 'user',
          'content':prompt,
        },
    ],
  )

  response_text = response['message']['content']

  sources = [doc.metadata.get("source", None) for doc, _score in results]
 
  formatted_response = f"Response: \n{response_text}\nSources: {sources}"
  return formatted_response, response_text


In [11]:
query = input("Enter your question: ")
formatted_response, response_text = query_rag(query)
print(formatted_response)
  



Unable to find matching results.
Response: Unfortunately, there is no mention of a new email address for the company in the provided context. The text only discusses rebranding, business strategy, loan products, and partnerships with banks and financial institutions, but does not provide any information about an updated email address.
Sources: ['pdfs/9d5b6cde-ba1a-46e0-8d4c-5a13d7bd16fd.pdf', 'pdfs/9d5b6cde-ba1a-46e0-8d4c-5a13d7bd16fd.pdf', 'pdfs/9d5b6cde-ba1a-46e0-8d4c-5a13d7bd16fd.pdf']
