In [1]:
pip install -r requirement.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaLLM 
from langchain.chains import RetrievalQA


In [3]:
# Collect every PDF found directly inside the docs directory.
# - os.listdir() returns entries in arbitrary order, so the paths are sorted to keep the
#   order of `documents` (and everything derived from it) reproducible across runs.
# - The extension check is case-insensitive so files such as "Report.PDF" are not skipped.
directory_path = "docs/"
pdf_files = sorted(
    os.path.join(directory_path, file)
    for file in os.listdir(directory_path)
    if file.lower().endswith(".pdf")
)

# Load documents from multiple PDFs; each loader contributes whatever load() returns
# for its file, appended in the (sorted) file order.
documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    documents.extend(loader.load())

In [4]:
# Embed the loaded documents with the "all-MiniLM-L6-v2" model and index them in a
# Chroma store persisted under ./chroma_store; `retriever` is the query interface to it.
# NOTE(review): with a persist directory, re-running this cell presumably indexes the same
# documents again into the existing store — confirm, and clear ./chroma_store if so.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents,
    embeddings,
    persist_directory="./chroma_store",
)
retriever = vectorstore.as_retriever()

# Report how many loaded documents were passed to the store.
print(f"Chroma database created with {len(documents)} full documents!")

Chroma database created with 16 full documents!


In [5]:
# Function to assess query complexity (this can be more sophisticated)
def assess_query_complexity(query, threshold=7):
    """Return True when the query is considered complex.

    Complexity is judged purely by length: the query is split on whitespace and
    counted as complex when it contains strictly more than `threshold` words.

    Args:
        query: The query string to classify.
        threshold: Largest word count still treated as simple. Defaults to 7,
            the value that was previously hard-coded.

    Returns:
        bool: True if the query has more than `threshold` words, else False.
    """
    word_count = len(query.split())
    return word_count > threshold

In [9]:
# Query the vectorstore (Example Query)
# `query` and `retrieved_docs` are reused by the later cells (chunking decision and prompt).
query = "What is RAG?"
# Ask the retriever for the documents it considers relevant to the query.
retrieved_docs = retriever.invoke(query)
print(f"Retrieved {len(retrieved_docs)} documents relevant to the query.")

Retrieved 4 documents relevant to the query.


In [10]:
# Build `chunks` from the retrieved documents; the strategy depends on query complexity.
if not assess_query_complexity(query):
    print("Using pre-chunking for query:", query)
    # Simple query: use the retrieved documents as they are, without further splitting.
    chunks = retrieved_docs
else:
    print("Using late chunking for query:", query)
    # Complex query: split every retrieved document on the fly
    # (chunk_size=500, chunk_overlap=50) and collect the pieces in retrieval order.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for doc in retrieved_docs:
        chunks += text_splitter.split_documents([doc])

Using pre-chunking for query: What is RAG?


In [11]:
# Create the LLM client: the "llama3" model served by Ollama at the local address below.
llm = OllamaLLM(
    model="llama3",
    base_url="http://127.0.0.1:11434",
)

In [12]:
# Retrieval QA chain wiring the LLM to the retriever; source documents are returned with answers.
# NOTE(review): `qa_chain` is not referenced by any of the cells visible in this notebook section.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

## Prompt Engineering

In [13]:
# Combine the content of the documents into a single context.
# - The context is built from `chunks` (the result of the chunking-decision cell) instead of
#   `retrieved_docs`, so the late-chunking branch actually influences what the model sees.
#   For simple queries `chunks` is `retrieved_docs`, so the same passages are used.
# - Each passage is prefixed with its source file and page taken from the document metadata,
#   because the prompt asks the model to cite sources and it can only cite what it is shown.
context = "\n\n".join(
    f"[Source: {doc.metadata.get('source', 'unknown')}, "
    f"page {doc.metadata.get('page', 'n/a')}]\n{doc.page_content}"
    for doc in chunks
)

# Advanced prompt with retrieval and Llama 3's knowledge
prompt = f"""
You are an expert AI assistant. Use the provided context and your own knowledge to answer the question in a clear, concise, and professional manner. 

### Instructions:
1. First, prioritize using the context to provide the answer.
2. If additional information is needed, supplement your response with your own knowledge.
3. Always provide the sources for any information retrieved from the context.
4. If the context does not answer the question and you rely solely on your own knowledge, clearly state that no external sources were used.

### Context:
{context}

### Question:
{query}

### Answer:
"""

# Generate the response using Ollama
response = llm.invoke(prompt)

# Print response and source information
print("Answer:", response)
print("Source Documents:")
for doc in retrieved_docs:
    print(f"- Page Content: {doc.page_content[:200]}...")  # Truncated for readability
    print(f"  Metadata: {doc.metadata}")


Answer: Based on the provided context, Retrieval-Augmented Generation (RAG) is a framework that combines pre-trained language models with external knowledge bases to enhance the accuracy and relevance of generated text. This approach retrieves relevant chunks of information from external sources, such as document collections, to assist in generation.

Sources:
1. Context: "Understanding Retrieval-Augmented Generation (RAG) with Chunking"
Source Documents:
- Page Content: Step 4: Retrieving Chunks for RAG
When a query is made, retrieve the most relevant chunks from the database using vector search.
These chunks are then fed into the language model to generate responses...
  Metadata: {'page': 1, 'source': 'docs/RAG_Chunking_Tutorial.pdf'}
- Page Content: Understanding Retrieval-Augmented Generation (RAG) with Chunking
1. Introduction to Retrieval-Augmented Generation (RAG)
Retrieval-Augmented Generation (RAG) is a framework that combines pre-trained l...
  Metadata: {'page': 0, 'source'