### Welcome!


# What is RAG?

# What are Chunks?
Chunks are the building blocks of RAG. They are the smallest unit of information that can be stored in a vector database. A chunk could be a single word, a sentence, a paragraph, or even an entire document. Chunks are stored in a way that allows them to be easily retrieved and manipulated.

In [None]:
import tqdm as notebook_tqdm
from llama_index.core import SimpleDirectoryReader

def load_documents(docs_path):
    """Load the files found under ``docs_path`` and print a short summary.

    Args:
        docs_path: Directory path handed to ``SimpleDirectoryReader``.

    Returns:
        Whatever ``SimpleDirectoryReader(docs_path).load_data()`` returns
        (a sequence of document objects).
    """
    documents = SimpleDirectoryReader(docs_path).load_data()
    print(f"Loaded {len(documents)} documents")
    # Only preview the first document when there is one; indexing an empty
    # result would raise IndexError.
    if documents:
        print(f"First document: {documents[0]}")
    return documents

In [None]:
# Folder read by the chunk-exploration cells of this notebook.
docs_path = "./data/docs"
# Load once; the statistics and splitting cells below reuse `documents`.
documents = load_documents(docs_path)

In [None]:
# Sanity-check the loader output: container type, size and element type.
container_type = type(documents)
element_type = type(documents[0])
print(
    f"documents is a {container_type}, of length {len(documents)}, "
    f"where each element is a {element_type} object"
)

# Chunking Strategy

### Regex pattern used: 
#### Pattern: `r'\r?\n\s*\r?\n+'`
#### Explanation:
Let's break down this pattern to understand each component:

- \r?: This part of the pattern matches zero or one occurrence of a carriage return (\r). The question mark ? makes the carriage return optional, which allows the pattern to work with both Windows-style line endings (\r\n, where the line ends with both a carriage return and a newline character) and Unix-style line endings (\n, where the line ends with just a newline character).

- \n: This matches a newline character. Combined with the preceding \r?, this part matches a line break that could be represented either as \r\n (Windows) or \n (Unix/Linux).

- \s*: This matches any whitespace characters (including spaces, tabs, and line breaks) zero or more times. The asterisk * denotes "zero or more occurrences," allowing for any number of whitespace characters, including none, between line breaks. This is useful for catching cases where paragraphs might be separated by one or more blank lines, possibly containing spaces or tabs.

- \r?\n: This is similar to the first part, matching another optional carriage return followed by a newline character, indicating the end of the blank line(s) and the start of a new paragraph.

- +: Placed at the end of the pattern, this quantifier applies only to the token immediately before it (the final \n) and matches one or more occurrences of that newline. Together with \s*, which also matches line breaks, it lets the regex consume a whole run of consecutive blank lines and treat it as a single split point. This is useful for separating paragraphs that might be divided by more than one blank line.

In [None]:
import re

# Paragraph boundary: a line break, optional whitespace, then one or more line breaks
split_pattern = r"\r?\n\s*\r?\n+"
# Compile once, outside the loop, so every document reuses the same pattern object
paragraph_splitter = re.compile(split_pattern)

# Word counts of all chunks (paragraphs) and of the entire texts, across all documents
chunk_word_counts = []
entire_text_word_counts = []

# Running total of non-empty paragraphs seen so far
total_paragraph_count = 0

# Iterate through each Document object in the list of documents
for doc in documents:
    # Assuming doc.text contains the full text of the PDF document
    # Strip each piece once and drop the pieces that are empty after stripping
    stripped_pieces = (piece.strip() for piece in paragraph_splitter.split(doc.text))
    paragraphs = [piece for piece in stripped_pieces if piece]

    # Update the total paragraph count
    total_paragraph_count += len(paragraphs)

    # Number of whitespace-separated words in each paragraph
    chunk_word_counts.extend(len(paragraph.split()) for paragraph in paragraphs)

    # Number of whitespace-separated words in the entire text
    entire_word_count = len(doc.text.split())
    entire_text_word_counts.append(entire_word_count)

# Summary statistics for paragraphs. With no paragraphs at all the average falls
# back to 0.0 and the maximum to 0 instead of raising ZeroDivisionError / ValueError.
if chunk_word_counts:
    average_paragraph_word_count = sum(chunk_word_counts) / len(chunk_word_counts)
else:
    average_paragraph_word_count = 0.0
max_paragraph_word_count = max(chunk_word_counts, default=0)

# Per-document averages; both fall back to 0.0 when no documents were loaded.
if entire_text_word_counts:
    average_entire_text_word_count = sum(entire_text_word_counts) / len(entire_text_word_counts)
    average_paragraphs_per_document = total_paragraph_count / len(documents)
else:
    average_entire_text_word_count = 0.0
    average_paragraphs_per_document = 0.0

print(f"Average word count for a document: {average_entire_text_word_count}")
print(f"Average word count per paragraph: {average_paragraph_word_count}")
print(f"Longest paragraph: {max_paragraph_word_count}")
print(f"Total number of paragraphs: {total_paragraph_count}")
print(f"Average number of paragraphs per document: {average_paragraphs_per_document}")


In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Split the loaded documents into nodes with a SentenceSplitter.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in tokens —
# confirm against the SentenceSplitter documentation.
text_splitter = SentenceSplitter(
    # Disabled option: use the blank-line regex from the paragraph analysis as the separator.
    # paragraph_separator=r"\r?\n\s*\r?\n+", 
    chunk_size=512, 
    chunk_overlap=20
)
# `nodes` is reused by the node-statistics and metadata cells below.
nodes = text_splitter.get_nodes_from_documents(documents)
print(f"Number of nodes: {len(nodes)}")

In [None]:
# Measure every node in characters and in whitespace-separated words, then report the averages
node_char_counts = []
node_word_counts = []
for node in nodes:
    node_char_counts.append(len(node.text))
    node_word_counts.append(len(node.text.split()))

node_total = len(node_char_counts)
average_node_char_count = sum(node_char_counts) / node_total
average_node_word_count = sum(node_word_counts) / node_total

print(f"Average character count for a node: {average_node_char_count}")
print(f"Average word count for a node: {average_node_word_count}")

In [None]:
# Dump the first node's metadata, one "key: value" line per entry
first_node_metadata = nodes[0].metadata
for key, value in first_node_metadata.items():
    print(f"{key}: {value}")

# Ingestion Pipeline

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.core.settings import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Configuration for the LLM and the embedding model
ollama_model = "mistral"  # model name passed to Ollama
ollama_base_url = "http://127.0.0.1:11434"  # Ollama server address on the local machine
embedding_model = "sentence-transformers/all-mpnet-base-v2"  # model name passed to HuggingFaceEmbeddings

# Register both models on the global Settings object.
# NOTE(review): presumably later index/query calls in the same session pick these
# up as defaults — confirm against the llama_index Settings documentation.
Settings.llm = Ollama(model=ollama_model, base_url=ollama_base_url, temperature=0, request_timeout=300.0)
Settings.embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name=embedding_model))


def load_documents(docs_path):
    """Read the files under ``docs_path`` and print a short summary.

    Args:
        docs_path: Directory path handed to ``SimpleDirectoryReader``.

    Returns:
        The result of ``SimpleDirectoryReader(docs_path).load_data()``.
    """
    reader = SimpleDirectoryReader(docs_path)
    documents = reader.load_data()
    print(f"Loaded {len(documents)} documents")
    # Nothing to preview when the reader produced no documents.
    if not documents:
        return documents
    print(f"First document: {documents[0]}")
    return documents

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex


def build_index(client, documents, index_name, chunk_size=512, chunk_overlap=20):
    """Build a vector index over ``documents`` backed by a Chroma collection.

    Args:
        client: Chroma client; must provide ``get_or_create_collection``.
        documents: Documents passed to ``VectorStoreIndex.from_documents``.
        index_name: Name of the Chroma collection to fetch or create.
        chunk_size: ``chunk_size`` given to the ``SentenceSplitter``
            transformation. Defaults to 512, the previously hard-coded value.
        chunk_overlap: ``chunk_overlap`` given to the ``SentenceSplitter``
            transformation. Defaults to 20, the previously hard-coded value.

    Returns:
        The index returned by ``VectorStoreIndex.from_documents``.
    """
    # The collection is requested with the "hnsw:space": "cosine" metadata setting.
    chroma_collection = client.get_or_create_collection(index_name, metadata={"hnsw:space": "cosine"})
    print(f"Created/existing collection {chroma_collection}")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=documents,
        transformations=[SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)],
        storage_context=storage_context,
        show_progress=True,
    )
    return index

In [None]:
import chromadb

# save documents to vector store
def run_ingestion_pipeline(vectoredb_path, docs_path, index_name="test"):
    """Load documents from disk and index them into a persistent Chroma store.

    Args:
        vectoredb_path: Path given to ``chromadb.PersistentClient``. The
            parameter keeps its original spelling so keyword callers still work.
        docs_path: Directory passed to ``load_documents``.
        index_name: Name of the Chroma collection to write to. Defaults to
            "test", the previously hard-coded value.

    Returns:
        The index returned by ``build_index``.
    """
    print("Connecting to ChromaDB...")
    chromadb_client = chromadb.PersistentClient(path=vectoredb_path)

    print("Loading documents...")
    documents = load_documents(docs_path)

    print("Building index...")
    index = build_index(
        client=chromadb_client,
        documents=documents,
        index_name=index_name,
    )

    return index

In [None]:
# Location handed to the persistent Chroma client.
vectordb_path = "./data/vectordb"
# NOTE(review): this ingests "./data", whereas the exploration cells read "./data/docs",
# and the vector store path above sits inside "./data" — confirm which folder should be ingested.
docs_path = "./data"

# Load the documents and build the index in the persistent store.
run_ingestion_pipeline(vectordb_path, docs_path)

# Retrieval Pipeline

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

def retrieve_from_index(chromadb_client, index_name):
    """Open a vector index on top of an existing Chroma collection.

    Args:
        chromadb_client: Chroma client; must provide ``get_or_create_collection``.
        index_name: Name of the collection to fetch (it is created if missing).

    Returns:
        The index returned by ``VectorStoreIndex.from_vector_store``.
    """
    collection = chromadb_client.get_or_create_collection(
        index_name,
        metadata={"hnsw:space": "cosine"},
    )
    store = ChromaVectorStore(chroma_collection=collection)
    return VectorStoreIndex.from_vector_store(store, show_progress=True)

In [None]:
import chromadb
from llama_index.llms.ollama  import Ollama
from llama_index.core.query_engine import CitationQueryEngine
# from llama_index.core.settings import Settings


# Settings for the retrieval cell.
vectordb_path = "./data/vectordb"
docs_path = "./data"  # not referenced by the statements in this cell
index_name = "test"  # same collection name the ingestion pipeline writes to
chunk_size = 500  # not referenced in this cell (ingestion splits with chunk_size=512)
ollama_model = "mistral"
ollama_base_url = "http://127.0.0.1:11434"
embedding_model = "sentence-transformers/all-mpnet-base-v2"  # not referenced in this cell


print("Connecting to Chromadb")
chromadb_client = chromadb.PersistentClient(path=vectordb_path)

print("Loading Ollama...")
llm = Ollama(model=ollama_model, base_url=ollama_base_url, temperature=0, request_timeout=300.0)

print("Retrieving index...")
# NOTE(review): no embedding model is passed here; presumably the one registered on
# Settings in the ingestion section is used — confirm it is set in this session.
index = retrieve_from_index(chromadb_client, index_name)

print("Constructing query engine...")
# Build a CitationQueryEngine on the retrieved index using the Ollama LLM created above.
query_engine = CitationQueryEngine.from_args(
    index=index,
    llm=llm,
    similarity_top_k=3,
    verbose=True
)

In [None]:
# Ask a sample question through the citation query engine and print the answer.
response = query_engine.query("What is Buddhism?")
print(response)

In [None]:
from IPython.display import Markdown, display

def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict``.

    For each entry the key is rendered as a Markdown header, the prompt's
    template (from ``get_template()``) is printed as plain text, and a
    Markdown spacer follows.

    Args:
        prompts_dict: Mapping from prompt key to a prompt object exposing
            ``get_template()``.
    """
    spacer_md = "<br><br>"
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        print(prompt.get_template())
        display(Markdown(spacer_md))


# Fetch the prompts used by the query engine and display each one.
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)