# Text RAG

Steps
- Load the document
- Perform Chunking
- Covert to Text Embedding
- Store the Embedding to VectorDB
- Save VectorDB to disk

Query
- Convert Query to Text Embedding using the same model
- Check in Cache if the query exists then return from there else perform the VectorDB Search for X results
- Take VectorDB X results and give it to reranker for sorting based on query relavance. 
- Select the top Y records from reranker
- Give this Y records to the LLM for humanised response
- Send the response to the user

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()

groq_key = os.environ.get("GROQ_API_KEY")
print(f"Groq Key: {groq_key[:5]}...{groq_key[-5:]}")

gsk_r...pCzko


## Step 1: Import Libraries and load the document

In [3]:
import pdfplumber
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

#FAISS, PINECONE
import chromadb
from chromadb import PersistentClient
from chromadb.config import Settings

In [4]:
pdf_reader = pdfplumber.open("../Data/Uber-2024-Annual-Report.pdf")
len(pdf_reader.pages)

142

## Step 2: Page Chunking

In [5]:
text_content = []

def find_middle_newline(s):
    # Step 1: Find all indexes of '\n'
    newline_indices = [i for i, char in enumerate(s) if char == '\n']
    
    if not newline_indices:
        return None  # No newline found
    
    # Step 2: Find the middle index
    middle_index = len(newline_indices) // 2
    
    # Step 3: Return the position of the middle '\n'
    return newline_indices[middle_index]


document_name = "".join(pdf_reader.stream.name.split("/")[-1].split(".")[:-1])


for i, page in enumerate(pdf_reader.pages):
    text_page = page.extract_text()

    if len(text_page.split(" ")) < 10:
        print(f"Page number: {i+1}, count: {len(text_page.split(" "))}")
        continue

    if len(text_page) > 5000:
        mid_index = find_middle_newline(text_page)
        text_content.append({
            "type" : "text",
            "document": document_name,
            "page": f"{i+1}",
            "split":f"0",
            "content": text_page[:mid_index]
        })

        text_content.append({
            "type" : "text",
            "document": document_name,
            "page": f"{i+1}",
            "split":f"1",
            "content": text_page[mid_index+1:]
        })
    else:
        text_content.append({
                    "type" : "text",
                    "document": document_name,
                    "page": f"{i+1}",
                    "split":f"0",
                    "content": text_page
                })

text_content[0]

Page number: 1, count: 5
Page number: 139, count: 2
Page number: 140, count: 5


{'type': 'text',
 'document': 'Uber-2024-Annual-Report',
 'page': '2',
 'split': '0',
 'content': 'Uber’s Mission\nWe reimagine the way the world moves for the better\nWe are Uber. The go-getters. The kind of people who are relentless about our\nmission to help people go anywhere and get anything and earn their way.\nMovement is what we power. It’s our lifeblood. It runs through our veins. It’s\nwhat gets us out of bed each morning. It pushes us to constantly reimagine\nhow we can move better. For you. For all the places you want to go. For all the\nthings you want to get. For all the ways you want to earn. Across the entire\nworld. In real time. At the incredible speed of now.'}

In [6]:
text_doc = pd.DataFrame(text_content)
text_doc["MetaData"] = text_doc.apply(lambda x: {"Document": x["document"], "Page": x["page"], "Split": x["split"], "Type": x["type"]}, axis=1)
text_doc = text_doc.drop(["type", "document", "page", "split"], axis=1)
text_doc.head()

Unnamed: 0,content,MetaData
0,Uber’s Mission\nWe reimagine the way the world...,"{'Document': 'Uber-2024-Annual-Report', 'Page'..."
1,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,"{'Document': 'Uber-2024-Annual-Report', 'Page'..."
2,Large accelerated filer ☒ Accelerated filer ☐\...,"{'Document': 'Uber-2024-Annual-Report', 'Page'..."
3,"UBER TECHNOLOGIES, INC.\nTABLE OF CONTENTS\nPa...","{'Document': 'Uber-2024-Annual-Report', 'Page'..."
4,SPECIAL NOTE REGARDING FORWARD-LOOKING STATEME...,"{'Document': 'Uber-2024-Annual-Report', 'Page'..."


### Step 3: Text Embedding

In [7]:
model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)
only_text = text_doc["content"].tolist()

embeddings = embedding_model.encode(only_text)
ids = text_doc["MetaData"].apply(lambda x: f"{x['Document']}_p{x['Page']}_s{x['Split']}") 

## Step 4: Store the embedding to DB

In [8]:
Chroma_DB_Path = "../Store/2_VectorDB"
COLLECTION_NAME = "uber_revenue"

# chroma_client = chromadb.Client(Settings(
#     persist_directory=Chroma_DB_Path,
#     anonymized_telemetry=False
# ))

chroma_client = PersistentClient(path=Chroma_DB_Path)

# collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)

try:
    collection = chroma_client.get_collection(name=COLLECTION_NAME)
    print(f"Collection '{COLLECTION_NAME}' exists.")
    # You can now work with the 'collection' object
except Exception as e:
    print(f"Collection '{COLLECTION_NAME}' does not exist. {str(e)}")
    # You might choose to create the collection here
    collection = chroma_client.create_collection(name=COLLECTION_NAME)
    # print(f"Collection '{collection_name}' created.")
       
    collection.add(
        documents=text_doc['content'].tolist(),
        metadatas=text_doc['MetaData'].tolist(),
        ids=ids.tolist()
    )
    print("Successfully stored")

Collection 'uber_revenue' exists.


## Step 5: Search Logic for DB/ Cache

In [10]:
caching = []
cache_emd = []
def get_chroma_results(query):
    query_emd = embedding_model.encode([query])
    
    if len(cache_emd) > 0:
        cache_emd_array = np.vstack(cache_emd) 
        similarities = cosine_similarity(query_emd, cache_emd_array)
        best_match_indexes = [np.argmax(item) for item in similarities]

        if len(best_match_indexes) > 0 and similarities[0][best_match_indexes[0]] > 0.8:
            print(f"Returning from query: {caching[best_match_indexes[0]]["query"]} cache with score: {similarities[0][best_match_indexes[0]]:.4f}")
            return (similarities[0][best_match_indexes[0]], caching[best_match_indexes[0]]["query"], caching[best_match_indexes[0]]["results"])
    

    results = collection.query(
        query_texts=[query],
        n_results=15
    )

    caching.append({"query": query, "results": results}) 
    cache_emd.append(query_emd)
    
    return (0, caching[-1]["query"], results)

In [11]:
rerank_cache = {}

from sentence_transformers import CrossEncoder

def rerank_results(query, sim_score, ret_query, result):
    if sim_score > 0.9 and rerank_cache.get(ret_query):
        print(f"Reranker returning from cache for query {ret_query}")
        return rerank_cache.get(ret_query)

    model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'

    reranker_model = CrossEncoder(model_name)

    pairs = [(query, doc) for doc in result['documents'][0]]

    scores = reranker_model.predict(pairs)

    scored_results = list(zip(result['documents'][0], scores))

    new_docs = [(doc, result['metadatas'][0][i], score) for i, (doc, score) in enumerate(scored_results)]

    # Sort by score (descending)
    sorted_results = sorted(new_docs, key=lambda x: x[2], reverse=True)

    # Select top 5
    top_5 = sorted_results[:5]

    rerank_cache[query] = top_5

    return top_5

In [12]:
query = "what is uber\'s revenue"
sim_score, ret_query, result = get_chroma_results(query=query)

top_5 = rerank_results(query=query, sim_score=sim_score, ret_query=ret_query, result=result)
for i, (doc, meta, score) in enumerate(top_5):
    print("--"*50)
    print(f"{i+1}. Score: {score:.4f} - \nDocument: {doc[:100]}...\nPage: {meta["Page"]}")

----------------------------------------------------------------------------------------------------
1. Score: 6.6189 - 
Document: UBER TECHNOLOGIES, INC.
CONSOLIDATED STATEMENTS OF OPERATIONS
(In millions, except share amounts whi...
Page: 79
----------------------------------------------------------------------------------------------------
2. Score: 4.4928 - 
Document: 15, 2026, and interim periods within fiscal years beginning after December 15, 2027. Early adoption ...
Page: 98
----------------------------------------------------------------------------------------------------
3. Score: 3.6021 - 
Document: financial statements included in Part II, Item 8, “Financial Statements and Supplementary Data,” of ...
Page: 55
----------------------------------------------------------------------------------------------------
4. Score: 3.5405 - 
Document: UBER TECHNOLOGIES, INC.
CONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME (LOSS)
(In millions)
Year En...
Page: 80
------------------------

In [14]:
search_results = ""

for i, (doc, meta, score) in enumerate(top_5):
    my_var = f"""
                __________
                Page Number: {meta["Page"]}
                Text content: {doc}
                ___________
            """
    search_results = f"{search_results}\n{my_var}"

search_results

'\n\n                __________\n                Page Number: 79\n                Text content: UBER TECHNOLOGIES, INC.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except share amounts which are reflected in thousands, and per share amounts)\nYear Ended December 31,\n2022 2023 2024\nRevenue $ 31,877 $ 37,281 $ 43,978\nCosts and expenses\nCost of revenue, exclusive of depreciation and amortization shown separately below 19,659 22,457 26,651\nOperations and support 2,413 2,689 2,732\nSales and marketing 4,756 4,356 4,337\nResearch and development 2,798 3,164 3,109\nGeneral and administrative 3,136 2,682 3,639\nDepreciation and amortization 947 823 711\nTotal costs and expenses 33,709 36,171 41,179\nIncome (loss) from operations (1,832) 1,110 2,799\nInterest expense (565) (633) (523)\nOther income (expense), net (7,029) 1,844 1,849\nIncome (loss) before income taxes and income (loss) from equity method investments (9,426) 2,321 4,125\nProvision for (benefit from) income taxes (1

In [None]:
llm_prompt = f"""
You are a helpful assistant that summarizes and humanizes extracted information from a PDF document.

You are given a list of top search results extracted from a PDF via vector similarity search. Each result includes:
- Page number
- Text content

The results may come from different sections of the document but are all relevant to a user's query. Your goal is to:
- Merge the information into a single, coherent, human-readable explanation.
- Remove duplicate or redundant points.
- Clarify technical terms where necessary.
- Maintain important context and detail.
- Do not reference page numbers unless specifically requested.
- Do not list each result or say “Based on the results…”

Here is the extracted content:

[Insert vector DB results here in the format:  
{search_results}  
]

Now, write a clear, concise, and natural summary of the combined information above. The tone should be informative and accessible to someone unfamiliar with the source document. Also, give me the reference page numbers

"""

In [16]:
llm_prompt

'\nYou are a helpful assistant that summarizes and humanizes extracted information from a PDF document.\n\nYou are given a list of top search results extracted from a PDF via vector similarity search. Each result includes:\n- Page number\n- Text content\n\nThe results may come from different sections of the document but are all relevant to a user\'s query. Your goal is to:\n- Merge the information into a single, coherent, human-readable explanation.\n- Remove duplicate or redundant points.\n- Clarify technical terms where necessary.\n- Maintain important context and detail.\n- Do not reference page numbers unless specifically requested.\n- Do not list each result or say “Based on the results…”\n\nHere is the extracted content:\n\n[Insert vector DB results here in the format:  \n\n\n                __________\n                Page Number: 79\n                Text content: UBER TECHNOLOGIES, INC.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except share amounts which are reflect

## Create the LLM

In [17]:
from groq import Groq

groq_client = Groq(api_key=groq_key)

chat_result = groq_client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        {
            "role": "user",
            "content":llm_prompt
        }
    ]
)

In [20]:
from rich.console import Console
from rich.markdown import Markdown
from rich.json import JSON

def print_llm_response(response: str):
    console = Console()
    try:
        console.print(JSON(response))
    except Exception:
        console.print(Markdown(response))

In [22]:
print(query)
print_llm_response(chat_result.choices[0].message.content)

what is uber's revenue
