Prerequisite: pull the Ollama models used in this notebook, one per command: `ollama pull nomic-embed-text` and `ollama pull gemma3:4b`.

In [1]:
import sys
# Extend the import search path; presumably this is where the `adapters`
# package imported in the next cell lives — confirm.
# NOTE(review): absolute, machine-specific path; this cell will not work
# unchanged on another machine.
sys.path.append("/Users/arshath/play/naveen/library-manager-lite")

In [2]:
from nest_asyncio import apply
from pypdf import PdfReader
import numpy as np
from adapters.llm_provider import get_llm_provider, LitellmProvider

# nest_asyncio.apply() patches asyncio so an event loop can be re-entered while
# another is already running (as is the case inside a Jupyter kernel).
# Presumably needed by the LLM provider calls below — TODO confirm.
apply()


In [3]:
# PDF document that the rest of the notebook chunks, embeds and queries.
# NOTE(review): absolute local path; it will not exist on other machines.
pdf_path = "/Users/arshath/play/aflow.pdf"
reader = PdfReader(pdf_path)

In [4]:
# Build the full document text from every page in order.
# Pages are joined with a newline so the last word of one page does not fuse
# with the first word of the next; `or ""` is a defensive guard for a page
# whose extraction yields nothing. A single join also avoids repeated string
# re-allocation from `+=` in a loop.
text = "\n".join(page.extract_text() or "" for page in reader.pages)

In [5]:
# Resolve the three providers by name through the project's `get_llm_provider`:
# one is used for embeddings and two for chat, whose answers are compared at
# the end of the notebook.
# NOTE(review): which concrete models these names map to is decided inside
# `get_llm_provider` and is not visible here — confirm against its config.
embedding_model = get_llm_provider("ollama-embed")
chat_model_ollama = get_llm_provider("ollama-chat")
chat_model_anthropic = get_llm_provider("recommender")

In [6]:
def chunk_text(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one.

    Args:
        text: The text to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order. The final chunk may be shorter than
        ``chunk_size``. No trailing chunk is emitted once a chunk has reached
        the end of the text, since it would only repeat already-covered text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # overlap == chunk_size would give a zero step, overlap > chunk_size a
    # negative one (no chunks at all), and a negative overlap would skip text.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    chunks = []
    if not text:
        return chunks

    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # This chunk already reaches the end of the text; any further chunk
        # would be fully contained in it.
        if start + chunk_size >= len(text):
            break
    return chunks

def get_embeddings(texts: list[str], model: LitellmProvider) -> np.ndarray:
    """Embed every string in ``texts`` with ``model`` and return a numpy array.

    The whole list is handed to ``model.embed_text`` in one call and the result
    is wrapped with ``np.array``.

    Raises:
        ValueError: If ``model`` is falsy.
        RuntimeError: If ``model.embed_text`` returns ``None``.
    """
    if model:
        vectors = model.embed_text(texts)
        if vectors is not None:
            return np.array(vectors)
        raise RuntimeError("Failed to get embeddings.")
    raise ValueError("Embedding model is not initialized.")

def find_top_k_chunks(
    query: str,
    text_chunks: list[str],
    chunk_embeddings: np.ndarray,
    embedding_model: LitellmProvider,
    top_k: int = 3
) -> list[str]:
    """Return the ``top_k`` chunks most similar to ``query`` by cosine similarity.

    Args:
        query: The text to embed and compare against the chunks.
        text_chunks: Chunk texts; ``text_chunks[i]`` corresponds to row ``i``
            of ``chunk_embeddings``.
        chunk_embeddings: One embedding per row (a single 1-D embedding is
            treated as one row).
        embedding_model: Object whose ``embed_text(query)`` returns the query
            embedding, either as a flat vector or wrapped as ``[[...]]``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, most similar first. Chunks whose embedding has
        zero norm have no defined cosine similarity and are never returned.
        An empty list is returned when ``top_k <= 0``, when there are no
        chunks/embeddings, or when the query embedding has zero norm.

    Raises:
        ValueError: If ``embedding_model`` is falsy.
        RuntimeError: If the model returns ``None`` for the query.
    """
    if not embedding_model:
        raise ValueError("Embedding model is not initialized.")

    chunk_matrix = np.atleast_2d(np.asarray(chunk_embeddings, dtype=float))

    # A non-positive top_k must select nothing: the ranking slice below would
    # turn into [-0:] for top_k == 0 and select *every* chunk.
    if top_k <= 0 or not text_chunks or chunk_matrix.size == 0:
        return []

    query_embedding = embedding_model.embed_text(query)
    if query_embedding is None:
        raise RuntimeError("Failed to get query embedding.")

    # Flatten so that a result wrapped as a one-row batch still yields the
    # 1-D vector the dot product needs.
    query_vector = np.asarray(query_embedding, dtype=float).reshape(-1)

    query_norm = np.linalg.norm(query_vector)
    if query_norm == 0:
        # Cosine similarity is undefined for a zero query vector.
        return []

    chunk_norms = np.linalg.norm(chunk_matrix, axis=1)
    # Only rows with a non-zero norm can be scored; a single degenerate chunk
    # should not discard the results for all the others.
    usable = chunk_norms > 0
    similarities = np.full(chunk_norms.shape, -np.inf)
    similarities[usable] = np.dot(chunk_matrix[usable], query_vector) / (
        chunk_norms[usable] * query_norm
    )

    # Get top_k indices in descending order of similarity.
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]

    return [text_chunks[i] for i in top_k_indices if usable[i]]

def simple_rag_retrieval(
    query: str,
    text_chunks: list[str],
    chunk_embeddings: np.ndarray,
    embedding_model: LitellmProvider,
    top_k: int = 3
) -> list[str]:
    """
    Retrieval step of the simple RAG pipeline.

    Delegates to ``find_top_k_chunks`` with the same arguments. When
    ``embedding_model`` is falsy, a message is printed and an empty list is
    returned instead.
    """
    if embedding_model:
        return find_top_k_chunks(
            query=query,
            text_chunks=text_chunks,
            chunk_embeddings=chunk_embeddings,
            embedding_model=embedding_model,
            top_k=top_k,
        )

    print("Embedding model not available.")
    return []

def question_answer(query: str, retrieved_chunks: list[str], chat_model: LitellmProvider) -> str:
    """
    Ask ``chat_model`` to answer ``query`` using the retrieved chunks as context.

    The chunks are joined with blank lines and placed into the ``prompt``
    argument; the query is sent separately as ``user_content``. Returns the
    result of ``chat_model.completion``, a fixed notice when ``chat_model`` is
    falsy, or an error description if the completion call raises.
    """
    if chat_model:
        joined_context = "\n\n".join(retrieved_chunks)

        # Every prompt line carries four leading spaces; they are part of the
        # text that is sent to the model.
        instructions = (
            "\n"
            "    You are a helpful assistant. Please answer the user's query based on the provided context.\n"
            "    If the answer is not available in the context, please state that you cannot answer the question from the given context.\n"
            "\n"
            "    Context:\n"
            f"    {joined_context}\n"
            "    "
        )
        question = f"Query: {query}"

        try:
            return chat_model.completion(
                prompt=instructions,
                user_content=question,
                output_format='text',
            )
        except Exception as exc:
            return f"Error during chat completion: {exc}"

    return "Chat model is not available."

In [7]:
# Split the extracted PDF text into overlapping chunks (default chunk size and
# overlap), then embed all chunks with a single `embed_text` call.
chunks = chunk_text(text)
embeddings = get_embeddings(chunks, embedding_model)

In [8]:
# Question to ask about the document and how many chunks to retrieve for it.
query = "What is the main idea of the document?"
top_k = 3

# Embed the query with the same model used for the chunks and keep the top_k
# chunks ranked by cosine similarity.
retrieved_chunks = simple_rag_retrieval(
    query=query, 
    text_chunks=chunks,  
    chunk_embeddings=embeddings,
    embedding_model=embedding_model,
    top_k=top_k
)

In [11]:
# Answer the same query from the same retrieved chunks with both chat
# providers so the two answers can be compared side by side.
final_answer_anthropic = question_answer(query, retrieved_chunks, chat_model_anthropic)
final_answer_ollama = question_answer(query, retrieved_chunks, chat_model_ollama)

In [12]:
from pprint import pprint

# Print each answer under its label; pprint wraps the long answer strings
# across several lines instead of emitting one very long line.
for _label, _answer in (
    ("ollama", final_answer_ollama),
    ("anthropic", final_answer_anthropic),
):
    pprint(_label)
    pprint(_answer)

'ollama'
('The main idea of the document is to describe an optimal workflow for solving '
 'math problems, particularly in the context of HotpotQA and MATH tasks. This '
 'workflow utilizes an ensemble structure with multiple LLM invocations (using '
 'different approaches like algebraic, visual, or estimation) and incorporates '
 'techniques like self-consistency and automated workflow optimization. It '
 'emphasizes the importance of formatting and utilizes various node types '
 '(graph, neural network, code) to achieve a robust and adaptable solution '
 'process.')
'anthropic'
('Based on the provided context, the main idea seems to be describing an '
 'approach for automatically generating and optimizing workflows composed of '
 'large language models (LLMs) for solving different tasks. Some key points:\n'
 '\n'
 '- It formalizes the notion of an "agentic workflow" as a sequence of nodes, '
 'where each node represents invoking a specific LLM with certain parameters '
 'like the mod