<a href="https://colab.research.google.com/github/BhanuPrakashSamoju/gen_ai_architect_program/blob/main/assignments/assignment_03/assignment_03_agentic_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -r /content/requirements.txt

Collecting langgraph==0.6.7 (from -r /content/requirements.txt (line 3))
  Downloading langgraph-0.6.7-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-openai==0.3.33 (from -r /content/requirements.txt (line 4))
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-community==0.3.29 (from -r /content/requirements.txt (line 5))
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting pydantic==2.11.7 (from -r /content/requirements.txt (line 6))
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.0/68.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_google_genai (from -r /content/requirements.txt (line 7))
  Downloading langchain_google_genai-2.1.12-py3-none-any.whl.metadata (7.1 kB)
Collecting pinecone (from -r /content/requirements.txt (line 12))
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 k

In [None]:
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import AzureOpenAIEmbeddings

# Load environment variables from the .env file
load_dotenv("/content/.env")

# --- Configuration ---
DATA_FILE = '/content/self_critique_loop_dataset.json'
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
# The embedding clients in this notebook read AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME,
# so prefer that variable here; the previous name is kept as a fallback so
# existing .env files keep working.
AZURE_DEPLOYMENT = (
    os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
    or os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
)
embedding_model_name = "text-embedding-3-small"

In [19]:


# --- Main Indexing Logic ---
def setup_and_index():
    """
    Load the knowledge base, embed every entry, and upsert it into Pinecone.

    Steps:
        1. Create the Pinecone client.
        2. Create the Azure OpenAI embeddings client from environment variables.
        3. Create the serverless index if it does not exist yet.
        4. Load the JSON knowledge base from ``DATA_FILE``.
        5. Embed each entry's ``question`` and upsert the vectors in batches.

    Returns:
        None. Progress and errors are reported with ``print``; the function
        returns early when the Pinecone settings are missing or the data file
        cannot be found or parsed.

    Raises:
        KeyError: If one of the ``AZURE_OPENAI_EMBEDDING_*`` environment
            variables, or an expected field of a knowledge-base entry, is missing.
    """
    print("Starting the indexing process...")

    # Fail early with a readable message instead of passing None to Pinecone.
    if not PINECONE_API_KEY or not PINECONE_INDEX_NAME:
        print("Error: PINECONE_API_KEY and PINECONE_INDEX_NAME must be set.")
        return

    # 1. Initialize Pinecone Client
    pc = Pinecone(api_key=PINECONE_API_KEY)
    print("Pinecone client initialized.")

    # 2. Initialize Azure Embeddings Client
    embeddings = AzureOpenAIEmbeddings(
        azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
        model=embedding_model_name,
        azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"],
        api_key=os.environ["AZURE_OPENAI_EMBEDDING_KEY"],
        api_version=os.environ["AZURE_OPENAI_EMBEDDING_API_VERSION"],
    )

    # The dimensions of the text-embedding-3-small model is 1536
    embedding_dimension = 1536
    # Report the model that is actually configured on the client above.
    print(f"Azure Embeddings client initialized for model: {embedding_model_name}")

    # 3. Create Pinecone Index if it doesn't exist
    if PINECONE_INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new serverless index: {PINECONE_INDEX_NAME}")
        pc.create_index(
            name=PINECONE_INDEX_NAME,
            dimension=embedding_dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        print("Index created successfully.")
    else:
        print(f"Index '{PINECONE_INDEX_NAME}' already exists. Skipping creation.")

    index = pc.Index(PINECONE_INDEX_NAME)

    # 4. Load Knowledge Base Data [cite: 16]
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            kb_data = json.load(f)
        print(f"Successfully loaded {len(kb_data)} entries from {DATA_FILE}.")
    except FileNotFoundError:
        print(f"Error: The data file '{DATA_FILE}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from the file '{DATA_FILE}'.")
        return

    # 5. Generate Embeddings and Upsert to Pinecone
    print("Generating embeddings and upserting data to Pinecone...")
    batch_size = 100
    vectors_to_upsert = []

    for doc in tqdm(kb_data, desc="Processing documents"):
        # The vector is created from the entry's 'question'; the answer
        # snippet is stored alongside it in the metadata.
        vector = embeddings.embed_query(doc['question'])

        # Metadata includes the original content and source info
        metadata = {
            'question': doc['question'],
            'answer_snippet': doc['answer_snippet'],
            'source': doc['source'],
            'confidence_indicator': doc['confidence_indicator'],
            'last_updated': doc['last_updated']
        }

        vectors_to_upsert.append({
            'id': doc['doc_id'], # Use doc_id as the unique identifier
            'values': vector,
            'metadata': metadata
        })

        # Upsert in batches
        if len(vectors_to_upsert) >= batch_size:
            index.upsert(vectors=vectors_to_upsert)
            vectors_to_upsert = []

    # Upsert any remaining vectors
    if vectors_to_upsert:
        index.upsert(vectors=vectors_to_upsert)

    print("\nIndexing complete!")
    print(f"Final index stats: {index.describe_index_stats()}")



In [20]:
# Run the indexing pipeline: creates the Pinecone index if needed, then upserts the knowledge base.
setup_and_index()

Starting the indexing process...
Pinecone client initialized.
Azure Embeddings client initialized for model: None
Index 'agentic-rag-index' already exists. Skipping creation.
Successfully loaded 30 entries from self_critique_loop_dataset.json.
Generating embeddings and upserting data to Pinecone...


Processing documents: 100%|██████████| 30/30 [00:07<00:00,  3.86it/s]



Indexing complete!
Final index stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 30}},
 'total_vector_count': 30,
 'vector_type': 'dense'}


# Agentic Framework

In [4]:
import os
import json
import uuid
from typing import List, Dict, TypedDict

import mlflow
from langgraph.graph import StateGraph, END
from langchain_core.pydantic_v1 import BaseModel
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from pinecone import Pinecone


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
# --- Model and Service Initialization ---
# All settings come from environment variables. `os.getenv` yields None when a
# variable is unset, whereas the `os.environ[...]` lookups below raise KeyError.
# NOTE(review): AZURE_EMBEDDING_DEPLOYMENT, AZURE_GENERATION_DEPLOYMENT and
# GOOGLE_API_KEY are assigned here but not referenced again in this cell.
AZURE_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
AZURE_GENERATION_DEPLOYMENT = os.getenv("AZURE_OPENAI_GPT4_MINI_DEPLOYMENT")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Initialize clients
# NOTE: `embedding_model_name` is not defined in this cell; it must already
# exist in the kernel when this cell runs.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
    model=embedding_model_name,
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"],
    api_key=os.environ["AZURE_OPENAI_EMBEDDING_KEY"],
    api_version=os.environ["AZURE_OPENAI_EMBEDDING_API_VERSION"],
)
# Handle to the Pinecone index that the retrieval nodes query.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# LLM for generation
generator_llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_type=os.getenv("OPENAI_API_TYPE"),
    temperature=0.2,
    # streaming=False,
)

# LLM for self-critique
# No API key is passed explicitly here — presumably the client picks up
# GOOGLE_API_KEY from the environment; TODO confirm.
critique_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

In [12]:
# --- State Definition for the Graph ---
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Each node function receives this mapping and returns a state dict.
    """
    question: str  # the user's query; read by every node
    generation: str  # latest LLM-produced list of recommended sources
    documents: List[Dict]  # retrieved entries, each {"doc_id": ..., **match metadata}
    critique: str  # grader verdict; compared against "COMPLETE" when routing
    run_id: str  # MLflow run id placed into the initial state by run_agentic_rag


In [31]:
# --- Node Definitions ---

def retrieve(state: GraphState) -> GraphState:
    """
    Retrieves the top-5 matching documents from the vector database.
    [cite: 20]

    Args:
        state: Current graph state; only ``state["question"]`` is read.

    Returns:
        A new state dict: a copy of ``state`` whose ``"documents"`` key holds
        the freshly retrieved matches, each shaped as
        ``{"doc_id": <match id>, **<match metadata>}``.
    """
    print("---NODE: RETRIEVE---")
    question = state["question"]
    question_embedding = embeddings.embed_query(question)

    # Retrieve top-5 snippets [cite: 20]
    retrieval_results = index.query(vector=question_embedding, top_k=5, include_metadata=True)
    documents = [
        {"doc_id": match['id'], **match['metadata']}
        for match in retrieval_results['matches']
    ]

    print(f"Retrieved {len(documents)} documents.")
    mlflow.log_dict({"initial_retrieval": documents}, "retrieval_results.json")
    # Later keys win in a dict literal: spread the incoming state first so the
    # fresh result replaces any "documents" value it already carries.
    return {**state, "documents": documents}

def generate(state: GraphState) -> GraphState:
    """
    Generates a list of the most relevant source documents using an LLM.
    The LLM acts as a relevance filter on the retrieved documents.

    Args:
        state: Current graph state. Reads ``"question"`` and ``"documents"``
            (each document must provide ``doc_id``, ``source`` and
            ``answer_snippet``). A truthy ``"critique"`` marks the call as the
            refinement pass and only changes the MLflow artifact name.

    Returns:
        A new state dict: a copy of ``state`` whose ``"generation"`` key holds
        the text returned by the generator LLM.
    """
    print("---NODE: GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # Create a detailed context string for the LLM to analyze.
    # Each document's content and source information are clearly presented.
    context_for_llm = "\n\n".join(
        f"Document ID: {doc['doc_id']}\nSource: {doc['source']}\nContent: {doc['answer_snippet']}"
        for doc in documents
    )

    # The prompt instructs the LLM to select and format, not to answer.
    prompt = f"""You are an AI assistant that helps users find the most relevant knowledge base documents.
Your task is to review the user's question and the list of retrieved documents below. From this list, identify and list ONLY the most relevant documents the user should read.

Your response MUST be a list of the source file names, their content and their corresponding document IDs. Format each entry on a new line like this: `source_file.md [DOC_ID]: content `.
Do not add any other text, explanation, or conversational filler.

User's Question: "{question}"

Retrieved Documents:
{context_for_llm}

Relevant Sources:
"""

    generation = generator_llm.invoke(prompt).content
    print(f"Generated Source List:\n{generation}")

    # A critique in the state means this is the refinement stage.
    artifact_name = "refined_answer.txt" if state.get('critique') else "generated_answer.txt"
    mlflow.log_text(generation, artifact_name)

    # Later keys win in a dict literal: spread the incoming state first so the
    # new text replaces a previous "generation" instead of being discarded.
    return {**state, "generation": generation}

def grade_generation(state: GraphState) -> GraphState:
    """
    Critiques the LLM-generated list of source documents against the whole
    knowledge base loaded from ``DATA_FILE``.

    Args:
        state: Current graph state; reads ``"question"`` and ``"generation"``.

    Returns:
        A new state dict: a copy of ``state`` whose ``"critique"`` key holds
        the grader's reply, stripped and upper-cased.
    """
    print("---NODE: SELF-CRITIQUE---")
    question = state["question"]
    generation = state["generation"] # The list of sources from the generate node

    # Full knowledge base, not just the documents held in the state.
    # Explicit encoding so the result does not depend on the platform default.
    with open(DATA_FILE, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    # Same per-document layout the generator prompt uses.
    # NOTE(review): the prompt below labels this as the pool of *retrieved*
    # documents, but every knowledge-base entry is supplied — confirm that
    # this is intended.
    full_context = "\n\n".join(
        f"Document ID: {doc['doc_id']}\nSource: {doc['source']}\nContent: {doc['answer_snippet']}"
        for doc in documents
    )

    prompt = f"""You are a grader. An AI assistant was given a user's question and a pool of retrieved documents. The assistant then selected a subset of those documents as a final recommendation.

Your task is to evaluate if this final recommendation is sufficient and relevant based on the full context provided.
- If the recommended list accurately and sufficiently addresses the user's question, respond 'COMPLETE'.
- If the list is missing key documents from the available pool or seems irrelevant, respond 'REFINE'.

User's Question: "{question}"

Full Pool of Retrieved Documents:
{full_context}

AI's Recommended List:
{generation}

Decision (COMPLETE or REFINE):
"""

    critique = critique_llm.invoke(prompt).content.strip().upper()
    print(f"Critique Result: {critique}")
    mlflow.log_param("critique_result", critique)
    # Later keys win in a dict literal: spread the incoming state first so the
    # new verdict replaces any earlier "critique".
    return {**state, "critique": critique}

def refine(state: GraphState) -> GraphState:
    """
    Retrieves one additional document and regenerates the answer.
    [cite: 25]

    Args:
        state: Graph state after grading; reads ``"question"`` and
            ``"documents"``. The incoming document list is not modified.

    Returns:
        The state dict returned by ``generate`` for the extended document
        list. If the index yields no unseen match, the answer is regenerated
        from the documents already held.
    """
    print("---NODE: REFINE---")
    question = state["question"]
    current_documents = state['documents']
    existing_doc_ids = {doc['doc_id'] for doc in current_documents}

    question_embedding = embeddings.embed_query(question)

    # Ask for one more match than we already hold so at least one result can
    # be new (this is top_k=6 for the usual five retrieved documents).
    retrieval_results = index.query(
        vector=question_embedding,
        top_k=len(existing_doc_ids) + 1,
        include_metadata=True
    )

    # Copy instead of aliasing so the list owned by the incoming state is untouched.
    new_documents = list(current_documents)
    for match in retrieval_results['matches']:
        if match['id'] not in existing_doc_ids:
            new_doc = {"doc_id": match['id'], **match['metadata']}
            new_documents.append(new_doc)
            print(f"Refinement: Added new document {match['id']}")
            mlflow.log_dict({"refinement_doc": new_doc}, "refinement_doc.json")
            break

    # Later keys win in a dict literal, so the extended list goes after **state.
    new_state = {**state, "documents": new_documents}
    # Drop the graded draft so it cannot shadow the regenerated answer when
    # generate merges its result back into the state.
    new_state.pop("generation", None)
    refined_state = generate(new_state)

    mlflow.log_text(refined_state['generation'], "refined_answer.txt")
    return refined_state

In [32]:
# --- Conditional Edge Logic ---

def decide_to_finish(state: GraphState) -> str:
    """
    Determines the next step based on the critique.
    [cite: 26]

    Args:
        state: Current graph state; reads the optional ``"critique"`` value.

    Returns:
        ``"end"`` when the critique is the single word COMPLETE (ignoring
        case, surrounding whitespace, punctuation and markdown emphasis),
        otherwise ``"refine"`` — including when no critique is present.
    """
    print("---EDGE: DECISION---")
    # Chat models often decorate a one-word verdict ("COMPLETE.", "**COMPLETE**");
    # strip that decoration before comparing. Whole-word equality is kept so
    # that replies such as "INCOMPLETE" still route to refinement.
    verdict = (state.get("critique") or "").strip(" \t\r\n.,:;!*_`'\"").upper()
    if verdict == "COMPLETE":
        print("Decision: COMPLETE. Ending workflow.")
        return "end"

    print("Decision: REFINE. Proceeding to refinement.")
    return "refine"

# --- Build the Graph ---

def build_graph():
    """Assemble the retrieve -> generate -> grade -> (refine) workflow and compile it."""
    graph = StateGraph(GraphState)

    # Register each node under the name the edges below refer to.
    for node_name, node_fn in (
        ("retrieve", retrieve),
        ("generate", generate),
        ("grade_generation", grade_generation),
        ("refine", refine),
    ):
        graph.add_node(node_name, node_fn)

    # Execution starts with retrieval.
    graph.set_entry_point("retrieve")

    # Unconditional hops of the pipeline.
    for source, target in (
        ("retrieve", "generate"),
        ("generate", "grade_generation"),
    ):
        graph.add_edge(source, target)

    # After grading, either stop or hand over to the refinement node.
    routes = {"refine": "refine", "end": END}
    graph.add_conditional_edges("grade_generation", decide_to_finish, routes)

    # Max 1 refinement step [cite: 60]: refine always leads to END.
    graph.add_edge("refine", END)

    return graph.compile()

In [33]:
# --- Main Execution ---

def run_agentic_rag(query: str):
    """
    Run the compiled workflow for one query inside a fresh MLflow run.

    The query is logged as a run parameter; the final answer and the final
    graph state are logged as artifacts. The answer is also printed.

    Args:
        query: The user's question, passed to the graph as ``"question"``.
    """
    mlflow.set_experiment("Agentic RAG System")

    with mlflow.start_run() as active_run:
        mlflow_run_id = active_run.info.run_id
        print(f"Starting MLflow Run ID: {mlflow_run_id}")
        mlflow.log_param("user_query", query)

        # Build the graph and execute it from the initial state in one go.
        result_state = build_graph().invoke(
            {"question": query, "run_id": mlflow_run_id}
        )
        answer_text = result_state["generation"]

        print("\n--- FINAL ANSWER ---", answer_text, sep="\n")

        mlflow.log_text(answer_text, "final_answer.txt")
        mlflow.log_dict(result_state, "final_state.json")
        print("\n--- MLflow logging complete. View run at http://127.0.0.1:8080 ---")

In [38]:
# Sample queries from the assignment.
# Exactly one assignment is active; move the comment marker to try another query.
sample_query = "What are best practices for caching?"
# sample_query = "How should I set up CI/CD pipelines?"
# sample_query = "What are performance tuning tips?"
# sample_query = "How do I version my APIs?"
# sample_query = "What should I consider for error handling?"

# Execute the full pipeline for the selected query (prints the final answer).
run_agentic_rag(sample_query)

Starting MLflow Run ID: 670991b7b29b42afab580a9a04290409
---NODE: RETRIEVE---
Retrieved 5 documents.
---NODE: GENERATE---
Generated Source List:
caching_guide.md [KB003]: When addressing caching, it's important to follow well-defined patterns...  
caching_guide.md [KB023]: When addressing caching, it's important to follow well-defined patterns...  
caching_guide.md [KB013]: When addressing caching, it's important to follow well-defined patterns...  
---NODE: SELF-CRITIQUE---
Critique Result: COMPLETE
---EDGE: DECISION---
Decision: COMPLETE. Ending workflow.

--- FINAL ANSWER ---
caching_guide.md [KB003]: When addressing caching, it's important to follow well-defined patterns...  
caching_guide.md [KB023]: When addressing caching, it's important to follow well-defined patterns...  
caching_guide.md [KB013]: When addressing caching, it's important to follow well-defined patterns...  

--- MLflow logging complete. View run at http://127.0.0.1:8080 ---
