<a href="https://colab.research.google.com/github/maheshboj/agenticai_basics/blob/Langchain_components/RAG_CHAT_BOT_EVALUATION_TM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install dependencies


In [33]:
%pip install -qU langchain langchain-community langchain-openai langchain-chroma pypdf

In [34]:
%pip install -qU "unstructured[all-docs]"

In [35]:
%pip install -qU langsmith

In [36]:
from google.colab import userdata


#Environment variables

In [37]:
import os
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_APIKEY')
os.environ["LANGSMITH_API_KEY"] = userdata.get('LANGSMITH_API')
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "langsmith_rag_tm_eval"

# 2. Load documents (PDF example)

In [38]:
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
loader = TextLoader("/content/Tech_Mahindra_Report_original.md")
doc = loader.load()

In [39]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

loader = UnstructuredMarkdownLoader("/content/Tech_Mahindra_Report_original.md", mode="elements")
docs = loader.load()

In [40]:
len(docs)

162

# 3. Split into chunks

In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # Character chunks
    chunk_overlap=200,    # Overlap for context
    add_start_index=True  # Track source location
)
splits = text_splitter.split_documents(docs)
print(f"Split into {len(splits)} chunks")

Split into 162 chunks


# 4. Embeddings & Vector Store (Chroma - local, persistent)

In [42]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")  # Fast/cheap embeddings

# Filter complex metadata before creating the vectorstore
filtered_splits = filter_complex_metadata(splits)

vectorstore = Chroma.from_documents(
    documents=filtered_splits,
    embedding=embeddings,
    persist_directory="./techm_db"  # Saves locally
)

# 5. Create retriever (k=4 most relevant docs)

In [43]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# 6. RAG Chain: Retrieve → Prompt → LLM → Parse

In [76]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough,RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langsmith import traceable
llm = ChatOpenAI(model="gpt-4.1", temperature=0)

prompt = ChatPromptTemplate.from_template("""
Answer the question based ONLY on the provided context.
If you can't answer from context, say "I don't have enough information."

Context: {context}

Question: {question}
Answer:
""")

# Chain: docs → text → prompt → LLM → string
@traceable(run_type="chain")
def call_rag_chain(question: str) -> str:
  rag_chain = (
          RunnableParallel({
          "context": retriever,
          "question": RunnablePassthrough()
      }) # Input → retrieve + pass-through
      | prompt
      | llm
      | StrOutputParser()
  )
  answer = rag_chain.invoke(question)
  print(f"Q: {question}\nA: {answer}")
  return answer

In [77]:
# Test it!
question = "give me details of work from home policy or WFH ?"
call_rag_chain(question)

Q: give me details of work from home policy or WFH ?
A: Work-from-Home (WFH) at Tech Mahindra includes flexible work-from-home arrangements that:

- Ensure work continuity and employee safety
- Support work-life balance and productivity
- Are particularly utilized for pandemic resilience

Additionally, there are multiple flexible work options across all countries, designed to balance work and personal commitments and to ensure a comfortable and productive work environment.


'Work-from-Home (WFH) at Tech Mahindra includes flexible work-from-home arrangements that:\n\n- Ensure work continuity and employee safety\n- Support work-life balance and productivity\n- Are particularly utilized for pandemic resilience\n\nAdditionally, there are multiple flexible work options across all countries, designed to balance work and personal commitments and to ensure a comfortable and productive work environment.'

In [78]:
import uuid
thread_id = uuid.uuid4()

In [79]:
thread_id

UUID('abf54791-da73-43e8-99e0-638147581291')

In [80]:
# 7. Test the bot!
question = ""
answer = call_rag_chain(question)
# Interactive chat loop
while True:
    query = input("\nAsk a question (or 'quit'): ")
    if query.lower() == 'quit':
        break
    print(f"A: {call_rag_chain(query,langsmith_extra={"metadata": {"thread_id": thread_id}})}")

Q: 
A: I don't have enough information.

Ask a question (or 'quit'): how many leave we have for paternity
Q: how many leave we have for paternity
A: Paternity leave duration is 2 days.
A: Paternity leave duration is 2 days.

Ask a question (or 'quit'): quit


# Generate Questions Automatically from the chunks

In [81]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# 2. Define a ChatPromptTemplate named question_gen_prompt
question_gen_prompt = ChatPromptTemplate.from_template("""
Generate a single question that can be answered SOLELY from the following context.

Context: {context}

Question:
""")

# 3. Initialize a ChatOpenAI model for generating questions
question_gen_llm = ChatOpenAI(model="gpt-4.1", temperature=0)

# 4. Create an LLM chain named question_gen_chain
question_gen_chain = (
    {"context": lambda x: x}
    | question_gen_prompt
    | question_gen_llm
    | StrOutputParser()
)

# 5. Initialize an empty list called generated_questions
generated_questions = []

# 6. Iterate through each split and generate a question
for i, split in enumerate(splits):
    question = question_gen_chain.invoke(split.page_content)
    generated_questions.append(question)
    print(f"Generated question for chunk {i+1}: {question}")

print(f"\nTotal {len(generated_questions)} questions generated.")


Generated question for chunk 1: What is the title of the report discussed in the context?
Generated question for chunk 2: Question: What three topics are covered in the provided context?
Generated question for chunk 3: What is the primary focus of Tech Mahindra Limited's report dated December 2025?
Generated question for chunk 4: What is the main purpose of the Executive Summary?
Generated question for chunk 5: What is the total number of professionals employed by Tech Mahindra according to the report?
Generated question for chunk 6: What types of leave and work-life balance initiatives are included in the company's HR policies?
Generated question for chunk 7: What information does the context provide about current headcount, historical trends, and attrition rates?
Generated question for chunk 8: What is the name of the initiative mentioned in the AI Strategy & Future Roadmap that focuses on delivering AI effectively?
Generated question for chunk 9: What section of the document contain

**Reasoning**:
The next step is to iterate through the `generated_questions` and use the `call_rag_chain` function to find answers for each question. The questions and their answers will then be compiled into `document_qa_pairs`.



In [82]:
sample_generated_questions = []

for i, split in enumerate(splits[:10]):
    question = question_gen_chain.invoke(split.page_content)
    sample_generated_questions.append(question)
    print(f"Generated question for chunk {i+1}: {question}")

print(f"\nTotal {len(sample_generated_questions)} sample questions generated.")

Generated question for chunk 1: What is the title of the report discussed in the context?
Generated question for chunk 2: What are the three topics covered in the context provided?
Generated question for chunk 3: What is the primary focus of Tech Mahindra Limited's report dated December 2025?
Generated question for chunk 4: What is the main purpose of the Executive Summary?
Generated question for chunk 5: What is the total number of professionals employed by Tech Mahindra according to the report?
Generated question for chunk 6: What types of leave and work-life balance initiatives are included in the company's HR policies?
Generated question for chunk 7: What information does the context provide about current headcount, historical trends, and attrition rates?
Generated question for chunk 8: What is the name of the initiative mentioned in the AI Strategy & Future Roadmap that focuses on delivering AI effectively?
Generated question for chunk 9: What section of the document contains info

In [83]:
sample_qa_pairs = []
inputs=[]
outputs=[]

for question in sample_generated_questions:
    # Use the existing call_rag_chain function to get the answer
    answer = call_rag_chain(question)
    inputs.append({"question": question})
    outputs.append({"output": answer})


Q: What is the title of the report discussed in the context?
A: The title of the report discussed in the context is "Tech Mahindra: Comprehensive Report".
Q: What are the three topics covered in the context provided?
A: The three topics covered in the context provided are:
1. 3.4 Industry Focus Areas
2. 3.3 Key AI Technologies & Frameworks
3. (No third distinct topic is mentioned; only the two above are present in the context.)

I don't have enough information to identify a third topic.
Q: What is the primary focus of Tech Mahindra Limited's report dated December 2025?
A: The primary focus of Tech Mahindra Limited's report dated December 2025 is HR Policies, Workforce Metrics, and Strategic AI Direction.
Q: What is the main purpose of the Executive Summary?
A: I don't have enough information.
Q: What is the total number of professionals employed by Tech Mahindra according to the report?
A: According to the report, Tech Mahindra employs 150,000+ professionals.
Q: What types of leave and

In [84]:
inputs

[{'question': 'What is the title of the report discussed in the context?'},
 {'question': 'What are the three topics covered in the context provided?'},
 {'question': "What is the primary focus of Tech Mahindra Limited's report dated December 2025?"},
 {'question': 'What is the main purpose of the Executive Summary?'},
 {'question': 'What is the total number of professionals employed by Tech Mahindra according to the report?'},
 {'question': "What types of leave and work-life balance initiatives are included in the company's HR policies?"},
 {'question': 'What information does the context provide about current headcount, historical trends, and attrition rates?'},
 {'question': 'What is the name of the initiative mentioned in the AI Strategy & Future Roadmap that focuses on delivering AI effectively?'},
 {'question': 'What section of the document contains information about leave policy and employee benefits?'},
 {'question': 'What type of leave is described in section 1.1?'}]

In [85]:
outputs

[{'output': 'The title of the report discussed in the context is "Tech Mahindra: Comprehensive Report".'},
 {'output': "The three topics covered in the context provided are:\n1. 3.4 Industry Focus Areas\n2. 3.3 Key AI Technologies & Frameworks\n3. (No third distinct topic is mentioned; only the two above are present in the context.)\n\nI don't have enough information to identify a third topic."},
 {'output': "The primary focus of Tech Mahindra Limited's report dated December 2025 is HR Policies, Workforce Metrics, and Strategic AI Direction."},
 {'output': "I don't have enough information."},
 {'output': 'According to the report, Tech Mahindra employs 150,000+ professionals.'},
 {'output': "The company's HR policies include earned leaves, maternity/paternity benefits, and work-life balance initiatives supported through flexible policies."},
 {'output': 'The context provides information about attrition rates, stating that attrition has gradually increased from approximately 11% to 12.8%

In [86]:
from langsmith import Client

client = Client()
# TODO: Fill in dataset id
dataset_id = "5b2bdb15-bd58-4db6-8c63-381a9f034a91"

client.create_examples(
  inputs=inputs,
  outputs=outputs,
  dataset_id=dataset_id,
)

{'example_ids': ['224d485a-a93f-4002-822e-fb60f50cf625',
  '1f314c93-571b-4918-8035-dc30fd29b407',
  '638016bf-5e8e-4b7d-addb-64be03d8420a',
  '835ba8ba-9c44-4a9f-beb4-de074ea497f5',
  'f8eab131-4f58-4b40-b7ed-1b3a2085c337',
  '46a4b895-6d54-49c3-84a5-caf50d958ee2',
  '127da058-899c-4b91-ae3a-b4d0b79559c0',
  'bf2d8b13-efd2-4997-b7c5-0678dcdb50ec',
  '031efce0-3945-4d4c-afc4-879db55a6190',
  'fe575634-fa78-4085-86f6-cc177444de1b'],
 'count': 10}

# LLM as a Judge Evaluator

In [87]:
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy

In [88]:
eval_prompt="""You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
                    "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
                    "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."""

In [89]:
from pydantic import BaseModel, Field
class Similarity_Score(BaseModel):
    similarity_score: int = Field(description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.")


In [90]:
agent= create_agent(
    model="gpt-4.1",
    tools=[],
    system_prompt=eval_prompt,
    response_format=ToolStrategy(Similarity_Score)
)

In [91]:
from openai import OpenAI
from pydantic import BaseModel, Field

client = OpenAI()

class Similarity_Score(BaseModel):
    similarity_score: int = Field(description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.")

# NOTE: This is our evaluator
def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict):
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    agent= create_agent(
    model="gpt-4o-mini",
    tools=[],
    system_prompt=eval_prompt,
    response_format=ToolStrategy(Similarity_Score)
    )
    result=agent.invoke({"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n Run Response: {run_response}"})

    return {"score": result['structured_response'].similarity_score, "key": "similarity"}


In [92]:
# From Dataset Example
inputs = {
  "question": "What is the primary focus of the report for Tech Mahindra Limited in December 2025?"
}
reference_outputs = {
  "output": "you can carru forward 17 leaves"
}


# From Run
outputs = {
  "output": "The primary focus of the report for Tech Mahindra Limited in December 2025 is on HR Policies, Workforce Metrics, and Strategic AI Direction"
}

In [93]:
compare_semantic_similarity(inputs,reference_outputs,outputs)

{'score': 8, 'key': 'similarity'}

In [94]:
def compare_semantic_similarity_eval(reference_outputs: dict, outputs: dict):
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    agent= create_agent(
    model="gpt-4o-mini",
    tools=[],
    system_prompt=eval_prompt,
    response_format=ToolStrategy(Similarity_Score)
    )
    result=agent.invoke({"role": "user", "content": f"Question: {input_question}\n Reference Response: {reference_response}\n Run Response: {run_response}"})

    return {"score": result['structured_response'].similarity_score, "key": "similarity"}

# Evaluate

In [95]:
from langsmith import evaluate, Client

client = Client()
dataset_name = "techm_chatbot_golden_dataset"

def is_concise_enough(reference_outputs: dict, outputs: dict) -> dict:
    score = len(outputs["output"]) < 1.5 * len(reference_outputs["output"])
    return {"key": "is_concise", "score": int(score)}

def target_function(inputs: dict):
    return call_rag_chain(inputs["question"])

evaluate(
    target_function,
    data=dataset_name,
    evaluators=[is_concise_enough,compare_semantic_similarity_eval],
    experiment_prefix="gpt-4o-mini"
)

View the evaluation results for experiment: 'gpt-4o-mini-c17d0f93' at:
https://smith.langchain.com/o/b14e6248-7eea-4871-b9cf-55af6d804596/datasets/5b2bdb15-bd58-4db6-8c63-381a9f034a91/compare?selectedSessions=4bfa2696-c6a1-4bbc-973c-92ab9ac74cdd




0it [00:00, ?it/s]

Q: What is the name of the initiative mentioned in the AI Strategy & Future Roadmap that focuses on delivering AI effectively?
A: The name of the initiative is "AI Delivered Right."
Q: What is the total number of professionals employed by Tech Mahindra according to the report?
A: According to the report, Tech Mahindra employs 150,000+ professionals.
Q: What are the three topics covered in the context provided?
A: The three topics covered in the context provided are:
1. 3.3 Key AI Technologies & Frameworks
2. 3.4 Industry Focus Areas
3. Title (implied by the category, but the explicit topics listed are the two above)

Note: Only "3.3 Key AI Technologies & Frameworks" and "3.4 Industry Focus Areas" are explicitly mentioned as topics in the provided context.
Q: What is the title of the report discussed in the context?
A: The title of the report discussed in the context is "Tech Mahindra: Comprehensive Report".
Q: What is the main purpose of the Executive Summary?
A: I don't have enough in

Unnamed: 0,inputs.question,outputs.output,error,reference.output,feedback.is_concise,feedback.similarity,execution_time,example_id,id
0,What is the name of the initiative mentioned i...,"The name of the initiative is ""AI Delivered Ri...",,"The name of the initiative is ""AI Delivered Ri...",1,7,1.199694,031efce0-3945-4d4c-afc4-879db55a6190,019b01bd-c99a-79f1-bdee-47c7f12c1cd0
1,What is the total number of professionals empl...,"According to the report, Tech Mahindra employs...",,"According to the report, Tech Mahindra employs...",1,8,1.085359,127da058-899c-4b91-ae3a-b4d0b79559c0,019b01bd-d9cb-7421-804f-587af57ee039
2,What are the three topics covered in the conte...,The three topics covered in the context provid...,,The three topics covered in the context provid...,1,7,2.42708,1f314c93-571b-4918-8035-dc30fd29b407,019b01bd-ebc0-7ed2-af00-bb954e5d7fea
3,What is the title of the report discussed in t...,The title of the report discussed in the conte...,,The title of the report discussed in the conte...,1,8,0.857889,224d485a-a93f-4002-822e-fb60f50cf625,019b01bd-ff5d-7182-9ef2-eb1475efd0b7
4,What is the main purpose of the Executive Summ...,I don't have enough information.,,I don't have enough information.,1,8,0.869548,46a4b895-6d54-49c3-84a5-caf50d958ee2,019b01be-0c2e-7a52-bcb1-0bda64a29f7e
5,What is the primary focus of Tech Mahindra Lim...,The primary focus of Tech Mahindra Limited's r...,,The primary focus of Tech Mahindra Limited's r...,1,8,1.063294,638016bf-5e8e-4b7d-addb-64be03d8420a,019b01be-1a0e-7a20-a508-3ace034b4f1e
6,What information does the context provide abou...,The context provides information about attriti...,,The context provides information about attriti...,1,8,1.801732,835ba8ba-9c44-4a9f-beb4-de074ea497f5,019b01be-28b6-7bb0-8d81-ad4cecbe0300
7,What types of leave and work-life balance init...,The company's HR policies include earned leave...,,The company's HR policies include earned leave...,1,8,1.21589,bf2d8b13-efd2-4997-b7c5-0678dcdb50ec,019b01be-3d1c-7681-bd40-a31685c32396
8,What type of leave is described in section 1.1?,I don't have enough information.,,I don't have enough information.,1,8,1.016769,f8eab131-4f58-4b40-b7ed-1b3a2085c337,019b01be-4cc8-7152-a178-c90315a44839
9,What section of the document contains informat...,Section 1: Leave Policy & Employee Benefits,,Section 1: Leave Policy & Employee Benefits,1,8,1.130552,fe575634-fa78-4085-86f6-cc177444de1b,019b01be-59b0-79c1-868c-f04ec9b4a844
