In [2]:

import os
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
load_dotenv()

# load_dotenv() has already copied any .env entries into os.environ, so
# re-assigning the key adds nothing -- and `os.environ[key] = None` raises an
# opaque "str expected, not NoneType" TypeError when the key is missing.
# Fail fast with an actionable message instead.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

# Chat model handle for this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
)

In [3]:
# Reference Q&A pairs. Each question sits next to its expected answer so the
# two cannot drift out of alignment when entries are added or reordered.
qa_reference = [
    (
        "What major trends were observed in large language model (LLM) development between 2024 and 2025?",
        "The 2024–2025 period saw major LLM releases like Meta’s Llama 4 family with multimodal and mixture-of-experts features, Google’s Gemini 2.5 and Gemma models with strong on-device capabilities, and continued updates from OpenAI, Anthropic, and others such as GPT-5, Grok 4, and Claude 4.",
    ),
    (
        "What improvements were made in embedding models during this time?",
        "OpenAI launched the text-embedding-3 family (small and large), offering better cost-performance for RAG, while Google introduced EmbeddingGemma for efficient on-device semantic search. The ecosystem overall focused on latency, recall metrics, and quantization for production use.",
    ),
    (
        "What is meant by 'agentic AI,' and how did it evolve in 2025?",
        "Agentic AI refers to systems that autonomously plan and execute multi-step tasks. In 2025, it became a major industry trend, with vendors releasing SDKs, production toolkits, and enterprise frameworks like Anthropic’s 'Skills' and Salesforce’s Agentforce.",
    ),
    (
        "What challenges or cautions were associated with agentic AI adoption?",
        "Analysts warned of 'agent washing'—projects branded as agentic without real value. Gartner predicted over 40% of agentic AI projects might be scrapped by 2027 due to unclear ROI and high engineering costs.",
    ),
    (
        "What practical advice does the summary offer to engineers and researchers?",
        "The document advises selecting model families by task type, benchmarking embeddings for RAG pipelines, using hybrid retrieval for cost efficiency, and defining clear KPIs when building agentic systems to avoid over-generalization and failure.",
    ),
]

# Parallel lists used by the cells below.
questions = [question for question, _ in qa_reference]
answers = [answer for _, answer in qa_reference]



In [4]:

# Show every question directly above its reference answer.
for question, reference in zip(questions, answers):
    print(f"Q: {question}\nA: {reference}\n")

Q: What major trends were observed in large language model (LLM) development between 2024 and 2025?
A: The 2024–2025 period saw major LLM releases like Meta’s Llama 4 family with multimodal and mixture-of-experts features, Google’s Gemini 2.5 and Gemma models with strong on-device capabilities, and continued updates from OpenAI, Anthropic, and others such as GPT-5, Grok 4, and Claude 4.

Q: What improvements were made in embedding models during this time?
A: OpenAI launched the text-embedding-3 family (small and large), offering better cost-performance for RAG, while Google introduced EmbeddingGemma for efficient on-device semantic search. The ecosystem overall focused on latency, recall metrics, and quantization for production use.

Q: What is meant by 'agentic AI,' and how did it evolve in 2025?
A: Agentic AI refers to systems that autonomously plan and execute multi-step tasks. In 2025, it became a major industry trend, with vendors releasing SDKs, production toolkits, and enterpris

In [9]:

# One record per question: {"question": ..., "answer": ...}.
QA_pairs = [
    dict(question=question, answer=reference)
    for question, reference in zip(questions, answers)
]


In [10]:
# Inspect the paired records (bare expression so the notebook renders the list).
QA_pairs

[{'question': 'What major trends were observed in large language model (LLM) development between 2024 and 2025?',
  'answer': 'The 2024–2025 period saw major LLM releases like Meta’s Llama 4 family with multimodal and mixture-of-experts features, Google’s Gemini 2.5 and Gemma models with strong on-device capabilities, and continued updates from OpenAI, Anthropic, and others such as GPT-5, Grok 4, and Claude 4.'},
 {'question': 'What improvements were made in embedding models during this time?',
  'answer': 'OpenAI launched the text-embedding-3 family (small and large), offering better cost-performance for RAG, while Google introduced EmbeddingGemma for efficient on-device semantic search. The ecosystem overall focused on latency, recall metrics, and quantization for production use.'},
 {'question': "What is meant by 'agentic AI,' and how did it evolve in 2025?",
  'answer': "Agentic AI refers to systems that autonomously plan and execute multi-step tasks. In 2025, it became a major ind

In [11]:
import pandas as pd
# Tabular view of the reference pairs: one row per record, one column per key.
df = pd.DataFrame.from_records(QA_pairs)

In [12]:
# Display the DataFrame built from the Q&A records.
df

Unnamed: 0,question,answer
0,What major trends were observed in large langu...,The 2024–2025 period saw major LLM releases li...
1,What improvements were made in embedding model...,OpenAI launched the text-embedding-3 family (s...
2,"What is meant by 'agentic AI,' and how did it ...",Agentic AI refers to systems that autonomously...
3,What challenges or cautions were associated wi...,Analysts warned of 'agent washing'—projects br...
4,What practical advice does the summary offer t...,The document advises selecting model families ...


In [13]:
from pathlib import Path

# Destination for the reference Q&A CSV. The original hard-coded location only
# exists on one machine, so allow an override through the QA_CSV_PATH
# environment variable; the default keeps the original path.
qa_csv_path = Path(
    os.getenv(
        "QA_CSV_PATH",
        r"C:\Users\manas\OneDrive\Desktop\multi-doc-chat\notebook\domy.csv",
    )
)

# Create the target folder first so to_csv does not fail on a missing directory.
qa_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(qa_csv_path, index=False)

In [14]:
from langsmith import Client
# Name of the LangSmith dataset that will hold the reference Q&A pairs.
dataset_name = "aidommydataset"
client = Client()

# Register the dataset in LangSmith.
# NOTE(review): re-running this cell while a dataset with this name already
# exists presumably fails -- confirm, and guard the call if the notebook must
# survive Restart & Run All.
dataset_kwargs = dict(
    dataset_name=dataset_name,
    description="question and expected output of respectinve question ",
)
dataset = client.create_dataset(**dataset_kwargs)

In [15]:
# Upload the reference pairs: each question is an example input and the
# matching answer is its expected output.
example_inputs = [{"question": question} for question in questions]
example_outputs = [{"answer": reference} for reference in answers]
client.create_examples(
    dataset_id=dataset.id,
    inputs=example_inputs,
    outputs=example_outputs,
)

{'example_ids': ['87ef4f47-9017-487c-94b0-6c0708254b13',
  'd5b0b18f-bb12-44a5-aabd-ba77ce2fa7bd',
  'bb9e6614-dba3-40f0-963e-d5fa3c607e30',
  'ff5c6d3a-d537-46c1-9e9f-7d950e9fd14d',
  '7feddda8-87b0-4646-a106-82cc2cbc2b25'],
 'count': 5}

In [19]:
import sys
# Add the project directory to the import path so the MultiDocChat imports below resolve.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
sys.path.append(r"C:\Users\manas\OneDrive\Desktop\multi-doc-chat")

from pathlib import Path
from MultiDocChat.src.document_ingetion.ingest import ChatIngestor
from MultiDocChat.src.document_chat.retriver import ConversationalRAG
import os

# Simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a file on disk so it can be handed to ChatIngestor as an upload.

    Exposes ``path`` (a ``Path``), ``name`` (the file's base name) and
    ``getbuffer()`` -- presumably the surface ChatIngestor reads from uploaded
    files; confirm against the ingestor.
    """

    def __init__(self, file_path: str):
        location = Path(file_path)
        self.path = location
        self.name = location.name

    def getbuffer(self) -> bytes:
        """Return the complete contents of the wrapped file as bytes."""
        with self.path.open("rb") as handle:
            return handle.read()
def answer_ai_report_question(
    inputs: dict,
    data_path: str = r"C:\Users\manas\OneDrive\Desktop\multi-doc-chat\data\rag_data.txt",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Run a single question through the RAG pipeline built from a local text file.

    Each call constructs a new ChatIngestor, builds a retriever from
    ``data_path``, loads the index for that ingestor's session into a
    ConversationalRAG, and invokes it with an empty chat history. Failures are
    never raised: they are reported inside the returned answer text.

    Args:
        inputs: Mapping that carries the question under the "question" key.
        data_path: Text file to ingest.
        chunk_size: Chunk size forwarded to the ingestor.
        chunk_overlap: Chunk overlap forwarded to the ingestor.
        k: Number of documents to retrieve; forwarded to both the ingestor and
            the retriever loader.

    Returns:
        ``{"answer": ...}`` holding the RAG result, or a message when the
        question is missing, the file does not exist, or any exception occurs.
    """
    try:
        user_question = inputs.get("question", "")
        if not user_question:
            return {"answer": "No question provided"}

        source_file = Path(data_path)
        if not source_file.exists():
            return {"answer": f"Data file not found: {data_path}"}

        # Ingest the file with session directories enabled.
        ingestor = ChatIngestor(
            temp_base_file="data",
            faiss_base_file="faiss_index",
            use_session_dirs=True,
        )
        ingestor.built_retriver(
            uploaded_files=[LocalFileAdapter(data_path)],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k,
        )

        # The index location is derived from the ingestor's session id.
        active_session = ingestor.session_id
        rag = ConversationalRAG(session_id=active_session)
        rag.load_retriever_from_faiss(
            index_path=f"faiss_index/{active_session}",
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index"),
        )

        return {"answer": rag.invoke(user_question, chat_history=[])}

    except Exception as exc:
        # Best effort: surface the failure as the answer instead of raising.
        return {"answer": f"Error: {str(exc)}"}

In [21]:
# Smoke-test the pipeline with one question that is not in the reference list.
test_input = dict(
    question="Which companies integrated agentic AI capabilities into their enterprise products in 2025?"
)
result = answer_ai_report_question(test_input)
print("Question:", test_input["question"])
print("\nAnswer:", result["answer"])

{"timestamp": "2025-11-12T09:29:19.692740Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:29:19.694741Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:29:19.695737Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:29:19.699281Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"session_id": "session_20251112_145919_d7bfaae8", "temp_dir": "data\\session_20251112_145919_d7bfaae8", "faiss_dir": "faiss_index\\session_20251112_145919_d7bfaae8", "sessionized": true, "timestamp": "2025-11-12T09:29:19.702285Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "rag_data.txt", "saved_as": "data\\session_20251112_145919_d7bfaae8\\9163ee0d.txt", "timestamp": "2025-11-12T09:29:19.704752Z", "level": "info", "event": "File saved for ingestion"}
{"count": 1, "timestamp": "2025-11-12T09:29:19.710351Z", "level": "

Question: Which companies integrated agentic AI capabilities into their enterprise products in 2025?

Answer: In 2025, companies such as Dalet, Salesforce, IBM, and Oracle integrated agentic AI capabilities into their enterprise products, offering guides and product offerings for specific workflows. Additionally, vendors like Anthropic rolled out 'Skills' and OpenAI announced 'AgentKit' as part of their efforts in enterprise agents and toolkits.


In [22]:
# Ask every reference question and print the generated answer, with a rule
# between entries.
separator = "-" * 80 + "\n"

print("Testing all questions from the dataset:\n")
for number, question in enumerate(questions, start=1):
    test_input = {"question": question}
    result = answer_ai_report_question(test_input)
    print(f"Q{number}: {question}")
    print(f"A{number}: {result['answer']}\n")
    print(separator)

{"timestamp": "2025-11-12T09:32:18.040068Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:18.041069Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:18.042068Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:18.047535Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"session_id": "session_20251112_150218_d5e8873e", "temp_dir": "data\\session_20251112_150218_d5e8873e", "faiss_dir": "faiss_index\\session_20251112_150218_d5e8873e", "sessionized": true, "timestamp": "2025-11-12T09:32:18.054537Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "rag_data.txt", "saved_as": "data\\session_20251112_150218_d5e8873e\\c4bcd527.txt", "timestamp": "2025-11-12T09:32:18.058535Z", "level": "info", "event": "File saved for ingestion"}
{"count": 1, "timestamp": "2025-11-12T09:32:18.062534Z", "level": "

Testing all questions from the dataset:



{"added": 1, "index": "faiss_index\\session_20251112_150218_d5e8873e", "timestamp": "2025-11-12T09:32:20.028853Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T09:32:20.029862Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T09:32:20.033878Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:20.034962Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:20.036859Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:20.039863Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T09:32:20.041874Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_150218_d5e8873e", "timestamp": "2025-11-12T09:32:20.045861Z", "level": "info", "

Q1: What major trends were observed in large language model (LLM) development between 2024 and 2025?
A1: Between 2024 and 2025, major trends in LLM development included the expansion of multimodal capabilities, such as image and video understanding, seen in Meta's Llama 4 family and Google's Gemini models. There was also an increased adoption of mixture-of-experts architectures to enhance efficiency. Additionally, companies like OpenAI, Alibaba, Anthropic, Mistral, and Falcon continued to release iterative models, expanding available choices with varied licensing and sizes.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251112_150224_c4d33f10", "timestamp": "2025-11-12T09:32:26.664666Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T09:32:26.666676Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T09:32:26.669764Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:26.671674Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:26.672769Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:26.675763Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T09:32:26.675763Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_150224_c4d33f10", "timestamp": "2025-11-12T09:32:26.679676Z", "level": "info", "

Q2: What improvements were made in embedding models during this time?
A2: During 2024-2025, embedding models saw improvements including OpenAI's text-embedding-3 family, which aimed for lower cost and better performance for retrieval/RAG. Specialized and on-device embeddings, like Google's EmbeddingGemma, were released for small-footprint, multilingual applications. The ecosystem also matured, with a focus on benchmarks like Recall@k, p95 latency, and optimization for production.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251112_150234_b7012e05", "timestamp": "2025-11-12T09:32:35.684029Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T09:32:35.685021Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T09:32:35.688022Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:35.689024Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:35.690022Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:35.693072Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T09:32:35.694089Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_150234_b7012e05", "timestamp": "2025-11-12T09:32:35.699022Z", "level": "info", "

Q3: What is meant by 'agentic AI,' and how did it evolve in 2025?
A3: Agentic AI refers to systems that autonomously plan and execute multi-step tasks. In 2025, it emerged as a prominent strategic trend, with vendors rolling out SDKs, production-focused toolkits, and enterprise agents like Anthropic 'Skills' and OpenAI AgentKit. Additionally, major companies such as Dalet, Salesforce, IBM, and Oracle published guides and product offerings to integrate agentic capabilities into specific enterprise workflows.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251112_150240_8932a980", "timestamp": "2025-11-12T09:32:42.387292Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T09:32:42.389403Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T09:32:42.392401Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:42.393406Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:42.394402Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:42.397413Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T09:32:42.399418Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_150240_8932a980", "timestamp": "2025-11-12T09:32:42.402400Z", "level": "info", "

Q4: What challenges or cautions were associated with agentic AI adoption?
A4: Industry analysts warned of "agent washing" and predicted many agentic projects would fail to deliver ROI without clear business value and engineering investment. Gartner estimated that over 40% of agentic AI projects might be scrapped by the end of 2027 due to high costs and unclear value. Additionally, building agentic systems requires heavy engineering and maintenance.

--------------------------------------------------------------------------------



{"added": 1, "index": "faiss_index\\session_20251112_150247_4a4d060f", "timestamp": "2025-11-12T09:32:48.966948Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T09:32:48.970953Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T09:32:48.977013Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T09:32:48.978948Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T09:32:48.981950Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T09:32:48.988950Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T09:32:48.990948Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_150247_4a4d060f", "timestamp": "2025-11-12T09:32:48.998383Z", "level": "info", "

Q5: What practical advice does the summary offer to engineers and researchers?
A5: For engineers and researchers, the summary advises choosing the right model family for the task, with top-tier closed models favored for reasoning/math/coding and open models for customization. For retrieval and RAG, it recommends benchmarking embeddings on specific data, considering hybrid pipelines, and optimizing for cost and latency. When building agentic systems, it's crucial to start with clear KPIs, limit scope, invest in orchestration and tool safety, and avoid premature generalization claims.

--------------------------------------------------------------------------------



In [23]:
# NOTE(review): as the cell output shows, this cell currently fails with
# "ModuleNotFoundError: No module named 'langchain.evaluation'".
# LangChainStringEvaluator presumably needs the `langchain` package to be
# installed -- confirm. A later cell runs the evaluation with a custom
# evaluator instead.
from langsmith.evaluation import evaluate,LangChainStringEvaluator
# Evaluator list: the "cot_qa" string evaluator.
qa_evaluator=[LangChainStringEvaluator("cot_qa")]
dataset_name="aidommydataset"
# Run the RAG function over the dataset and score it with the evaluator above.
experiment_results=evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test airag",
    # Experiment metadata (mirrors the defaults of answer_ai_report_question)
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
    
)

ModuleNotFoundError: No module named 'langchain.evaluation'

In [28]:
from langsmith import evaluate, Client

# Re-create the LangSmith client (same construction as in the dataset cell).
client = Client()

# Wrapper to adapt your RAG function to LangSmith's expected signature
def run_ai_report_eval(example: dict):
    """
    Adapt a dataset row to the RAG function and return only the answer text.

    The question is read from the row's "input" key, falling back to
    "question" when that is missing or empty. The return value is the
    "answer" entry of answer_ai_report_question's result, not a dict.
    """
    question = example.get("input")
    if not question:
        question = example.get("question")
    return answer_ai_report_question({"question": question})["answer"]

# Dataset in your LangSmith workspace
dataset_name = "aidommydataset"

# Run evaluation
# NOTE(review): as the cell output shows, this call fails with
# "TypeError: 'correctness' is not a callable object" -- the entries of
# `evaluators` must be callables, not evaluator names. A later cell passes a
# custom evaluator function instead.
experiment_results = evaluate(
    run_ai_report_eval,  # target: wrapper that returns the answer string
    data=dataset_name,
    evaluators=["correctness"],  # a plain string; rejected at run time (see note above)
    experiment_prefix="test_airag",
    metadata={
        "variant": "RAG with FAISS and AI Engineering Report",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print(experiment_results)


View the evaluation results for experiment: 'test_airag-e5c5410a' at:
https://smith.langchain.com/o/e92dd95e-0a41-4dd5-a85e-23f891be8c42/datasets/4133aa80-e314-4124-bfb0-d989d1a551fe/compare?selectedSessions=a9a9dd29-1c40-4d18-b3cf-9b4e632a6e2b




TypeError: 'correctness' is not a callable object

In [29]:
# custom
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference
    output in terms of factual accuracy, coverage, and meaning. The judge model
    is asked for a short reasoning and a CORRECT / INCORRECT verdict, which is
    parsed from its reply.

    Args:
        run: The Run object containing the actual outputs (read from "answer").
        example: The Example object containing the question ("question") and
            the expected output ("answer").

    Returns:
        dict with 'key', 'score' (1 for correct, 0 for incorrect) and
        'reasoning'; on success also 'comment' with the raw verdict. If calling
        the judge fails, score is 0 and 'reasoning' carries the error text.
    """
    # Extract actual and expected outputs. Fall back to empty dicts so a
    # missing outputs/inputs mapping (None) does not crash the evaluator.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")
    
    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])
    
    # Initialize the judge LLM (Gemini, temperature 0)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )
    
    # Create chain and invoke
    chain = eval_prompt | llm
    
    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })
        
        response_text = response.content
        
        # Parse the response: take the text after the "Reasoning:" / "Verdict:"
        # prefixes, tolerating leading whitespace on the line.
        reasoning = ""
        verdict = ""
        
        for line in response_text.split('\n'):
            stripped = line.strip()
            if stripped.startswith("Reasoning:"):
                reasoning = stripped[len("Reasoning:"):].strip()
            elif stripped.startswith("Verdict:"):
                verdict = stripped[len("Verdict:"):].strip()
        
        # Convert verdict to score (1 for correct, 0 for incorrect).
        # A substring test is wrong here: "INCORRECT" contains "CORRECT", so it
        # would score every parsed verdict as 1. Compare the whole token after
        # dropping surrounding punctuation/markdown such as "CORRECT." or
        # "**CORRECT**".
        verdict_token = verdict.strip("*_`[]().!\"' \t").upper()
        score = 1 if verdict_token == "CORRECT" else 0
        
        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }
        
    except Exception as e:
        # Judge failure is reported as score 0 with the error in 'reasoning'.
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }

In [31]:
from langsmith import evaluate, Client
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

# Custom correctness evaluator: LLM-as-judge
def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    LLM-as-a-judge evaluator that scores an answer against its reference.

    The judge model receives the question, the expected answer and the actual
    answer, and replies with a short reasoning plus a CORRECT / INCORRECT
    verdict that is parsed from its reply.

    Args:
        run: Run whose outputs carry the generated answer under "answer".
        example: Example carrying the question ("question") and the reference
            answer ("answer").

    Returns:
        dict with 'key', 'score' (1 for correct, 0 for incorrect) and
        'reasoning'; on success also 'comment' with the raw verdict. If calling
        the judge fails, score is 0 and 'reasoning' carries the error text.
    """
    # Fall back to empty dicts so a missing outputs/inputs mapping (None) does
    # not crash the evaluator.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # The human message must be a plain template, not an f-string: with an
    # f-string the question/answers are baked into the template text, so any
    # "{" or "}" in them is parsed as a template variable and the values passed
    # to invoke() below are ignored.
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),

        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])

    # Judge model: Gemini at temperature 0.
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0)
    chain = eval_prompt | llm

    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        response_text = response.content
        reasoning = None
        verdict = None
        # Take the text after the "Reasoning:" / "Verdict:" prefixes,
        # tolerating leading whitespace on the line.
        for line in response_text.split("\n"):
            stripped = line.strip()
            if stripped.startswith("Reasoning:"):
                reasoning = stripped[len("Reasoning:"):].strip()
            elif stripped.startswith("Verdict:"):
                verdict = stripped[len("Verdict:"):].strip()

        # Compare the whole verdict token (never a substring: "INCORRECT"
        # contains "CORRECT"), after dropping surrounding punctuation/markdown
        # so replies like "CORRECT." or "**CORRECT**" still count.
        verdict_token = (verdict or "").strip("*_`[]().!\"' \t").upper()
        score = 1 if verdict_token == "CORRECT" else 0

        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning or "",
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        # Judge failure is reported as score 0 with the error in 'reasoning'.
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }


# Run the experiment: the RAG function is the target and the custom
# LLM-as-a-judge function is the only evaluator.
client = Client()

dataset_name = "aidommydataset"  # LangSmith dataset holding the reference pairs

# Metadata recorded with the experiment; the chunking/retrieval values repeat
# the defaults of answer_ai_report_question.
experiment_metadata = {
    "variant": "RAG with FAISS and AI Engineering Report",
    "evaluator": "custom_correctness_llm_judge",
    "model": "gemini-2.5-pro",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "k": 5,
}

experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=[correctness_evaluator],
    experiment_prefix="ai_rag",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata=experiment_metadata,
)

print(experiment_results)


{"timestamp": "2025-11-12T10:02:38.024547Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T10:02:38.033908Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T10:02:38.036913Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T10:02:38.055270Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"session_id": "session_20251112_153238_5f071eec", "temp_dir": "data\\session_20251112_153238_5f071eec", "faiss_dir": "faiss_index\\session_20251112_153238_5f071eec", "sessionized": true, "timestamp": "2025-11-12T10:02:38.060933Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "rag_data.txt", "saved_as": "data\\session_20251112_153238_5f071eec\\9bbb066c.txt", "timestamp": "2025-11-12T10:02:38.068270Z", "level": "info", "event": "File saved for ingestion"}
{"count": 1, "timestamp": "2025-11-12T10:02:38.088444Z", "level": "

View the evaluation results for experiment: 'ai_rag-cf0f6eb4' at:
https://smith.langchain.com/o/e92dd95e-0a41-4dd5-a85e-23f891be8c42/datasets/4133aa80-e314-4124-bfb0-d989d1a551fe/compare?selectedSessions=f615b6b1-a343-46ab-9899-7b76be138e98




{"added": 1, "index": "faiss_index\\session_20251112_153238_5f071eec", "timestamp": "2025-11-12T10:02:40.879510Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-12T10:02:40.882519Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-12T10:02:40.890165Z", "level": "info", "event": "running in local mode:.env loaded"}
{"timestamp": "2025-11-12T10:02:40.891169Z", "level": "info", "event": "loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T10:02:40.892166Z", "level": "info", "event": "API keys loaded"}
{"timestamp": "2025-11-12T10:02:40.896674Z", "level": "info", "event": "YAML CONFIG LOADED"}
{"provider": "google", "model": "gemini-2.5-flash", "timestamp": "2025-11-12T10:02:40.898693Z", "level": "info", "event": "Loading LLM"}
{"session_id": "session_20251112_153238_5f071eec", "timestamp": "2025-11-12T10:02:40.911157Z", "level": "info", "

<ExperimentResults ai_rag-cf0f6eb4>
