In [1]:
from dotenv import load_dotenv
load_dotenv()

True

### Create Dataset

In [13]:
import pandas as pd

# Evaluation questions (one per dataset example). They are stored as plain
# questions: no "Question:" label, so every example has the same shape as the
# text a user would actually type into the RAG application.
inputs = [
    "what is R language and where we used it?",
    "define and explain predictive model?",
    "what is unsupervised learning?",
]

# Reference answers, aligned by position with `inputs` (no stray leading/trailing whitespace)
outputs = [
    "R is a free, open‑source programming language designed for statistical computing and graphics. It is the preferred tool for academic statisticians and data scientists, enabling the implementation of a wide range of statistical methods and the creation of plots, models, and simulations. R is used for data analysis, model building, and visualizing results in research, industry, and education.",
    "A predictive model is a statistical algorithm that uses one or more predictor variables to estimate or forecast the value of a response variable. It learns the relationship between predictors and the response from training data—often via techniques like linear regression—and then applies that learned relationship to new data to make predictions. The model’s usefulness is judged by how well it explains the response and how accurately it predicts unseen observations.",
    "Unsupervised learning is a type of machine learning where the data have no labeled outputs, so the algorithm must discover patterns, structure, or groupings on its own—such as clustering or dimensionality‑reduction techniques—without a known “true answer” to validate against.",
]

# zip() silently drops unmatched items, so fail loudly if the two lists drift apart
assert len(inputs) == len(outputs), "each question needs exactly one reference answer"

# Dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Write to csv
csv_path = "G:/Data Science/MLOPs/SmartDocs-Multi-Document-Chat-using-LLMOps/data/Q&A.csv"

df.to_csv(csv_path, index = False)

### Upload Dataset to LangSmith

In [None]:
from langsmith import Client

client = Client()
dataset_name = "llmops_dataset"
description="Input and expected output pairs for R Language PDF document."

# Keep this cell re-runnable: creating a dataset whose name is already taken
# fails, and uploading the examples a second time would duplicate them.
# An existing dataset is therefore reused as-is; examples are only uploaded
# right after the dataset has been created.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    # Store
    dataset = client.create_dataset(
        dataset_name = dataset_name,
        description = description
    )

    client.create_examples(
        inputs = [{"question": q} for q in inputs],
        outputs = [{"answer": a} for a in outputs],
        dataset_id = dataset.id,
    )

### Upload RAG Application on LangSmith

In [15]:
import sys
sys.path.append("G:/Data Science/MLOPs/SmartDocs-Multi-Document-Chat-using-LLMOps")
from pathlib import Path
from multi_doc_chat.src.document_ingestion.data_ingestion import ChatIngestor
from multi_doc_chat.src.document_chat.retrieval import ConversationalRAG
import os


# Simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a file on disk so it can be handed to ChatIngestor as an uploaded file.

    Exposes the two members used by this notebook's ingestion call:
    ``name`` (the file's base name) and ``getbuffer()`` (its raw bytes).
    """

    def __init__(self, file_path: str):
        resolved = Path(file_path)
        self.path = resolved
        self.name = resolved.name

    def getbuffer(self) -> bytes:
        """Return the complete file content as bytes."""
        with self.path.open("rb") as handle:
            return handle.read()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "G:/Data Science/MLOPs/SmartDocs-Multi-Document-Chat-using-LLMOps\\data\\islr.pdf",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer a question about a local document using the project's RAG pipeline.

    Every call creates a new ``ChatIngestor`` (session directories enabled),
    ingests ``data_path`` through ``built_retriver``, loads the resulting FAISS
    index into a ``ConversationalRAG`` and asks the question with an empty chat
    history. Because the document is re-ingested on each call, a call is
    expensive.

    The function never raises: any failure is reported inside the returned
    ``answer`` string, so a batch evaluation keeps running. Callers that need
    to distinguish failures must inspect the answer text.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the document to ingest (defaults to the ISLR PDF)
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with the answer, e.g., {"answer": "RAG stands for..."}.
        On failure the answer is one of "No question provided",
        "Data file not found: <path>" or "Error: <exception message>".
    """
    # NOTE: the backslashes in the default ``data_path`` are escaped on purpose.
    # The unescaped "\d" / "\i" sequences are invalid string escapes (warned
    # about by recent Python versions); escaping keeps the exact same value.

    # Single source of truth for the index location: the directory handed to
    # ChatIngestor below is also the one the index is loaded from afterwards.
    faiss_base = "faiss_index"

    try:
        # Extract question from inputs
        question = inputs.get("question", "")
        if not question:
            return {"answer": "No question provided"}

        # Check if file exists
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}

        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)

        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # Build retriever (method name spelled as in the project API)
        ingestor.built_retriver(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Get answer
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # Deliberate best-effort: surface the failure as the answer text
        return {"answer": f"Error: {str(e)}"}

In [16]:
# Smoke test: send one sample question through the RAG entry point
test_input = {"question": "what is R language and where we used it?"}
result = answer_ai_report_question(test_input)

print(f"Question: {test_input['question']}")
print(f"\nAnswer: {result['answer']}")

{"timestamp": "2025-10-24T20:38:26.452016Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-24T20:38:26.454020Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-24T20:38:26.456017Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_6Z...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-24T20:38:26.459016Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-24T20:38:26.520924Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251024_163826_4028e278", "temp_dir": "data\\session_20251024_163826_4028e278", "faiss_dir": "faiss_index\\session_20251024_163826_4028e278", "sessionized": true, "timestamp": "2025-10-24T20:38:26.523928Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "islr.pdf", "saved_as": "data\\session_202510

Question: what is R language and where we used it?

Answer: R is a free, open‑source programming language designed for statistical computing and graphics. It is the preferred tool for academic statisticians and data scientists, enabling the implementation of a wide range of statistical methods and the creation of plots, models, and simulations. R is used for data analysis, model building, and visualizing results in research, industry, and education.


In [17]:
# Run every dataset question through the RAG function and print each
# question/answer pair followed by an 80-character separator line.
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, start=1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(
        f"Q{i}: {q}",
        f"A{i}: {result['answer']}\n",
        "-" * 80 + "\n",
        sep="\n",
    )

Testing all questions from the dataset:



{"timestamp": "2025-10-24T20:40:20.679665Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-24T20:40:20.682666Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-24T20:40:20.684665Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_6Z...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-24T20:40:20.686665Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-24T20:40:20.692666Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251024_164020_de9d1067", "temp_dir": "data\\session_20251024_164020_de9d1067", "faiss_dir": "faiss_index\\session_20251024_164020_de9d1067", "sessionized": true, "timestamp": "2025-10-24T20:40:20.725550Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "islr.pdf", "saved_as": "data\\session_202510

Q1: Question: what is R language and where we used it?
A1: R is a freely available programming language designed for statistical computing and data analysis. It is the language of choice for academic statisticians and is used to implement a wide range of statistical methods, often with optional packages that add thousands of additional functions. R is employed in research, teaching, and any context where advanced data analysis or modeling is required.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-24T20:42:10.327792Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-24T20:42:10.330794Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-24T20:42:10.331789Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_6Z...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-24T20:42:10.333791Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-24T20:42:10.340791Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251024_164210_682d339a", "temp_dir": "data\\session_20251024_164210_682d339a", "faiss_dir": "faiss_index\\session_20251024_164210_682d339a", "sessionized": true, "timestamp": "2025-10-24T20:42:10.343792Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "islr.pdf", "saved_as": "data\\session_202510

Q2: define and explain predictive model?
A2: A predictive model is a statistical or machine‑learning framework that learns a relationship between input variables (predictors) and an outcome (response) from training data, and then uses that learned relationship to forecast future or unseen outcomes. In practice, the model is fitted to a training set, its parameters are estimated, and its performance is evaluated on a separate test set to gauge accuracy. Simple approaches such as linear or logistic regression are common examples, but more complex models can be used when the underlying relationship is not adequately captured by a linear form.

--------------------------------------------------------------------------------



{"timestamp": "2025-10-24T20:44:02.614926Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-24T20:44:02.617927Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-24T20:44:02.619928Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_6Z...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-24T20:44:02.623681Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-24T20:44:02.633129Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251024_164402_00052534", "temp_dir": "data\\session_20251024_164402_00052534", "faiss_dir": "faiss_index\\session_20251024_164402_00052534", "sessionized": true, "timestamp": "2025-10-24T20:44:02.637132Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "islr.pdf", "saved_as": "data\\session_202510

Q3: what is unsupervised learning?
A3: Unsupervised learning analyzes data that has no labeled outputs, aiming to discover hidden structure or patterns such as clusters or low‑dimensional representations. It is often used in exploratory data analysis, where there is no clear prediction target. Typical methods include clustering, principal component analysis, and other dimensionality‑reduction techniques.

--------------------------------------------------------------------------------



In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Dataset to evaluate against, and the built-in "cot_qa" string evaluator
dataset_name = "llmops_dataset"
qa_evaluator = [LangChainStringEvaluator("cot_qa")]

# Score the RAG function on every example of the dataset
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-agenticAIReport-qa-rag",
    # Recorded alongside the experiment; mirrors the RAG function's defaults
    metadata=dict(
        variant="RAG with FAISS and R language PDF",
        chunk_size=1000,
        chunk_overlap=200,
        k=5,
    ),
)

### Custom Correctness Evaluator
Create an LLM-as-a-Judge evaluator that assesses how well each generated answer aligns, semantically and factually, with its reference answer.

In [24]:
from langsmith.schemas import Run, Example
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def _parse_judge_response(content) -> tuple:
    """Extract ``(reasoning, verdict)`` from the judge model's reply.

    ``content`` is normally a string; a list of parts (plain strings or dicts
    carrying a ``"text"`` entry) is flattened first. Each line is matched
    case-insensitively against the ``Reasoning:`` / ``Verdict:`` labels after
    leading whitespace, markdown markers and list numbering are removed.
    Missing fields come back as empty strings.
    """
    if isinstance(content, list):
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict):
                pieces.append(str(part.get("text", "")))
        content = "".join(pieces)
    elif not isinstance(content, str):
        content = "" if content is None else str(content)

    reasoning = ""
    verdict = ""
    for raw_line in content.splitlines():
        # Tolerate "**Verdict:** ...", "2. Verdict: ..." and indented lines
        line = raw_line.strip().lstrip("*#->0123456789.) \t")
        lowered = line.lower()
        if lowered.startswith("reasoning:"):
            reasoning = line[len("reasoning:"):].strip("* \t")
        elif lowered.startswith("verdict:"):
            verdict = line[len("verdict:"):].strip("* \t")
    return reasoning, verdict


def correctness_evaluator(run: Run, example: Example) -> dict:
    """
    Custom LLM-as-a-Judge evaluator for correctness.

    Correctness means how well the actual model output matches the reference output
    in terms of factual accuracy, coverage, and meaning.

    Args:
        run: The Run object containing the actual outputs
        example: The Example object containing the expected outputs

    Returns:
        dict with 'key' ("correctness"), 'score' (1 for correct, 0 for incorrect),
        'reasoning' and, when the judge replied, 'comment' ("Verdict: ...").
        If the judge call fails the score is 0 and 'reasoning' holds the error;
        an unparseable reply also scores 0.
    """
    # Extract actual and expected outputs. Missing payloads (e.g. ``None``)
    # fall back to empty dicts so the evaluator does not crash on them.
    actual_output = (run.outputs or {}).get("answer", "")
    expected_output = (example.outputs or {}).get("answer", "")
    input_question = (example.inputs or {}).get("question", "")

    # Define the evaluation prompt
    eval_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an evaluator whose job is to judge correctness.

Correctness means how well the actual model output matches the reference output in terms of factual accuracy, coverage, and meaning.

- If the actual output matches the reference output semantically (even if wording differs), it should be marked correct.
- If the output misses key facts, introduces contradictions, or is factually incorrect, it should be marked incorrect.

Do not penalize for stylistic or formatting differences unless they change meaning."""),
        ("human", """<example>
<input>
{input}
</input>

<output>
Expected Output: {expected_output}

Actual Output: {actual_output}
</output>
</example>

Please grade the following agent run given the input, expected output, and actual output.
Focus only on correctness (semantic and factual alignment).

Respond with:
1. A brief reasoning (1-2 sentences)
2. A final verdict: either "CORRECT" or "INCORRECT"

Format your response as:
Reasoning: [your reasoning]
Verdict: [CORRECT or INCORRECT]""")
    ])

    # Initialize LLM (using Gemini as shown in your config)
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro",
        temperature=0
    )

    # Create chain and invoke
    chain = eval_prompt | llm

    try:
        response = chain.invoke({
            "input": input_question,
            "expected_output": expected_output,
            "actual_output": actual_output
        })

        reasoning, verdict = _parse_judge_response(response.content)

        # "INCORRECT" contains the substring "CORRECT", so a plain substring
        # test would score every verdict as 1. Rule out INCORRECT explicitly.
        verdict_upper = verdict.upper()
        is_correct = "CORRECT" in verdict_upper and "INCORRECT" not in verdict_upper
        score = 1 if is_correct else 0

        return {
            "key": "correctness",
            "score": score,
            "reasoning": reasoning,
            "comment": f"Verdict: {verdict}"
        }

    except Exception as e:
        # Best-effort: a failed judge call is recorded as score 0 with the error text
        return {
            "key": "correctness",
            "score": 0,
            "reasoning": f"Error during evaluation: {str(e)}"
        }

### Run Evaluation with Custom Correctness Evaluator

In [26]:
# Run evaluation with the custom correctness evaluator
from langsmith.evaluation import evaluate

# Define evaluators - using custom correctness evaluator
evaluators = [correctness_evaluator]

dataset_name = "llmops_dataset"

# Run evaluation
# The "variant" label names the document the RAG function actually ingests
# (its default is the ISLR / R language PDF), so experiments over the same
# pipeline carry the same label and can be compared in the LangSmith UI.
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="agenticAIReport-correctness-eval",
    description="Evaluating RAG system with custom correctness evaluator (LLM-as-a-Judge)",
    metadata={
        "variant": "RAG with FAISS and R language PDF",
        "evaluator": "custom_correctness_llm_judge",
        "model": "gemini-2.5-pro",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "k": 5,
    },
)

print("\nEvaluation completed! Check the LangSmith UI for detailed results.")

View the evaluation results for experiment: 'agenticAIReport-correctness-eval-df2f0942' at:
https://smith.langchain.com/o/82d84e60-5c73-4bf6-a898-7a09feddd805/datasets/f43b02e9-ab76-4e87-89f9-d512be08d9cb/compare?selectedSessions=0983e2e4-b58c-4297-8419-d2a6e350baaa




  from .autonotebook import tqdm as notebook_tqdm
0it [00:00, ?it/s]{"timestamp": "2025-10-24T21:18:31.944868Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-10-24T21:18:31.944868Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-10-24T21:18:31.944868Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_6Z...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-10-24T21:18:31.944868Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-10-24T21:18:31.979199Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251024_171831_8e51b681", "temp_dir": "data\\session_20251024_171831_8e51b681", "faiss_dir": "faiss_index\\session_20251024_171831_8e51b681", "sessionized": true, "timestamp": "2025-10-24T21:18:31.979199Z", "level": "info", "event": "ChatIngestor init


Evaluation completed! Check the LangSmith UI for detailed results.


### Optional: Combine Multiple Evaluators
You can use multiple evaluators together to get different perspectives on your RAG system's performance.

In [None]:
# Example: Combine custom correctness evaluator with LangChain's built-in evaluators
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Evaluator suite: the custom LLM-as-a-Judge first, then LangChain's
# built-in chain-of-thought QA evaluator ("cot_qa").
combined_evaluators = [correctness_evaluator, LangChainStringEvaluator("cot_qa")]

# To score every example with both evaluators, uncomment the call below:
# experiment_results_combined = evaluate(
#     answer_ai_report_question,
#     data=dataset_name,
#     evaluators=combined_evaluators,
#     experiment_prefix="agenticAIReport-multi-eval",
#     description="Evaluating RAG system with multiple evaluators",
#     metadata={
#         "variant": "RAG with FAISS",
#         "evaluators": "correctness + cot_qa",
#         "chunk_size": 1000,
#         "chunk_overlap": 200,
#         "k": 5,
#     },
# )
 