## Task 1: Dependencies


In [230]:
from dotenv import load_dotenv

# Read key/value pairs from the local ".env" file; the call's return value is the cell output.
load_dotenv(dotenv_path=".env")

True

In [231]:
def check_if_env_var_is_set(env_var_name: str, human_readable_string: str = "API Key"):
    """Make sure an environment variable holds a value, prompting for it if needed.

    Args:
        env_var_name: Name of the environment variable to look up.
        human_readable_string: Label shown in the interactive prompt.

    Returns:
        None. When the variable is unset or empty, the value typed at the
        ``getpass`` prompt is stored in ``os.environ`` under ``env_var_name``.
    """
    # Imported locally so this cell does not rely on a later cell having
    # already imported these modules.
    import getpass
    import os

    if os.getenv(env_var_name):
        print(f"{env_var_name} is present")
    else:
        print(f"{env_var_name} is NOT present, paste key at the prompt:")
        os.environ[env_var_name] = getpass.getpass(f"Please enter your {human_readable_string}: ")

## Task 2: Environment Variables

We'll want to set both our OpenAI API key and our LangSmith environment variables.

In [232]:
import os
import getpass

# Check each provider key; any key that is missing is requested at an interactive prompt.
check_if_env_var_is_set("OPENAI_API_KEY", "OpenAI API key")
check_if_env_var_is_set("COHERE_API_KEY", "Cohere API key")
check_if_env_var_is_set("TAVILY_API_KEY", "TAVILY API key")

OPENAI_API_KEY is present
COHERE_API_KEY is present
TAVILY_API_KEY is present


In [233]:
from uuid import uuid4

# LangSmith-related environment variables (see the "Task 2" note above).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Plain string literal: the previous f-string had no placeholders.
os.environ["LANGCHAIN_PROJECT"] = "AIE7 - Certification Challenge"
check_if_env_var_is_set("LANGCHAIN_API_KEY", "LangSmith API Key")

LANGCHAIN_API_KEY is present


In [None]:
import importlib
import graph_rag_functions
# Reload first so the names imported below come from the current state of the module file.
importlib.reload(graph_rag_functions)
from graph_rag_functions import naive_graph, contextual_compression_graph, multi_query_graph, parent_document_graph

from langchain_core.tools import tool

In [None]:
@tool
def ask_naive_llm_tool(question: str):
    """PRIMARY TOOL: Query comprehensive federal student loan policy documents using  specialized RAG retrieval.
      
      USE THIS FIRST for ALL student loan questions including:
      - Loan repayment plans and options
      - Forgiveness programs and eligibility
      - Payment problems and solutions
      - Application processes and requirements
      - Policy explanations and guidance
      
      This tool contains the most complete and up-to-date federal student loan information.
    """
    # The docstring text above is left untouched on purpose: it is the tool
    # description. Only the indentation of its opening line was fixed, because
    # it did not match the body and made the cell fail with an IndentationError.
    response = naive_graph.invoke({"question": question})
    # Return the generated answer together with the retrieved context so the
    # evaluation helpers can read the context back out of the tool message.
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }

In [None]:
# NOTE(review): TavilySearchResults, TypedDict, Annotated, add_messages, ChatOpenAI,
# END, Tool, ToolNode, StateGraph and the tavily_*_search helpers are not imported in
# the visible cells -- confirm they are brought into scope before this cell runs.
tavily_tool = TavilySearchResults(max_results=5)


class AgentState(TypedDict):
    """State carried through the agent graph."""
    # Message list for the conversation, annotated with add_messages.
    messages: Annotated[list, add_messages]


model = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0,  # Lower temperature for more consistent outputs
    request_timeout=120,  # Longer timeout for complex operations
)


def should_continue(state):
    """Return "action" when the last message carries tool calls, otherwise END."""
    last_message = state["messages"][-1]

    if last_message.tool_calls:
        return "action"

    return END


tool_belt = [
    ask_naive_llm_tool,
    tavily_tool,
    Tool(
        name="StudentAid_Federal_Search",
        description="Search ONLY StudentAid.gov for official federal information: FAFSA applications, federal loan forgiveness programs, federal repayment plans, eligibility requirements",
        func=tavily_studentaid_search,
    ),
    Tool(
        name="Mohela_Servicer_Search",
        description="Search ONLY Mohela loan servicer for account-specific help: making payments, login issues, servicer-specific repayment options, customer service contacts",
        func=tavily_mohela_search,
    ),
    Tool(
        name="Student_Loan_Comparison_Search",
        description="Compare information across BOTH federal sources and Mohela when user needs comprehensive view or comparison of student loan options",
        func=tavily_student_loan_search,
    ),
]

model = model.bind_tools(tool_belt)
tool_node = ToolNode(tool_belt)


def _make_call_model(bound_model):
    """Build the agent node function pinned to ``bound_model``.

    A later cell rebinds the global ``model`` to a model bound to a different
    tool belt. Capturing the bound model here keeps this graph's agent node
    consistent with this graph's ToolNode regardless of that rebinding.
    """
    def call_model(state):
        response = bound_model.invoke(state["messages"])
        return {"messages": [response]}

    return call_model


call_model = _make_call_model(model)

uncompiled_graph = StateGraph(AgentState)

uncompiled_graph.add_node("agent", call_model)
uncompiled_graph.add_node("action", tool_node)

uncompiled_graph.set_entry_point("agent")
# should_continue sends the run either to the tool node or to END.
uncompiled_graph.add_conditional_edges("agent", should_continue)

# After a tool runs, control goes back to the agent node.
uncompiled_graph.add_edge("action", "agent")

naive_agent_graph = uncompiled_graph.compile()

In [None]:
import tool_calls_parser_for_eval
# Reload first so the helpers imported below come from the current state of the module file.
# NOTE(review): relies on `importlib` having been imported by an earlier cell.
importlib.reload(tool_calls_parser_for_eval)
from tool_calls_parser_for_eval import parse_logs, print_formatted_results, extract_contexts_for_eval, parse_langchain_messages

In [None]:
from langchain_core.messages import HumanMessage

# Alternative sample questions, kept for manual experimentation:
# inputs = {"messages" : [HumanMessage(content="Who is the current captain of the Winnipeg Jets?")]}
# inputs = {"messages" : [HumanMessage(content="What concerns does the borrower have regarding Nelnet's communication about their student loan issuer?")]}
inputs = {"messages" : [HumanMessage(content="What is the issue with Aidvantage in the borrower's complaint?")]}

# Stream per-node updates from the naive agent graph and print each one.
# A top-level `async for` is not valid in a plain Python script; this cell relies on
# the notebook's top-level await support.
async for chunk in naive_agent_graph.astream(inputs, stream_mode="updates"):
    for node, values in chunk.items():
        print(f"Receiving update from node: '{node}'")
        if node == "action":
          # Only the first message of the update is used for the tool name.
          print(f"Tool Used: {values['messages'][0].name}")
          print_formatted_results(parse_logs(str(values["messages"])))
        print("\n")
        print(values["messages"])
        print("\n\n")

In [None]:
# inputs = {"messages" : [HumanMessage(content='')]}
# Run the naive agent once on `inputs` (defined in the previous cell).
response = naive_agent_graph.invoke(inputs)

# Pull the retrieved contexts and a parsed tool-call summary out of the message list.
evaluation_contexts = extract_contexts_for_eval(response["messages"])
print(f"✅ Extracted {len(evaluation_contexts)} contexts for evaluation")
parsed_data = parse_langchain_messages(response["messages"])
print_formatted_results(parsed_data)

# One evaluation record for this single query.
eval_sample = {
    "user_input": inputs["messages"][0].content,
    "response": response["messages"][-1].content,  # Final AI response
    "retrieved_contexts": evaluation_contexts,
    "tools_used": parsed_data['summary']['tools'],
    "num_contexts": len(evaluation_contexts)
}

print(f"\n🎯 EVALUATION SAMPLE:")
print(f"Query: {eval_sample['user_input']}")
# Only the first 200 characters of the response are printed.
print(f"Response: {eval_sample['response'][:200]}...")
print(f"Contexts: {eval_sample['num_contexts']} extracted")
print(f"Tools: {eval_sample['tools_used']}")

### Generate Golden master

In [None]:
import importlib
import core_functions
# Reload first so the helpers imported below come from the current state of the module file.
importlib.reload(core_functions)

from core_functions import load_and_prepare_pdf_loan_docs, generate_golden_master

In [None]:
%%time
# Source documents that the golden master is generated from (next cell).
student_loan_docs_dataset = load_and_prepare_pdf_loan_docs()

In [None]:
%%time
# Golden-master test set; later cells iterate over it and deep-copy it once per retriever.
golden_master_dataset = generate_golden_master(student_loan_docs_dataset)

In [None]:
# Display the generated golden master as a DataFrame.
golden_master_dataset.to_pandas()

In [None]:
from ragas import EvaluationDataset

from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.cost import get_token_usage_for_openai

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig
from ragas_metrics import extract_ragas_metrics

### Synthetic Data Generation using RAGAS using the Golden master (Naive Retriever)

In [None]:
%%time
from tqdm.notebook import tqdm

# Answer every golden-master question with the naive agent and store the answer and
# retrieved contexts back onto the row (golden_master_dataset is updated in place).
for test_row in tqdm(golden_master_dataset):
    inputs = {"messages" : [HumanMessage(content=test_row.eval_sample.user_input)]}
    response = naive_agent_graph.invoke(inputs)

    evaluation_contexts = extract_contexts_for_eval(response["messages"])
    # Parse THIS response. Previously `parsed_data` was whatever the earlier
    # single-query cell left behind, so `tools_used` was stale (and a NameError
    # on a fresh kernel if that cell had not been run).
    parsed_data = parse_langchain_messages(response["messages"])
    eval_sample = {
        "user_input": inputs["messages"][0].content,
        "response": response["messages"][-1].content,  # Final AI response
        "retrieved_contexts": evaluation_contexts,
        "tools_used": parsed_data['summary']['tools'],
        "num_contexts": len(evaluation_contexts)
    }
    test_row.eval_sample.response = eval_sample["response"]
    test_row.eval_sample.retrieved_contexts = eval_sample["retrieved_contexts"]

In [None]:
# Create the output folder first; to_csv does not create missing directories.
os.makedirs('golden-masters', exist_ok=True)
golden_master_dataset.to_pandas().to_csv('golden-masters/naive_golden_master_dataset.csv', index=False)
golden_master_dataset.to_pandas()

In [None]:
# golden_master_dataset.to_pandas()[10:11]['retrieved_contexts'].values[0]

In [None]:
from langchain_openai import OpenAIEmbeddings

# NOTE(review): `embeddings` is not passed to any evaluate(...) call visible in this notebook -- confirm it is needed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
%%time
# Build the RAGAS dataset from the rows filled in by the naive-agent loop.
evaluation_dataset = EvaluationDataset.from_pandas(golden_master_dataset.to_pandas())

# LLM wrapper handed to evaluate() as the judge model.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4.1-mini",
        temperature=0, # Lower temperature for more consistent outputs
        request_timeout=120   # Longer timeout for complex operations
    )
)

custom_run_config = RunConfig(timeout=360)

# Score the dataset on the six metrics listed below.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), 
             ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    token_usage_parser=get_token_usage_for_openai,
    run_config=custom_run_config
)

In [None]:
import pandas as pd
# One-row frame of summary metrics for the naive run.
naive_raw_stats_df = pd.DataFrame([extract_ragas_metrics(result, 'gpt-4.1-mini')])
# NOTE(review): record_metrics_from_run is defined in the cell that follows this one, so on a
# fresh kernel (Restart & Run All) this call raises NameError -- the definition should run first.
record_metrics_from_run('Naive', naive_raw_stats_df)

In [None]:
import os
from datetime import datetime

import pandas as pd


def record_metrics_from_run(retriever_name, dataframe: pd.DataFrame,
                            metrics_filename: str = 'ragas-evaluation-metrics.csv'):
    """Append one run's RAGAS metrics to the cumulative metrics CSV.

    Args:
        retriever_name: Label written to the ``retriever`` column.
        dataframe: Frame holding the six metric columns listed in ``columns``;
            any other columns are ignored. The frame itself is not modified.
        metrics_filename: CSV file to append to. It is created when missing;
            a zero-byte file is treated as missing.

    Raises:
        KeyError: If ``dataframe`` lacks one of the metric columns.
    """
    columns = ['context_recall', 'faithfulness', 'factual_correctness', 'answer_relevancy', 'context_entity_recall', 'noise_sensitivity_relevant']
    missing = [name for name in columns if name not in dataframe.columns]
    if missing:
        raise KeyError(f"dataframe is missing metric columns: {missing}")

    new_dataframe = dataframe.copy()
    # '%H:%M:%S' is the portable spelling of '%T', which is not available on every platform.
    new_dataframe['datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    new_dataframe['retriever'] = retriever_name
    new_dataframe = new_dataframe[['datetime', 'retriever'] + columns]

    # Only concatenate frames that actually exist, rather than seeding the
    # concat with an empty DataFrame.
    frames = []
    if os.path.exists(metrics_filename) and os.path.getsize(metrics_filename) > 0:
        frames.append(pd.read_csv(metrics_filename))
    frames.append(new_dataframe)

    pd.concat(frames, ignore_index=True).to_csv(metrics_filename, index=False)

### Synthetic Data Generation using RAGAS using the Golden master (Contextual Compression Retriever)

In [None]:
@tool
def ask_contextual_compression_llm_tool(question: str):
    """PRIMARY TOOL: Query comprehensive federal student loan policy documents using  specialized RAG retrieval.
      
      USE THIS FIRST for ALL student loan questions including:
      - Loan repayment plans and options
      - Forgiveness programs and eligibility
      - Payment problems and solutions
      - Application processes and requirements
      - Policy explanations and guidance
      
      This tool contains the most complete and up-to-date federal student loan information.
    """
    # Run the question through the contextual-compression RAG graph.
    response = contextual_compression_graph.invoke({"question": question})
    # Return both the generated answer (wrapped in a HumanMessage) and the retrieved context.
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }
# NOTE(review): ChatOpenAI, END, Tool, ToolNode, StateGraph and the tavily_*_search
# helpers are not imported in the visible cells -- confirm they are in scope.
# AgentState and tavily_tool come from the naive agent cell.
model = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0,  # Lower temperature for more consistent outputs
    request_timeout=120,  # Longer timeout for complex operations
)


def should_continue(state):
    """Return "action" when the last message carries tool calls, otherwise END."""
    last_message = state["messages"][-1]

    if last_message.tool_calls:
        return "action"

    return END


tool_belt = [
    ask_contextual_compression_llm_tool,
    tavily_tool,
    Tool(
        name="StudentAid_Federal_Search",
        description="Search ONLY StudentAid.gov for official federal information: FAFSA applications, federal loan forgiveness programs, federal repayment plans, eligibility requirements",
        func=tavily_studentaid_search,
    ),
    Tool(
        name="Mohela_Servicer_Search",
        description="Search ONLY Mohela loan servicer for account-specific help: making payments, login issues, servicer-specific repayment options, customer service contacts",
        func=tavily_mohela_search,
    ),
    Tool(
        name="Student_Loan_Comparison_Search",
        description="Compare information across BOTH federal sources and Mohela when user needs comprehensive view or comparison of student loan options",
        func=tavily_student_loan_search,
    ),
]

model = model.bind_tools(tool_belt)
tool_node = ToolNode(tool_belt)


def _make_call_model(bound_model):
    """Build the agent node function pinned to ``bound_model``.

    The global name ``model`` is reused by more than one cell. Capturing the
    bound model here keeps this graph's agent node consistent with this graph's
    ToolNode even if ``model`` is rebound afterwards.
    """
    def call_model(state):
        response = bound_model.invoke(state["messages"])
        return {"messages": [response]}

    return call_model


call_model = _make_call_model(model)

uncompiled_graph = StateGraph(AgentState)

uncompiled_graph.add_node("agent", call_model)
uncompiled_graph.add_node("action", tool_node)

uncompiled_graph.set_entry_point("agent")
# should_continue sends the run either to the tool node or to END.
uncompiled_graph.add_conditional_edges("agent", should_continue)

# After a tool runs, control goes back to the agent node.
uncompiled_graph.add_edge("action", "agent")

contextual_compression_agent_graph = uncompiled_graph.compile()

In [None]:
import copy
import time
# Independent copy so this retriever's answers do not overwrite the naive run's rows.
cc_golden_master_dataset = copy.deepcopy(golden_master_dataset)

In [None]:
# inputs = {"messages" : [HumanMessage(content="What concerns does the borrower have regarding Nelnet's communication about their student loan issuer?")]}
# response = simple_agent_graph.invoke(inputs)
# response

In [None]:
%%time
from tqdm.notebook import tqdm

# Answer every golden-master question with the contextual-compression agent and
# store the answer and retrieved contexts back onto the copied row.
for test_row in tqdm(cc_golden_master_dataset):
    inputs = {"messages" : [HumanMessage(content=test_row.eval_sample.user_input)]}
    response = contextual_compression_agent_graph.invoke(inputs)

    evaluation_contexts = extract_contexts_for_eval(response["messages"])
    # Parse THIS response. Previously `parsed_data` was a stale value left
    # behind by the earlier single-query cell.
    parsed_data = parse_langchain_messages(response["messages"])
    eval_sample = {
        "user_input": inputs["messages"][0].content,
        "response": response["messages"][-1].content,  # Final AI response
        "retrieved_contexts": evaluation_contexts,
        "tools_used": parsed_data['summary']['tools'],
        "num_contexts": len(evaluation_contexts)
    }
    test_row.eval_sample.response = eval_sample["response"]
    test_row.eval_sample.retrieved_contexts = eval_sample["retrieved_contexts"]
    # Pause between questions.
    time.sleep(2)

In [None]:
# Create the output folder first; to_csv does not create missing directories.
os.makedirs('golden-masters', exist_ok=True)
cc_golden_master_dataset.to_pandas().to_csv('golden-masters/cc_golden_master_dataset.csv', index=False)
cc_golden_master_dataset.to_pandas()

In [None]:
from langchain_openai import OpenAIEmbeddings

# NOTE(review): `embeddings` is not passed to any evaluate(...) call visible in this notebook -- confirm it is needed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
%%time
# Build the RAGAS dataset from the rows filled in by the contextual-compression loop.
cc_evaluation_dataset = EvaluationDataset.from_pandas(cc_golden_master_dataset.to_pandas())

# LLM wrapper handed to evaluate() as the judge model.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4.1-mini",
        temperature=0, # Lower temperature for more consistent outputs
        request_timeout=120   # Longer timeout for complex operations
    )
)

custom_run_config = RunConfig(timeout=360)

# Score the dataset on the six metrics listed below.
cc_result = evaluate(
    dataset=cc_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), 
             ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    token_usage_parser=get_token_usage_for_openai,
    run_config=custom_run_config
)

In [None]:
# One-row frame of summary metrics for the contextual-compression run, appended to the metrics CSV.
cc_raw_stats_df = pd.DataFrame([extract_ragas_metrics(cc_result, 'gpt-4.1-mini')])
record_metrics_from_run('Contextual Compression', cc_raw_stats_df)

### Synthetic Data Generation using RAGAS using the Golden master (Multi Query Retriever)

In [None]:
@tool
def ask_multi_query_llm_tool(question: str):
    """PRIMARY TOOL: Query comprehensive federal student loan policy documents using  specialized RAG retrieval.
      
      USE THIS FIRST for ALL student loan questions including:
      - Loan repayment plans and options
      - Forgiveness programs and eligibility
      - Payment problems and solutions
      - Application processes and requirements
      - Policy explanations and guidance
      
      This tool contains the most complete and up-to-date federal student loan information.
    """
    # The docstring text above is left untouched on purpose: it is the tool
    # description. Only the indentation of its opening line was fixed, because
    # it did not match the body and made the cell fail with an IndentationError.
    response = multi_query_graph.invoke({"question": question})
    # Return the generated answer together with the retrieved context so the
    # evaluation helpers can read the context back out of the tool message.
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }

In [None]:
import copy
import time
# Independent copy so this retriever's answers do not overwrite another run's rows.
mq_golden_master_dataset = copy.deepcopy(golden_master_dataset)

In [None]:
%%time
from tqdm.notebook import tqdm
from build_agent_graph import get_agent_graph

# `multi_query_agent_graph` was never defined in the visible cells, so the loop
# below raised NameError. Build it the same way parent_document_graph_agent is
# built later in this notebook.
# NOTE(review): confirm get_agent_graph yields a tool belt comparable to the
# hand-built naive / contextual-compression agents.
multi_query_agent_graph = get_agent_graph([ask_multi_query_llm_tool])

# Answer every golden-master question with the multi-query agent and store the
# answer and retrieved contexts back onto the copied row.
for test_row in tqdm(mq_golden_master_dataset):
    inputs = {"messages" : [HumanMessage(content=test_row.eval_sample.user_input)]}
    response = multi_query_agent_graph.invoke(inputs)

    evaluation_contexts = extract_contexts_for_eval(response["messages"])
    # Parse THIS response. Previously `parsed_data` was a stale value left
    # behind by the earlier single-query cell.
    parsed_data = parse_langchain_messages(response["messages"])
    eval_sample = {
        "user_input": inputs["messages"][0].content,
        "response": response["messages"][-1].content,  # Final AI response
        "retrieved_contexts": evaluation_contexts,
        "tools_used": parsed_data['summary']['tools'],
        "num_contexts": len(evaluation_contexts)
    }
    test_row.eval_sample.response = eval_sample["response"]
    test_row.eval_sample.retrieved_contexts = eval_sample["retrieved_contexts"]
    # Pause between questions.
    time.sleep(2)

In [None]:
# Create the output folder first; to_csv does not create missing directories.
os.makedirs('golden-masters', exist_ok=True)
mq_golden_master_dataset.to_pandas().to_csv('golden-masters/mq_golden_master_dataset.csv', index=False)
mq_golden_master_dataset.to_pandas()

In [None]:
from langchain_openai import OpenAIEmbeddings

# NOTE(review): `embeddings` is not passed to any evaluate(...) call visible in this notebook -- confirm it is needed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
%%time
# Build the RAGAS dataset from the rows filled in by the multi-query loop.
mq_evaluation_dataset = EvaluationDataset.from_pandas(mq_golden_master_dataset.to_pandas())

# LLM wrapper handed to evaluate() as the judge model.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4.1-mini",
        temperature=0, # Lower temperature for more consistent outputs
        request_timeout=120   # Longer timeout for complex operations
    )
)

custom_run_config = RunConfig(timeout=360)

# Score the dataset on the six metrics listed below.
mq_result = evaluate(
    dataset=mq_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), 
             ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    token_usage_parser=get_token_usage_for_openai,
    run_config=custom_run_config
)

In [None]:
# One-row frame of summary metrics for the multi-query run, appended to the metrics CSV.
mq_raw_stats_df = pd.DataFrame([extract_ragas_metrics(mq_result, 'gpt-4.1-mini')])
record_metrics_from_run('MultiQuery', mq_raw_stats_df)

In [213]:
import build_agent_graph
# Reload first so get_agent_graph imported below comes from the current state of the module file.
importlib.reload(build_agent_graph)

from build_agent_graph import get_agent_graph

In [223]:
@tool
def ask_parent_document_llm_tool(question: str):
    """PRIMARY TOOL: Query comprehensive federal student loan policy documents using  specialized RAG retrieval.
      
      USE THIS FIRST for ALL student loan questions including:
      - Loan repayment plans and options
      - Forgiveness programs and eligibility
      - Payment problems and solutions
      - Application processes and requirements
      - Policy explanations and guidance
      
      This tool contains the most complete and up-to-date federal student loan information.
    """
    # Run the question through the parent-document RAG graph.
    response = parent_document_graph.invoke({"question": question})
    # Return both the generated answer (wrapped in a HumanMessage) and the retrieved context.
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }

# Unlike the naive / contextual-compression agents, this agent is built via the
# get_agent_graph helper and is given only this one tool explicitly.
parent_document_graph_agent = get_agent_graph([ask_parent_document_llm_tool])

In [224]:
%%time
# Independent copy so this retriever's answers do not overwrite another run's rows.
pd_golden_master_dataset = copy.deepcopy(golden_master_dataset)
from tqdm.notebook import tqdm

# Answer every golden-master question with the parent-document agent and store
# the answer and retrieved contexts back onto the copied row.
for test_row in tqdm(pd_golden_master_dataset):
    inputs = {"messages" : [HumanMessage(content=test_row.eval_sample.user_input)]}
    response = parent_document_graph_agent.invoke(inputs)

    evaluation_contexts = extract_contexts_for_eval(response["messages"])
    # Parse THIS response. Previously `parsed_data` was a stale value left
    # behind by the earlier single-query cell.
    parsed_data = parse_langchain_messages(response["messages"])
    eval_sample = {
        "user_input": inputs["messages"][0].content,
        "response": response["messages"][-1].content,  # Final AI response
        "retrieved_contexts": evaluation_contexts,
        "tools_used": parsed_data['summary']['tools'],
        "num_contexts": len(evaluation_contexts)
    }
    test_row.eval_sample.response = eval_sample["response"]
    test_row.eval_sample.retrieved_contexts = eval_sample["retrieved_contexts"]

  0%|          | 0/12 [00:00<?, ?it/s]

CPU times: user 5.85 s, sys: 177 ms, total: 6.02 s
Wall time: 1min 35s


In [225]:
# Create the output folder first; to_csv does not create missing directories.
os.makedirs('golden-masters', exist_ok=True)
pd_golden_master_dataset.to_pandas().to_csv('golden-masters/pd_golden_master_dataset.csv', index=False)
pd_golden_master_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,How does the structure of an academic calendar...,[{'messages': [HumanMessage(content='The struc...,"[non-term (includes clock-hour calendars), or ...",The structure of an academic calendar—such as ...,The structure of an academic calendar—whether ...,single_hop_specifc_query_synthesizer
1,"medicine program clinical work, it gotta be in...",[],[Inclusion of Clinical Work in a Standard Term...,Could you please clarify your question a bit m...,If the clinical work in a medicine program mee...,single_hop_specifc_query_synthesizer
2,Wen do Title IV program disbursements need to ...,"[{'messages': [HumanMessage(content=""Title IV ...",[Non-Term Characteristics A program that measu...,Title IV program disbursements for non-term pr...,"Title IV program disbursements, except for Fed...",single_hop_specifc_query_synthesizer
3,so like if a student gettin a TEACH Grant in a...,"[{'messages': [HumanMessage(content=""Exceeding...",[both the credit or clock hours and the weeks ...,Exceeding scheduled weeks or hours in a paymen...,if a student gettin a TEACH Grant in a clock-h...,single_hop_specifc_query_synthesizer
4,How does the disbursement timing for federal s...,[{'messages': [HumanMessage(content='The disbu...,[<1-hop>\n\nboth the credit or clock hours and...,The disbursement timing for federal student ai...,In clock-hour or non-term credit-hour programs...,multi_hop_abstract_query_synthesizer
5,How do the disbursement timing requirements fo...,[{'messages': [HumanMessage(content='The disbu...,[<1-hop>\n\nboth the credit or clock hours and...,The disbursement timing requirements for feder...,In clock-hour or non-term credit-hour programs...,multi_hop_abstract_query_synthesizer
6,If a practicum or clinical experience is requi...,[{'messages': [HumanMessage(content='Condition...,[<1-hop>\n\nInclusion of Clinical Work in a St...,Clinical or practicum experiences required for...,A practicum or clinical experience required fo...,multi_hop_abstract_query_synthesizer
7,How do the disbursement timing requirements fo...,[{'messages': [HumanMessage(content='The disbu...,[<1-hop>\n\nboth the credit or clock hours and...,The disbursement timing requirements for feder...,In clock-hour or non-term credit-hour programs...,multi_hop_abstract_query_synthesizer
8,"Acccording to Volume 8, Chapter 3, how does th...",[{'messages': [HumanMessage(content='According...,[<1-hop>\n\nInclusion of Clinical Work in a St...,"According to Volume 8, Chapter 3, the inclusio...","Volume 8, Chapter 3 explains that clinical wor...",multi_hop_specific_query_synthesizer
9,how do appendix a and appendix b help schools ...,[{'messages': [HumanMessage(content='Appendix ...,[<1-hop>\n\nboth the credit or clock hours and...,Appendix A and Appendix B provide detailed gui...,appendix a gives examples that illustrate the ...,multi_hop_specific_query_synthesizer


In [226]:
%%time
# Build the RAGAS dataset from the rows filled in by the parent-document loop.
pd_evaluation_dataset = EvaluationDataset.from_pandas(pd_golden_master_dataset.to_pandas())

# LLM wrapper handed to evaluate() as the judge model.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4.1-mini",
        temperature=0, # Lower temperature for more consistent outputs
        request_timeout=120   # Longer timeout for complex operations
    )
)

custom_run_config = RunConfig(timeout=360)

# Score the dataset on the six metrics listed below.
# NOTE(review): the recorded output of this cell shows one ValueError and three
# TimeoutError jobs, so some metric values in pd_result may be missing.
pd_result = evaluate(
    dataset=pd_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), 
             ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    token_usage_parser=get_token_usage_for_openai,
    run_config=custom_run_config
)

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

Exception raised in Job[11]: ValueError(zero-size array to reduction operation maximum which has no identity)
Exception raised in Job[35]: TimeoutError()
Exception raised in Job[47]: TimeoutError()
Exception raised in Job[71]: TimeoutError()


CPU times: user 9.73 s, sys: 1.42 s, total: 11.1 s
Wall time: 7min 50s


In [227]:
# One-row frame of summary metrics for the parent-document run, appended to the metrics CSV.
pd_raw_stats_df = pd.DataFrame([extract_ragas_metrics(pd_result, 'gpt-4.1-mini')])
record_metrics_from_run('Parent Document', pd_raw_stats_df)

In [None]:
# Load the cumulative metrics file written by record_metrics_from_run and display it.
collected_df = pd.read_csv('ragas-evaluation-metrics.csv')
collected_df

In [None]:
import visualize_retriever_performance
# Reload so the current state of the module file is used.
importlib.reload(visualize_retriever_performance)