In [2]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os
from langchain_cohere import CohereEmbeddings

load_dotenv()

embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")



neo4j_vector_retriever = Neo4jVector.from_existing_index(embeddings, index_name="file_chunks").as_retriever(search_kwargs={"k": 10})


In [3]:
from neo4j import GraphDatabase

neo4j_graph = GraphDatabase.driver(uri=os.environ["NEO4J_URI"], auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]))

In [4]:
query = "MATCH (n) RETURN count(n) as count"

with neo4j_graph.session() as session:
    result = session.run(query)
    count = result.single()[0]

print(f"Number of nodes in the graph: {count}")

Number of nodes in the graph: 103882


In [5]:
# Calculate Corvee vector using Cohere
from langchain_cohere import CohereEmbeddings

# Initialize Cohere embeddings
cohere_embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")

# Define a search prompt (you may want to replace this with your actual search prompt)
search_prompt = "Your search prompt here"

# Calculate the Corvee vector
corvee_vector = cohere_embeddings.embed_query(search_prompt)

print(f'Corvee vector length: {len(corvee_vector)}')
print(f'Corvee vector sample: {corvee_vector[:10]}')
print(f'Result:')

# You may want to adjust this Cypher query based on your specific Neo4j setup
cypher_query = '''
CALL db.index.vector.queryNodes("file_chunks", 10, $queryVector)
YIELD node, score
RETURN node.id AS id,
    node.text AS text,
    score
'''

# Execute the Cypher query (assuming you have a Neo4j connection set up)
with neo4j_graph.session() as session:
    result = session.run(cypher_query, queryVector=corvee_vector)
    for record in result:
        print(f"ID: {record['id']}, Score: {record['score']}")
        print(f"Text: {record['text'][:100]}...")  # Print first 100 characters of text
        print("---")

Corvee vector length: 1024
Corvee vector sample: [0.010772705, 0.018371582, -0.052825928, 0.031677246, -0.030197144, -0.03338623, -0.010223389, -0.021606445, -0.023880005, 0.01687622]
Result:
ID: b6f21ac7-beeb-453f-8a7b-c22bd3b2188a, Score: 0.7287719249725342
Text: - 3 -...
---
ID: 2e25a206-7003-4d84-a644-4be29c30d3d4, Score: 0.7287719249725342
Text: - 3 -...
---
ID: 89927bbf-996d-439c-9ab7-1a34f0d3c227, Score: 0.7287719249725342
Text: - 3 -...
---
ID: 0289625c-a19c-4d85-aae3-bcdd8c17be07, Score: 0.7287719249725342
Text: - 3 -...
---
ID: bc186d80-25b9-46d6-b58d-6cfb42c351a6, Score: 0.7284402847290039
Text: - 3 -...
---
ID: 29a85b62-6244-4a3b-a46f-4d3091c1bd6c, Score: 0.7284402847290039
Text: - 3 -...
---
ID: cbeec90e-6bee-4295-b486-b39c5b1ab104, Score: 0.7283279895782471
Text: - 3 -...
---
ID: 43e85bcd-98a0-4ed7-a99c-ce5aa9672e87, Score: 0.7267098426818848
Text: - 2 -...
---
ID: 1733c0a2-0a81-462a-8efd-6cfcb0959b73, Score: 0.7266347408294678
Text: - 2 -...
---
ID: 2b92ab79-1f4d-42a3-a2

In [6]:
# next parent doc retrieval

pdr_cypher ="""

MATCH (node)-[]-(fs_FileSection)-[]-(f:File)
WITH f, max(score) AS score // deduplicate parents
RETURN f.extracted_text AS text, score, {source: f.id} AS metadata
"""

In [7]:
parent_document_retriever = Neo4jVector.from_existing_index(embeddings, index_name="file_chunks", retrieval_query=pdr_cypher).as_retriever()

In [8]:
from langchain_core.runnables import chain
from neurapolis_retriever.graph import graph as retriever_graph
from langchain_core.documents import Document

# @chain
# def graph_retriever_chain(query):


#     state = {"query": query}
#     results = retriever_graph.invoke(state)

#     searches = results.get('searches', [])
#     docs = []

#     for search in searches:
#         hits = search.hits if hasattr(search, 'hits') else []
#         for hit in hits:
#             print(hit)
#             if hasattr(hit, 'related_file') and hit.related_file is not None:
#                 if hasattr(hit.related_file, 'extracted_text'):
#                     docs.append(Document(page_content=hit.related_file.extracted_text))

#     print(f"Number of extracted texts: {len(docs)}")

#     for i, doc in enumerate(docs, 1):
#         print(f"Extracted text {i}:")
#         print(doc.page_content[:500])  # Print first 500 characters of each text
#         print("\n---\n")

#     return docs


In [9]:
from langsmith import Client
from langsmith.evaluation import evaluate

import os
from dotenv import load_dotenv

load_dotenv()

client= Client(api_key=os.environ["LANGCHAIN_API_KEY"], api_url=os.environ["LANGCHAIN_ENDPOINT"])

# Define dataset: these are your test cases

from langsmith import Client
from langsmith.evaluation import evaluate
from langchain_openai import AzureChatOpenAI
from langsmith.evaluation import evaluate
from langsmith.schemas import Example, Run
from langchain_core.documents import Document

def document_id_matcher(root_run: Run, example: Example) -> dict:
    """
    Evaluates if any retrieved document matches the expected document ID.
    Supports both graph retriever output and standard document output.
    """
    example_output_id = example.outputs.get('f.id', '')
    
    if root_run.outputs:
        if 'searches' in root_run.outputs:
            # Graph retriever output
            searches = root_run.outputs['searches']
            for search in searches:
                if hasattr(search, 'hits'):
                    for hit in search.hits:
                        if hasattr(hit, 'related_file') and hit.related_file is not None:
                            doc_id = getattr(hit.related_file, 'id', None)
                            if doc_id == example_output_id:
                                comment = f"Found document with matching ID: {example_output_id}. Search type: {search.type.value}"
                                print(comment)
                                return {"key": "document_id_match", "score": 1.0, "comment": comment}
        elif 'output' in root_run.outputs:
            # Standard document output
            output = root_run.outputs['output']
            if isinstance(output, list) and output and isinstance(output[0], Document):
                for doc in output:
                    doc_id = doc.metadata.get('source')
                    if doc_id == example_output_id:
                        comment = f"Found document with matching source ID: {example_output_id}. Search type: Standard"
                        print(comment)
                        return {"key": "document_id_match", "score": 1.0, "comment": comment}

    comment = f"No document found with matching ID: {example_output_id}"
    print(comment)
    return {"key": "document_id_match", "score": 0.0, "comment": comment}



def document_relevance_grader(root_run: Run, example: Example) -> dict:
    
    print("root_run", root_run)
    """
    A simple evaluator that checks if retrieved documents are relevant to the question
    """
    doc_txt = ""
    input_question = example.inputs.get('question', '')
    


    if root_run.outputs and 'output' in root_run.outputs:
        output = root_run.outputs['output']
        if isinstance(output, list) and output and isinstance(output[0], Document):
            limited_docs = output
            doc_txt = "\n\n".join(doc.page_content for doc in limited_docs)
            
    us_client = Client(api_key="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")
    llm = AzureChatOpenAI(deployment_name="gpt-4o-mini", temperature=0)
    grade_prompt_doc_relevance = us_client.pull_prompt("rag-document-relevance")
    answer_grader = grade_prompt_doc_relevance | llm

    if not doc_txt:
        doc_txt = "No relevant documents found."

    score = answer_grader.invoke({"input":{
        "question": input_question,
        "documents": doc_txt 
    }})

    return {"key": "document_relevance", "score": float(score["Score"])}


In [10]:
os.environ["LANGCHAIN_PROJECT"] = "neurapolis-retriever-evaluations"
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://eu.api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "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"


In [11]:
outputs = None

import json
def example_evaluator(root_run: Run, example: Example) -> dict:

    
    global outputs
    outputs = root_run.outputs
    print(outputs)
    
    # Store run to file
    run_data = json.dumps(root_run.dict(), default=str, indent=2)
    with open('root_run_debug.txt', 'w') as f:
        f.write(run_data)
    # global outputs
    # outputs = root_run.outputs
    # print(outputs)


    return {"key": "example", "score": 1.0}




In [12]:
outputs

In [13]:
import langsmith
from langsmith.evaluation import EvaluationResult
from neurapolis_retriever.state import *

def evaluate_search(query, search_type, hits):
    gradings = []
    feedback_summary = []
    for hit in hits:
        if hasattr(hit, 'grading') and hit.grading is not None:
            gradings.append(hit.grading)
            feedback_summary.append({
                'is_relevant': hit.grading.is_relevant,
                'feedback': hit.grading.feedback,
                'file_id': hit.file_chunk.id if hasattr(hit, 'file_chunk') else None
            })
    
    relevant_gradings = [grading for grading in gradings if grading.is_relevant]
    irrelevant_gradings = [grading for grading in gradings if not grading.is_relevant]
    
    summary = summarize_gradings(relevant_gradings, irrelevant_gradings, feedback_summary)
    
    return {
        "query": query,
        "type": search_type,
        "relevant_hits": len(relevant_gradings),
        "irrelevant_hits": len(irrelevant_gradings),
        "summary": summary
    }

def evaluate_search_level(level, searches):
    level_summary = {
        "level": level,
        "queries": [],
        "total_relevant_hits": 0,
        "total_irrelevant_hits": 0,
        "summary": ""
    }
    
    for search in searches:
        if search.level == level:
            result = evaluate_search(search.query, search.type, search.hits)
            level_summary["queries"].append({"query": result["query"], "type": result["type"]})
            level_summary["total_relevant_hits"] += result["relevant_hits"]
            level_summary["total_irrelevant_hits"] += result["irrelevant_hits"]
            level_summary["summary"] += f"{result['summary']}\n"
    
    return level_summary

def summarize_gradings(relevant_gradings, irrelevant_gradings, feedback_summary):
    summary = f"Total gradings: {len(relevant_gradings) + len(irrelevant_gradings)}\n"
    summary += f"Relevant gradings: {len(relevant_gradings)}\n"
    summary += f"Irrelevant gradings: {len(irrelevant_gradings)}\n\n"
    summary += "Feedback summary:\n"
    for item in feedback_summary:
        relevance = "Relevant" if item['is_relevant'] else "Irrelevant"
        summary += f"- {relevance}: {item['feedback']} (File ID: {item['file_id']})\n"
    return summary

def evaluate_run(run: Run, example: Example) -> EvaluationResult:
    outputs = run.outputs
    searches = outputs['searches']
    results = []
    for search in searches:
        result = evaluate_search(search.query, search.type, search.hits)
        results.append(result)
    
    level_0_summary = evaluate_search_level(0, searches)
    level_1_summary = evaluate_search_level(1, searches)
    
    return EvaluationResult(
        key="custom_rag_evaluation",
        score=level_0_summary["total_relevant_hits"] / (level_0_summary["total_relevant_hits"] + level_0_summary["total_irrelevant_hits"]) if level_0_summary["total_relevant_hits"] + level_0_summary["total_irrelevant_hits"] > 0 else 0,
        comment=f"Level 0: {level_0_summary}\nLevel 1: {level_1_summary}",
        additional_results={
            "search_results": results,
            "level_0": level_0_summary,
            "level_1": level_1_summary
        }
    )


In [14]:
from neurapolis_retriever.state.grading import Grading
from collections import defaultdict
from neurapolis_retriever.state.hit import HitStep

def parent_retrieval_evaluation(run: Run, example: Example) -> EvaluationResult:
    query = example.inputs.get('question', '')
    outputs = run.outputs
    if not outputs or 'searches' not in outputs or not outputs['searches']:
        return EvaluationResult(
            key="parent_retrieval_evaluation",
            score=0,
            comment="No searches found in outputs"
        )

    relevant_count = 0
    irrelevant_count = 0
    search_type_relevance = defaultdict(lambda: {"relevant": 0, "irrelevant": 0})
    
    for search in outputs['searches']:
        if search.level != 0:
            continue
        search_type = search.type.value
        if search.hits:
            for hit in search.hits:
                if hasattr(hit, 'grading') and hit.grading is not None and hit.step != HitStep.DOUBLED:
                    if hit.grading.is_relevant:
                        relevant_count += 1
                        search_type_relevance[search_type]["relevant"] += 1
                    else:
                        irrelevant_count += 1
                        search_type_relevance[search_type]["irrelevant"] += 1

    total_count = relevant_count + irrelevant_count
    if total_count == 0:
        return EvaluationResult(
            key="parent_retrieval_evaluation",
            score=0,
            comment="No graded hits found for parent retrieval"
        )
    
    relevance_score = relevant_count / total_count
    
    comment = f"Query: {query}\n"
    comment += f"Relevant hits: {relevant_count}\n"
    comment += f"Irrelevant hits: {irrelevant_count}\n"
    
    comment += "\nSearch Type Relevance:\n"
    for search_type, counts in search_type_relevance.items():
        total = counts["relevant"] + counts["irrelevant"]
        if total > 0:
            type_relevance_score = counts["relevant"] / total
            comment += f"{search_type}: {type_relevance_score:.2f} ({counts['relevant']} relevant, {counts['irrelevant']} irrelevant)\n"
    
    return EvaluationResult(
        key="parent_retrieval_evaluation",
        score=relevance_score,
        comment=comment
    )

def sub_retrieval_evaluation(run: Run, example: Example) -> EvaluationResult:
    query = example.inputs.get('question', '')
    outputs = run.outputs
    if not outputs or 'searches' not in outputs or not outputs['searches']:
        return EvaluationResult(
            key="sub_retrieval_evaluation",
            score=0,
            comment="No searches found in outputs"
        )

    relevant_count = 0
    irrelevant_count = 0
    search_type_relevance = defaultdict(lambda: {"relevant": 0, "irrelevant": 0})
    
    for search in outputs['searches']:
        if search.level != 1:
            continue
        search_type = search.type.value
        if search.hits:
            for hit in search.hits:
                if hasattr(hit, 'grading') and hit.grading is not None and hit.step != HitStep.DOUBLED:
                    if hit.grading.is_relevant:
                        relevant_count += 1
                        search_type_relevance[search_type]["relevant"] += 1
                    else:
                        irrelevant_count += 1
                        search_type_relevance[search_type]["irrelevant"] += 1

    total_count = relevant_count + irrelevant_count
    if total_count == 0:
        return EvaluationResult(
            key="sub_retrieval_evaluation",
            score=0,
            comment="No graded hits found for sub-retrieval"
        )
    
    relevance_score = relevant_count / total_count
    
    comment = f"Query: {query}\n"
    comment += f"Relevant hits: {relevant_count}\n"
    comment += f"Irrelevant hits: {irrelevant_count}\n"
    
    comment += "\nSearch Type Relevance:\n"
    for search_type, counts in search_type_relevance.items():
        total = counts["relevant"] + counts["irrelevant"]
        if total > 0:
            type_relevance_score = counts["relevant"] / total
            comment += f"{search_type}: {type_relevance_score:.2f} ({counts['relevant']} relevant, {counts['irrelevant']} irrelevant)\n"
    
    return EvaluationResult(
        key="sub_retrieval_evaluation",
        score=relevance_score,
        comment=comment
    )

def best_search_type_evaluation(run: Run, example: Example) -> EvaluationResult:
    outputs = run.outputs
    if not outputs or 'searches' not in outputs or not outputs['searches']:
        return EvaluationResult(
            key="best_search_type_evaluation",
            score=0,
            comment="No searches found in outputs"
        )

    search_type_relevance = defaultdict(lambda: {"relevant": 0, "irrelevant": 0})
    
    for search in outputs['searches']:
        search_type = search.type.value
        if search.hits:
            for hit in search.hits:
                if hasattr(hit, 'grading') and hit.grading is not None and hit.step != HitStep.DOUBLED:
                    if hit.grading.is_relevant:
                        search_type_relevance[search_type]["relevant"] += 1
                    else:
                        search_type_relevance[search_type]["irrelevant"] += 1

    best_score = 0
    best_type = None
    comment = "Search Type Relevance:\n"
    
    for search_type, counts in search_type_relevance.items():
        total = counts["relevant"] + counts["irrelevant"]
        if total > 0:
            type_relevance_score = counts["relevant"] / total
            comment += f"{search_type}: {type_relevance_score:.2f} ({counts['relevant']} relevant, {counts['irrelevant']} irrelevant)\n"
            if type_relevance_score > best_score:
                best_score = type_relevance_score
                best_type = search_type

    if best_type:
        comment += f"\nBest performing search type: {best_type}"
    else:
        comment += "\nNo search types with relevance data found"

    return EvaluationResult(
        key="best_search_type_evaluation",
        score=best_score,
        comment=comment
    )

def overall_relevance_evaluation(run: Run, example: Example) -> EvaluationResult:
    outputs = run.outputs
    if not outputs or 'searches' not in outputs or not outputs['searches']:
        return EvaluationResult(
            key="overall_relevance_evaluation",
            score=0,
            comment="No searches found in outputs"
        )

    relevant_count = 0
    irrelevant_count = 0
    
    for search in outputs['searches']:
        if search.hits:
            for hit in search.hits:
                if hasattr(hit, 'grading') and hit.grading is not None and hit.step != HitStep.DOUBLED:
                    if hit.grading.is_relevant:
                        relevant_count += 1
                    else:
                        irrelevant_count += 1

    total_count = relevant_count + irrelevant_count
    if total_count == 0:
        return EvaluationResult(
            key="overall_relevance_evaluation",
            score=0,
            comment="No graded hits found"
        )
    
    overall_score = relevant_count / total_count
    
    comment = f"Overall relevance score: {overall_score:.2f}\n"
    comment += f"Total relevant hits: {relevant_count}\n"
    comment += f"Total irrelevant hits: {irrelevant_count}\n"
    
    return EvaluationResult(
        key="overall_relevance_evaluation",
        score=overall_score,
        comment=comment
    )

evaluators = [
    parent_retrieval_evaluation,
    sub_retrieval_evaluation,
    best_search_type_evaluation,
    overall_relevance_evaluation
]


In [15]:
examples = list(client.list_examples(dataset_name="Freiburg_QA_ID"))
samples = examples[:1]

In [16]:
from neurapolis_retriever.graph import graph as graph_retriever

In [17]:


# result = evaluate( 
#     lambda x: graph_retriever.invoke({"query": x["question"]}),
#     data=samples,  # Recreated dataset for this experiment
#     evaluators=[document_id_matcher],
#     experiment_prefix="example",  # The name of the experiment
#     metadata={
#         "version": "1.0.0",
#         "revision_id": "beta"
#     },
#     client=client   
# )


In [18]:
from langchain_core.runnables import chain

@chain
def get_page_contents(query):
    results = graph_retriever.invoke({"query": query})
    searches = results.get("searches", [])
    page_contents = ""
    for search in searches[:1]:
        hits = search.hits if hasattr(search, "hits") else []
        for hit in hits:
            print(hit)
            if hasattr(hit, "related_file") and hit.related_file is not None:
                if hasattr(hit.related_file, "extracted_text"):
                    page_contents += hit.related_file.extracted_text + "\n"
                if hasattr(hit.related_file, "access_url"):
                    page_contents += f"Access URL: {hit.related_file.access_url}\n"
    return page_contents.strip()

In [19]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(deployment_name="gpt-4o-mini", temperature=0)




prompt = client.pull_prompt("rag-prompt")

prompt

# # Convert the Prompt object to a PromptTemplate
# prompt_template = PromptTemplate(
#     input_variables=["question", "context"],
#     template=prompt.template
# )

# Create a chain using the RunnablePassthrough and PromptTemplate
chain = (
    {
        "question": RunnablePassthrough(),
        "context": get_page_contents
    } | prompt | llm | StrOutputParser()
   

)


  prompt = loads(json.dumps(prompt_object.manifest))


In [20]:
chain.invoke("Welche Themen werden voraussichtlich in der Sitzung am 25.07.2024 behandelt??")

filesection_id 88c4e328-5646-43e1-9203-f909139ecd2e file https://ris.freiburg.de/oparl/file/sia%7C2024-GR-269%7C2
filesection_id 3d0eb6f9-ec7e-41e0-90b3-dfe889c13c78 file https://ris.freiburg.de/oparl/file/sia%7C2024-GR-268%7C2
filesection_id 30bf6011-fdb4-4f34-b141-b6918d58db56 file https://ris.freiburg.de/oparl/file/sia%7C2024-GR-268%7C1
filesection_id fd9d62ae-9c18-48d3-92f9-46db8a542cf3 file https://ris.freiburg.de/oparl/file/sia%7C2024-GR-267%7C1
filesection_id 7f0b864f-b857-47e8-a3e5-b07d86882d58 file https://ris.freiburg.de/oparl/file/sia%7C2024-OR_TI-115%7C3
filesection_id 338287d2-24a2-4d4c-a7a5-fb9b5faa1d49 file https://ris.freiburg.de/oparl/file/sia%7C2024-OR_TI-115%7C3
filesection_id 09dca5bc-ee0a-483a-9f16-8e957d5b332b file https://ris.freiburg.de/oparl/file/sia%7C2024-OR_TI-115%7C3
filesection_id 96c16d3d-fd9a-41f2-a7d4-d8b962ee8031 file https://ris.freiburg.de/oparl/file/sia%7C2024-HFA-87%7C1
filesection_id daa4f31b-e577-4f71-a7b0-6f3e228845c3 file https://ris.freiburg.d



https://ris.freiburg.de/oparl/file/sia%7C2024-GR-268%7C1
related_meeting id='https://ris.freiburg.de/oparl/meeting/ni_2024-GR-268' type='https://schema.oparl.org/1.0/Meeting' name='8. Sitzung des Gemeinderates (Verabschiedung)' meeting_state='terminiert' cancelled=False start=datetime.datetime(2024, 7, 24, 16, 9) end=datetime.datetime(2024, 7, 24, 18, 20) location=None organization=None participant=None invitation=None results_protocol=None verbatim_protocol=None auxiliary_file=None agenda_item=None license=None keyword=None created=datetime.datetime(2024, 9, 13, 0, 0) modified=datetime.datetime(2024, 9, 13, 0, 0) web=None deleted=False
related_paper None
related_consultation None
related_agenda_item None
https://ris.freiburg.de/oparl/file/sia%7C2024-GR-268%7C2
related_meeting id='https://ris.freiburg.de/oparl/meeting/ni_2024-GR-268' type='https://schema.oparl.org/1.0/Meeting' name='8. Sitzung des Gemeinderates (Verabschiedung)' meeting_state='terminiert' cancelled=False start=datetime

'In der Sitzung am 25.07.2024 werden voraussichtlich die Verabschiedungen der ausscheidenden Ortsvorsteher_innen und des Gemeinderats der Amtsperiode 2019 - 2024 behandelt. Zudem sind Bekanntgaben und Aktuelles auf der Tagesordnung. Die konstituierende Sitzung des neuen Gemeinderates für die Amtsperiode 2024 - 2029 folgt im Anschluss. \n\nAccess URL: https://ris.freiburg.de/documents.php?id=69&inline=1&document_type_id=11&meeting_attachment_id=ni_2024-GR-268%7C20240712125630-0_2024-GR-268_1.pdf'

In [23]:
from langchain import hub
from langchain_openai import ChatOpenAI

us_client = Client(api_key="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", api_url="https://api.smith.langchain.com")


# Grade prompt 
grade_prompt_answer_accuracy = prompt = us_client.pull_prompt("langchain-ai/rag-answer-vs-reference",)

def answer_evaluator(run, example) -> dict:
    """
    A simple evaluator for RAG answer accuracy
    """
    
    # Get summary
    input_question = example.inputs["question"]
    reference = example.outputs["answer"]
    prediction = run.outputs["output"]  # Changed from "answer" to "output"

    # LLM grader
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Structured prompt
    
    answer_grader = grade_prompt_answer_accuracy | llm

    # Get score
    score = answer_grader.invoke({"question": input_question,
                                  "correct_answer": reference,
                                  "student_answer": prediction})
    score = score["Score"]

    return {"key": "answer_score", "score": score}

In [24]:
result = evaluate( 
    lambda x:  chain.invoke(x["question"]),
    data=samples,  # Recreated dataset for this experiment
    evaluators=[answer_evaluator],
    experiment_prefix="example",  # The name of the experiment
    metadata={
        "version": "1.0.0",
        "revision_id": "beta"
    },
    client=client   
)


View the evaluation results for experiment: 'example-ed91b322' at:
https://eu.smith.langchain.com/o/f2baf51a-5907-4625-b2ab-a98883dd8671/datasets/53d8174c-38ca-41fd-9dc8-faa6ef72478a/compare?selectedSessions=089e3a28-6246-4d7f-b181-d5ed297af052




0it [00:00, ?it/s]

filesection_id 82132830-3e1b-4294-9e09-f216ba0c32da file https://ris.freiburg.de/oparl/file/sia%7C2024-GR-262%7C3
filesection_id 47658b3f-5c01-4776-ba9a-04adbfc20c75 file https://ris.freiburg.de/oparl/file/sia%7C2024-KA-156%7C1
filesection_id 436714a0-80f1-45c4-95f8-88870ef10f4b file https://ris.freiburg.de/oparl/file/sia%7C2024-OR_OPF-92%7C3
filesection_id 33388212-f1d7-4567-b9ee-7aa40fe79b02 file https://ris.freiburg.de/oparl/file/sia%7C2023-BehB-14%7C5
filesection_id d9d1d2e8-37f6-4886-ab91-ca8f28f7ef56 file https://ris.freiburg.de/oparl/file/sia%7C2023-BehB-14%7C8
filesection_id 8c836e0c-f930-4f10-aca4-876b76e112a2 file https://ris.freiburg.de/oparl/file/sia%7C2021-OR_MU-63%7C2
filesection_id f6e98a07-92e4-4df4-aad0-ed6d1d40e218 file https://ris.freiburg.de/oparl/file/sia%7C2021-OR_MU-58%7C2
filesection_id 2714a009-bd42-449e-8bb8-8740a5829b6e file https://ris.freiburg.de/oparl/file/vla%7C5160109100198%7C3
filesection_id d8b71126-7b19-4856-88bc-c721e13040c0 file https://ris.freiburg



https://ris.freiburg.de/oparl/file/5161210100002
related_meeting id='https://ris.freiburg.de/oparl/meeting/ni_2023-GR-249' type='https://schema.oparl.org/1.0/Meeting' name='4. Sitzung des Gemeinderates' meeting_state='terminiert' cancelled=False start=datetime.datetime(2023, 4, 25, 16, 6) end=datetime.datetime(2023, 4, 25, 20, 25) location=None organization=None participant=None invitation=None results_protocol=None verbatim_protocol=None auxiliary_file=None agenda_item=None license=None keyword=None created=datetime.datetime(2024, 9, 13, 0, 0) modified=datetime.datetime(2024, 9, 13, 0, 0) web=None deleted=False
related_paper id='https://ris.freiburg.de/oparl/paper/5161210100002' type='https://schema.oparl.org/1.0/Paper' body=None name='Sonderrechnung Kleineschholz:\r\nFortschreibung der Sonderrechnung für 2023/2024 mit Kosten- und Finanzierungsübersicht' reference='G-23/003' date=datetime.datetime(2023, 4, 6, 0, 0) paper_type='BESCHLUSS-VORLAGE' related_paper=None superordinated_paper

Error running evaluator <DynamicRunEvaluator answer_evaluator> on run 3f8ba053-f785-43bb-b1a9-080b91a70546: KeyError('answer')
Traceback (most recent call last):
  File "/Users/pascal/neurapolis/evals/retriever/.venv/lib/python3.12/site-packages/langsmith/evaluation/_runner.py", line 1344, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/pascal/neurapolis/evals/retriever/.venv/lib/python3.12/site-packages/langsmith/evaluation/evaluator.py", line 327, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "/Users/pascal/neurapolis/evals/retriever/.venv/lib/python3.12/site-packages/langsmith/run_helpers.py", line 646, in wrapper
    raise e
  File "/Users/pascal/neurapolis/evals/retriever/.venv/lib/python3.12/site-packages/langsmith/run_helpers.py", line 643, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^