In [7]:
import os
import json
from langsmith import Client, traceable
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langsmith.evaluation import evaluate
from dotenv import load_dotenv

# Load environment variables
load_dotenv(override=True)

# Initialize LangSmith client
client = Client()

# Define the evaluation target
@traceable
def evaluate_sectionizer(input_data):
    messages = [
        {
            "role": "system",
            "content": """You are tasked with evaluating a submission based on the provided original file text and the assigned sectionizer type. You will also receive the first 500 characters of each section generated by the submission. Your evaluation involves two tasks:

Task 1: Validate Sectionizer Type
• Objective: Determine if the assigned sectionizer type is correct for the given file.
• Possible Types:
  • NOT_RELATED_TOPICS: The document contains multiple unrelated topics, often seen in meeting minutes with varied agenda items.
  • RELATED_TOPICS: The document revolves around a single main topic, possibly with several subtopics related to that central theme.
  • OTHER: The document requires further analysis or pertains to a topic that cannot be extracted from text alone (e.g., election results).
• Output: Provide a boolean (True/False) indicating whether the sectionizer type is correct, along with a brief justification for your decision.

Task 2: Evaluate Document Splitting
• Objective: Verify if the document was split correctly based on the assigned sectionizer type.
• Criteria:
  • The document should only be split into multiple sections if the sectionizer type is NOT_RELATED_TOPICS.
  • Evaluate the logical placement of the splits, if applicable.
• Output: Confirm whether the document was split correctly (True/False) and provide a brief explanation.

Respond with a JSON object containing the following keys:

    "sectionizer_type_correct": boolean,
    "sectionizer_type_justification": "string",
    "splitting_correct": boolean,
    "splitting_explanation": "string"
""",
        },
        {"role": "user", "content": f"Original File Text and sectionizer type:\n\n{input_data['input']}\n\nGenerated Sections (First 500 Characters Each):\n\n{input_data['submission']}"},
    ]
    chat_model = ChatOpenAI(model="gpt-4", temperature=0)
    result = chat_model.invoke(messages)
    print("\033[94m" + result.content + "\033[0m")
    return result.content


dataset_name = "neurapolis-file-sections"
# dataset = client.create_dataset(dataset_name=dataset_name)
# inputs, outputs = zip(*[(example[0], example[1]) for example in examples])
# client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

# Define the evaluator
def correct_evaluation(run, example):
    # Print the input data for debugging
    print("Input data:")
    print(json.dumps(example.inputs, indent=2))
    
    # Print the run output for debugging
    print("\nRun output:")
    print(json.dumps(run.outputs, indent=2))
    run_output = run.outputs.get("sections")
    expected_output = example.outputs
    
    # Parse the JSON string in run_output
    run_output_dict = json.loads(run_output)
    
    # Compare each field
    sectionizer_type_correct = run_output_dict["sectionizer_type_correct"] == expected_output["sectionizer_type_correct"]
    splitting_correct = run_output_dict["splitting_correct"] == expected_output["splitting_correct"]
    
    # Calculate overall score
    score = (sectionizer_type_correct + splitting_correct) / 2
    
    return {"score": score, "key": "correct_evaluation"}

# Run the evaluation
results = evaluate(
    evaluate_sectionizer,
    data=dataset_name,
    evaluators=[correct_evaluation],
    experiment_prefix="Sectionizer Evaluation",
    description="Testing the sectionizer evaluation system.",
)

print(f"Evaluation results: {results}")

# Example of using a LangChain runnable
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are tasked with evaluating a submission based on the provided original file text and the assigned sectionizer type. You will also receive the first 500 characters of each section generated by the submission. Your evaluation involves two tasks:

Task 1: Validate Sectionizer Type
• Objective: Determine if the assigned sectionizer type is correct for the given file.
• Possible Types:
  • NOT_RELATED_TOPICS: The document contains multiple unrelated topics, often seen in meeting minutes with varied agenda items.
  • RELATED_TOPICS: The document revolves around a single main topic, possibly with several subtopics related to that central theme.
  • OTHER: The document requires further analysis or pertains to a topic that cannot be extracted from text alone (e.g., election results).
• Output: Provide a boolean (True/False) indicating whether the sectionizer type is correct, along with a brief justification for your decision.

Task 2: Evaluate Document Splitting
• Objective: Verify if the document was split correctly based on the assigned sectionizer type.
• Criteria:
  • The document should only be split into multiple sections if the sectionizer type is NOT_RELATED_TOPICS.
  • Evaluate the logical placement of the splits, if applicable.
• Output: Confirm whether the document was split correctly (True/False) and provide a brief explanation.

Respond with a JSON object containing the following keys:

    "sectionizer_type_correct": boolean,
    "sectionizer_type_justification": "string",
    "splitting_correct": boolean,
    "splitting_explanation": "string"
"""),
    ("user", "Original File Text and sectionizer type:\n\n{input}\n\nGenerated Sections (First 500 Characters Each):\n\n{submission}")
])
chat_model = ChatOpenAI(model="gpt-4")
output_parser = StrOutputParser()

chain = prompt | chat_model | output_parser

# Evaluate the LangChain runnable
langchain_results = evaluate(
    chain.invoke,
    data=dataset_name,
    evaluators=[correct_evaluation],
    experiment_prefix="Sectionizer Evaluation LangChain",
)

print(f"LangChain evaluation results: {langchain_results}")


View the evaluation results for experiment: 'Sectionizer Evaluation-19674873' at:
https://eu.smith.langchain.com/o/f2baf51a-5907-4625-b2ab-a98883dd8671/datasets/20233d7f-efe8-40b1-afbb-457beb5f271f/compare?selectedSessions=979899df-af92-44f9-abf2-775c999ae917




0it [00:00, ?it/s]Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running target function: 'input'
Error running evaluator <DynamicRunEvaluator correct_evaluation> on run 397134b4-63c3-4b42-9077-fa7f59546c7a: TypeError('the

Input data:
{
  "file": "Das B\u00fcrgermeisteramt der Stadt Freiburg im Breisgau - Dezernat I -\n\nFreiburg i. Br., 16.02.2024  \nTel.: 0761/201-1110  \nHerr Knobloch\n\n2. Sitzung des Gemeinderates\n\nMitglieder des Gemeinderates\n\nIch lade zu der am\n\nDienstag, 27. Februar 2024, 18:00 Uhr\n\nim Neuen Ratssaal des Rathauses stattfindenden Sitzung des Gemeinderates ein.\n\nT a g e s o r d n u n g\n\n\u00d6ffentlicher Teil\n\n1. Bebauungsplan mit \u00f6rtlichen Bauvorschriften und Ausgleichsfl\u00e4chen auf den Gemarkungen Freiburg, Lehen, Waltershofen, Opfingen und Hochdorf \"Dietenbach - Am Frohnholz\", Plan-Nr. 6-175  \n   a) Entscheidung \u00fcber die im Rahmen der ersten f\u00f6rmlichen \u00d6ffentlichkeits- und Beh\u00f6rdenbeteiligung eingegangenen Stellungnahmen  \n   b) Beschluss des Planentwurfs f\u00fcr die erneute f\u00f6rmliche \u00d6ffentlichkeits- und Beh\u00f6rdenbeteiligung (Offenlagebeschluss) und das weitere Verfahren  \n   c) Beschluss des zentralen Versorgungsber




View the evaluation results for experiment: 'Sectionizer Evaluation LangChain-f6e9b5cd' at:
https://eu.smith.langchain.com/o/f2baf51a-5907-4625-b2ab-a98883dd8671/datasets/20233d7f-efe8-40b1-afbb-457beb5f271f/compare?selectedSessions=2590c352-e084-4708-a3b3-3aa21a6979bb




0it [00:00, ?it/s]Error running target function: "Input to ChatPromptTemplate is missing variables {'input', 'submission'}.  Expected: ['input', 'submission'] Received: ['file', 'sectionizer_type']\nNote: if you intended {input} to be part of the string and not a variable, please escape it with double curly braces like: '{{input}}'."
Error running evaluator <DynamicRunEvaluator correct_evaluation> on run bd6add38-a3de-4394-9e59-fea24284e051: TypeError('the JSON object must be str, bytes or bytearray, not NoneType')
Traceback (most recent call last):
  File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/_runner.py", line 1323, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/evaluator.py", line 327, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "/Users/pascal/ne

Input data:
{
  "file": "# Freiburg IM BREISGAU\n\nB\u00fcrgermeisteramt\n\nDezernat III\n\nAdresse: Rathausplatz 2-4 D-79098 Freiburg i. Br.\n\nTelefon: +49 761 201-3505  \nTelefax: +49 761 201-3508  \nInternet: www.freiburg.de  \nE-Mail*: dez-iii@stadt.freiburg.de\n\n---\n\nStadt Freiburg im Breisgau \u00b7 B\u00fcrgermeisteramt Dezernat III  \nPostfach, D-79095 Freiburg\n\nMitglieder des Beirates f\u00fcr Menschen mit Behinderung der Stadt Freiburg\n\n- per E-Mail im PDF-Format\n\n---\n\nIhr Zeichen/Schreiben vom Unser Aktenzeichen  \nIhnen schreibt Frau Baumgart Freiburg, den 22.02.2024\n\n---\n\nSitzung des Beirates f\u00fcr Menschen mit Behinderung am 05.03.2024\n\nSehr geehrte Damen und Herren,\n\nhiermit lade ich Sie herzlich zur n\u00e4chsten Sitzung des Behindertenbeirates am\n\nDienstag, den 05.03.2024,  \nvon 16:00 Uhr bis 18:00 Uhr,  \nim Innenstadtrathaus,  \nNeuer Ratssaal\n\nein.\n\nFolgende Tagesordnung ist vorgesehen:\n\n## \u00d6ffentlicher Teil\n\n- TOP 1: Miet-E-Sc

Error running target function: "Input to ChatPromptTemplate is missing variables {'input', 'submission'}.  Expected: ['input', 'submission'] Received: ['file', 'sectionizer_type']\nNote: if you intended {input} to be part of the string and not a variable, please escape it with double curly braces like: '{{input}}'."
Error running evaluator <DynamicRunEvaluator correct_evaluation> on run b33c5c9b-b366-4f0c-b532-f1357c757c35: TypeError('the JSON object must be str, bytes or bytearray, not NoneType')
Traceback (most recent call last):
  File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/_runner.py", line 1323, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/evaluator.py", line 327, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "/Users/pascal/neurapolis/evals/.ve

Input data:
{
  "file": "- ![Image](image_url)\n- **Projektvorstellung AINS \u2013 AH**\n- 08.04.2024, Ausschuss f\u00fcr Wirtschaft und Wissenschaft, Stadt Freiburg\n- Impulsvortrag von Dr. Handirk von Ungern-Sternberg Handwerkskammer Freiburg\n- 08.04.2024\n- 1\n\n# AINS \u2013AH: Aus Indien nach S\u00fcdbaden - Auszubildende f\u00fcrs Handwerk\n\n![Handwerkskammer Freiburg](https://www.hwk-freiburg.de)\n\n![Image](https://www.hwk-freiburg.de)\n\n08.04.2024\n\n2\n\n![Handwerkskammer Freiburg](ly Handwerkskammer Freiburg)\n\n# Magic Billion Training Center, Noida Uttar Pradesh\n\n![Image](https://via.placeholder.com/1366x480)\n![Image](https://via.placeholder.com/158x523)\n\n08.04.2024\n\n11\n\n# Besuch von Bildungseinrichtungen\n\n![Bild](dl d a ee | ime)\n\n![Bild](dl d a ee | ime)\n\n![Bild](Handwerkskammer Freiburg)\n\n![Bild]()\n\n08.04.2024\n\n12\n\n![Handwerkskammer Freiburg](Handwerkskammer_Freiburg.jpg)\n\n![T-TEP WORKSHOP](T-TEP_WORKSHOP.jpg)\n\n08.04.2024\n\n13\n\n![Image 1

42it [00:00, 132.27it/s]

LangChain evaluation results: <ExperimentResults Sectionizer Evaluation LangChain-f6e9b5cd>





In [5]:
# Import necessary libraries
from neo4j import GraphDatabase

# Neo4j connection details
uri = "neo4j+s://efdb5c8e.databases.neo4j.io"  # Neo4j Aura URI
username = "neo4j"  # Default username for Neo4j
password = "ipzncqdCIFfFo87sMyKnAJ1p-D8ecUcGsxJdr2vLH_k"  # Provided password

# Function to create a Neo4j driver
def create_neo4j_driver():
    return GraphDatabase.driver(uri, auth=(username, password))

# Function to close the Neo4j driver
def close_neo4j_driver(driver):
    driver.close()

# Function to create nodes and relationships in Neo4j
def create_graph(driver, data):
    with driver.session() as session:
        # Create nodes for sections
        for i, section in enumerate(data['sections']):
            session.run(
                "CREATE (s:Section {id: $id, content: $content})",
                id=f"section_{i}", content=section[:500]  # Using first 500 characters
            )
        
        # Create relationships between sections
        for i in range(len(data['sections']) - 1):
            session.run(
                "MATCH (s1:Section {id: $id1}), (s2:Section {id: $id2}) "
                "CREATE (s1)-[:NEXT]->(s2)",
                id1=f"section_{i}", id2=f"section_{i+1}"
            )

        # Create a node for the document and link it to sections
        session.run(
            "CREATE (d:Document {type: $type}) "
            "WITH d "
            "MATCH (s:Section) "
            "CREATE (d)-[:CONTAINS]->(s)",
            type=data['sectionizer_type']
        )

# Utility function to perform custom queries
def execute_query(driver, query, parameters=None):
    with driver.session() as session:
        result = session.run(query, parameters)
        return [record.data() for record in result]

# Main execution
try:
    driver = create_neo4j_driver()
    
    # Assuming 'results' contains the evaluation data
    for result in results:
        create_graph(driver, result)
    
    print("Graph created successfully in Neo4j")

    # Example usage of the utility function
    custom_query = "MATCH (d:Document)-[:CONTAINS]->(s:Section) RETURN d.type, count(s) as section_count"
    query_result = execute_query(driver, custom_query)
    print("Custom query result:", query_result)

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    close_neo4j_driver(driver)



An error occurred: name 'results' is not defined


In [8]:
# Query to get an overview of the graph structure
query = """


MATCH (f:File)-[:FILE_HAS_FILE_SECTION]->(h:HqFileSection)
RETURN f, h
"""
result = execute_query(driver, query)
print(result)

  with driver.session() as session:


[{'f': {'date': '2022-05-16', 'file_name': '20220516132044-0_ni_2022-BehB-7_5.pdf', 'created': neo4j.time.DateTime(2024, 9, 13, 0, 0, 0, 0, tzinfo=pytz.FixedOffset(120)), 'access_url': 'https://ris.freiburg.de/documents.php?id=69&inline=1&document_type_id=11&meeting_attachment_id=ni_2022-BehB-7%7C20220516132044-0_ni_2022-BehB-7_5.pdf', 'sectionizer_reason': 'Das Dokument befasst sich umfassend mit dem Thema der kommunalen Teilhabeplanung (THP) für Menschen mit Behinderungen in Freiburg. Es behandelt verschiedene Aspekte und Unterthemen dieses Hauptthemas, bleibt jedoch stets auf das übergeordnete Thema der Teilhabeplanung fokussiert.', 'type': 'https://schema.oparl.org/1.0/File', 'extracted_text': 'Kommunale Teilhabeplanung (THP) für Menschen mit Behinderungen\nInformation des Beirats für Menschen mit Behinderung der Stadt Freiburg\nBehindertenbeirat Freiburg Seite 1 29.03.2022\nAmt für Soziales Teilhabeplanung und Psychiatriekoordination\n\nBegriffsklärung Teilhabeplanung (THP)\n\n„Te

In [13]:
# Map file texts to array of sections texts
file_sections = {}

for item in result:
    file = item['f']
    section = item['h']
    
    file_id = file['id']
    section_text = section['text']
    
    if file_id not in file_sections:
        file_sections[file_id] = {
            'file_name': file['file_name'],
            'extracted_text': file['extracted_text'],
            'sections': []
        }
    
    file_sections[file_id]['sections'].append(section_text)

# Print the result
for file_id, file_data in file_sections.items():
    print(f"File: {file_data['file_name']}")
    print(f"Number of sections: {len(file_data['sections'])}")
    print("First 100 characters of extracted text:", file_data['extracted_text'][:100])
    print("First 100 characters of each section:")
    for i, section_text in enumerate(file_data['sections'], 1):
        print(f"  Section {i}: {section_text[:100]}")
    print()


File: 20220516132044-0_ni_2022-BehB-7_5.pdf
Number of sections: 1
First 100 characters of extracted text: Kommunale Teilhabeplanung (THP) für Menschen mit Behinderungen
Information des Beirats für Menschen 
First 100 characters of each section:
  Section 1: Kommunale Teilhabeplanung (THP) für Menschen mit Behinderungen
Information des Beirats für Menschen 

File: 20200313115729-0_ni_2020-MIA-2_2.pdf
Number of sections: 6
First 100 characters of extracted text: Dezernat III -
Freiburg, den 13.03.2020 Tel.: 3012 / Frau Müller
Ergebnismitteilung
zur 1. Sitzung d
First 100 characters of each section:
  Section 1: 3 -

  Section 2: 
Auf Nachfrage aus dem Gremium werden von der Verwaltung die Regelungen für Sachkundige und Sachvers
  Section 3: - Drucksache ASW-20/002 - zur Information

Ergebnis
Der Ausschuss für Migration und Integration nimm
  Section 4: - mündlicher Bericht -

Ergebnis
Frau Dr. Niethammer, Leiterin des Amtes für Migration und Integrati
  Section 5: 
Es erfolgt ein Hinweis

In [16]:
from langsmith import Client
import os

# Setup Langsmith client
os.environ["LANGCHAIN_PROJECT"] = "neurabot-ca-evaluation"
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://eu.api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = "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"
client = Client(api_key="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", api_url="https://eu.api.smith.langchain.com")

# Create dataset
dataset = []
for file_id, file_data in file_sections.items():
    dataset.append({
        "input": {"file_text": file_data['extracted_text']},
        "output": {"sections": file_data['sections']}
    })

# Save to Langsmith
dataset_name = "File_Sections_Datasets"
langsmith_dataset = client.create_dataset(dataset_name, description="File texts and their corresponding sections")

# Create examples in the dataset
client.create_examples(
    inputs=[item["input"] for item in dataset],
    outputs=[item["output"] for item in dataset],
    dataset_id=langsmith_dataset.id,
)

print(f"Created dataset '{dataset_name}' with {len(dataset)} examples.")

Created dataset 'File_Sections_Datasets' with 15 examples.
