In [6]:
import os
from dotenv import load_dotenv
from langsmith import Client
from langsmith import evaluate
from langsmith.schemas import Example, Run
from pydantic import BaseModel, Field
import numpy as np
from langchain_core.prompts import PromptTemplate
from langchain_openai import AzureChatOpenAI

load_dotenv()

# Credentials must come from the environment (or the .env file loaded above).
# Assigning a literal to LANGCHAIN_API_KEY here would overwrite the real key
# with a placeholder, so only verify that the required settings are present.
_missing_settings = [
    name
    for name in ("LANGCHAIN_API_KEY", "LANGCHAIN_ENDPOINT")
    if not os.environ.get(name)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# LangSmith client and the reference examples used by the experiments below.
client = Client(api_key=os.environ["LANGCHAIN_API_KEY"], api_url=os.environ["LANGCHAIN_ENDPOINT"])
examples = list(client.list_examples(dataset_name="Neurapolis ETL Sectionizer"))

def type_comparator(root_run: Run, example: Example) -> dict:
    """Score whether the predicted document type equals the reference type.

    Args:
        root_run: Run whose ``outputs['output']`` holds the parsed model answer
            (an object exposing ``type``), or ``None`` when the call produced
            nothing.
        example: Dataset example whose ``outputs['type']`` is the expected type.

    Returns:
        dict with ``key`` ``"type_comparison"`` and ``score`` 1 on a match,
        otherwise 0. A run without an output counts as a mismatch instead of
        raising ``AttributeError``.
    """
    prediction = (root_run.outputs or {}).get('output')
    expected_type = (example.outputs or {}).get('type')
    if prediction is None:
        # Nothing was predicted, so it cannot match the reference.
        score = 0
    else:
        score = 1 if getattr(prediction, 'type', None) == expected_type else 0
    return {
        "key": "type_comparison",
        "score": score,
    }

def calculate_splits_similarity(proposed_splits, golden_splits):
    """Return how closely the proposed split lines track the golden split lines.

    Every proposed line number is matched to its nearest golden line number.
    Line numbers are normalised by the largest line number found in either
    list, the distance ``d`` to the nearest golden split is mapped to
    ``1 / (1 + d)``, and the mean over all proposed splits is returned.
    The measure is one-directional: golden splits that no proposed split is
    close to do not lower the result.

    Args:
        proposed_splits: Non-empty sequence of objects exposing ``line_number``.
        golden_splits: Non-empty sequence of mappings with a ``'line_number'``
            entry.

    Returns:
        Plain ``float``; 1.0 when every proposed split coincides with a golden
        split. For non-negative line numbers the value lies in [0.5, 1].

    Raises:
        ValueError: If either sequence is empty.
    """
    if not proposed_splits or not golden_splits:
        raise ValueError("proposed_splits and golden_splits must both be non-empty")

    proposed = np.array([split.line_number for split in proposed_splits], dtype=float)
    golden = np.array([split['line_number'] for split in golden_splits], dtype=float)

    # Normalise by the largest line number; fall back to 1 when that is not
    # positive so the division cannot produce NaN (e.g. all splits at line 0).
    largest_line = max(proposed.max(), golden.max())
    scale = largest_line if largest_line > 0 else 1.0
    proposed_norm = proposed / scale
    golden_norm = golden / scale

    # Distance from each proposed split to the closest golden split.
    distances = np.min(np.abs(proposed_norm[:, np.newaxis] - golden_norm), axis=1)
    similarities = 1 / (1 + distances)
    return float(np.mean(similarities))

def evaluate_splits(root_run: Run, example: Example) -> dict:
    """Score how well the predicted split positions match the reference splits.

    Only applies when both the prediction and the reference are of type
    ``NOT_RELATED_TOPICS``; every other combination, and a run without any
    output, yields ``score`` ``None``.

    Args:
        root_run: Run whose ``outputs['output']`` exposes ``type`` and
            ``splits`` (objects with ``line_number``), or is ``None``.
        example: Dataset example with ``outputs['type']`` and
            ``outputs['splits']`` (mappings with ``'line_number'``).

    Returns:
        dict with ``key`` ``"splits_comparison"``, a ``score`` (float, 0 or
        ``None``) and explanatory ``metadata``.
    """
    prediction = (root_run.outputs or {}).get('output')
    example_output = example.outputs or {}

    if prediction is None:
        # The run produced nothing to compare; report "not applicable"
        # instead of raising AttributeError on ``None.type``.
        return {
            "key": "splits_comparison",
            "score": None,
            "metadata": {
                "similarity": "N/A",
                "reason": "Run produced no output"
            }
        }

    if prediction.type != 'NOT_RELATED_TOPICS' or example_output.get('type') != 'NOT_RELATED_TOPICS':
        return {
            "key": "splits_comparison",
            "score": None,  # No score for non-NOT_RELATED_TOPICS types
            "metadata": {
                "similarity": "N/A",
                "reason": "Not applicable for this document type"
            }
        }

    proposed_splits = prediction.splits
    golden_splits = example_output.get('splits') or []

    if not proposed_splits or not golden_splits:
        # Exactly one side empty is a clear miss (0); with both sides empty
        # there is nothing to compare (None).
        only_one_side_empty = bool(proposed_splits) != bool(golden_splits)
        return {
            "key": "splits_comparison",
            "score": 0 if only_one_side_empty else None,
            "metadata": {
                "similarity": "N/A",
                "reason": "Either proposed or golden splits are empty"
            }
        }

    # float() keeps the reported score a plain Python number.
    similarity = float(calculate_splits_similarity(proposed_splits, golden_splits))
    return {
        "key": "splits_comparison",
        "score": similarity,
        "metadata": {
            "similarity": similarity,
            "proposed_splits": [split.line_number for split in proposed_splits],
            "golden_splits": [split['line_number'] for split in golden_splits],
        }
    }

def evaluate_split_count(root_run: Run, example: Example) -> dict:
    """Score how close the number of predicted splits is to the reference count.

    Only applies when both the prediction and the reference are of type
    ``NOT_RELATED_TOPICS``; every other combination, and a run without any
    output, yields ``score`` ``None``.

    Args:
        root_run: Run whose ``outputs['output']`` exposes ``type`` and
            ``splits``, or is ``None``.
        example: Dataset example with ``outputs['type']`` and
            ``outputs['splits']``.

    Returns:
        dict with ``key`` ``"split_count_comparison"``, a ``score`` of
        ``1 - |proposed - golden| / max(proposed, golden)`` (1 when both
        counts are zero) or ``None``, plus ``metadata`` with the raw counts.
    """
    prediction = (root_run.outputs or {}).get('output')
    example_output = example.outputs or {}

    if prediction is None:
        # The run produced nothing to count; report "not applicable"
        # instead of raising AttributeError on ``None.type``.
        return {
            "key": "split_count_comparison",
            "score": None,
            "metadata": {
                "count_match": "N/A",
                "reason": "Run produced no output"
            }
        }

    if prediction.type != 'NOT_RELATED_TOPICS' or example_output.get('type') != 'NOT_RELATED_TOPICS':
        return {
            "key": "split_count_comparison",
            "score": None,  # No score for non-NOT_RELATED_TOPICS types
            "metadata": {
                "count_match": "N/A",
                "reason": "Not applicable for this document type"
            }
        }

    proposed_count = len(prediction.splits or [])
    golden_count = len(example_output.get('splits') or [])

    # Relative count error, normalised by the larger of the two counts.
    count_difference = abs(proposed_count - golden_count)
    max_difference = max(proposed_count, golden_count)
    score = 1 - (count_difference / max_difference) if max_difference > 0 else 1

    return {
        "key": "split_count_comparison",
        "score": score,
        "metadata": {
            "proposed_count": proposed_count,
            "golden_count": golden_count,
            "count_difference": count_difference
        }
    }

def compound_metric(root_run: Run, example: Example) -> dict:
    """Combine type, split-position and split-count scores into one number.

    When the split metrics are not applicable (either returns ``None``) the
    compound score is the type score alone; otherwise it is
    ``0.4 * type + 0.3 * splits + 0.3 * count``. A run without any output
    scores 0 without invoking the individual evaluators.

    Args:
        root_run: Run whose ``outputs['output']`` holds the parsed model answer.
        example: Dataset example holding the reference outputs.

    Returns:
        dict with ``key`` ``"compound_metric"``, the combined ``score`` and
        ``metadata`` listing the component scores and the reason.
    """
    if (root_run.outputs or {}).get('output') is None:
        # Guard here as well so this evaluator never fails with
        # AttributeError on a run that produced no output.
        return {
            "key": "compound_metric",
            "score": 0,
            "metadata": {
                "type_score": 0,
                "splits_score": None,
                "count_score": None,
                "reason": "Run produced no output"
            }
        }

    type_score = type_comparator(root_run, example)['score']
    splits_score = evaluate_splits(root_run, example)['score']
    count_score = evaluate_split_count(root_run, example)['score']

    # If the type is not NOT_RELATED_TOPICS, only consider the type score
    if splits_score is None or count_score is None:
        compound_score = type_score
        reason = "Only type comparison applicable"
    else:
        # Combine scores (you can adjust the weights as needed)
        compound_score = 0.4 * type_score + 0.3 * splits_score + 0.3 * count_score
        reason = "All metrics applicable"

    return {
        "key": "compound_metric",
        "score": compound_score,
        "metadata": {
            "type_score": type_score,
            "splits_score": splits_score,
            "count_score": count_score,
            "reason": reason
        }
    }

# Structured-output schema for a single proposed split: the line number to cut
# at and a short justification.
# NOTE(review): documented with `#` comments rather than a docstring because a
# class docstring presumably becomes part of the schema sent to the LLM — confirm
# before adding one.
class FileSectionizerLineSplitLlmDataModel(BaseModel):
    # Line number at which the document should be split.
    line_number: int = Field(
        description="Zeilennummer, an der das Dokument aufgeteilt werden soll"
    )
    # Short justification for splitting at this position.
    reason: str = Field(
        description="Kurze Begründung für die Aufteilung an dieser Stelle"
    )

# Structured-output schema for the whole answer: the document type, the reason
# for that choice and the (possibly empty) list of proposed splits.
# NOTE(review): documented with `#` comments rather than a docstring because a
# class docstring presumably becomes part of the schema sent to the LLM — confirm
# before adding one.
class FileSectionizerLlmDataModel(BaseModel):
    # Content type of the document; the evaluators compare it against the
    # reference 'type' and test for the value 'NOT_RELATED_TOPICS'.
    type: str = Field(description="Der Inhaltstyp des Dokuments")
    # Short justification for the chosen content type.
    reason: str = Field(
        description="Kurze Begründung für die Entscheidung des Inhaltstyps"
    )
    # Proposed splits; defaults to an empty list when none are given.
    splits: list[FileSectionizerLineSplitLlmDataModel] = Field(
        description="Liste der Aufteilungen",
        default=[],
    )

# German prompt for the sectionizer: it asks the model to classify a document as
# NOT_RELATED_TOPICS, RELATED_TOPICS or OTHER and to mark split line numbers for
# NOT_RELATED_TOPICS documents only. `{text}` is the template placeholder that
# receives the document content.
prompt_template_string = """Du bist der Datei-Sektionierer im Rats Informations System (RIS) für Freiburg. Deine Hauptaufgabe ist die präzise Klassifizierung und, wenn nötig, Segmentierung von Dokumenten.
Dokumenttypen:

NOT_RELATED_TOPICS: Dokumente mit mehreren klar unverbundenen Themen, typischerweise Sitzungsprotokolle oder Ergebnismitteilungen mit verschiedenen Tagesordnungspunkten (TOPs).
RELATED_TOPICS: Dokumente, die sich auf ein Hauptthema konzentrieren, auch wenn sie Unterthemen oder detaillierte Diskussionen enthalten. Beispiele sind Bekanntmachungen, detaillierte Berichte zu einem spezifischen Thema oder Beschlussvorlagen zu einem einzelnen Thema.
OTHER: Strukturierte Dokumente wie Einladungen zu Sitzungen, Tagesordnungen oder ähnliche Formate, die nicht in die anderen Kategorien passen.

Analyserichtlinien:

Untersuche das gesamte Dokument sorgfältig, einschließlich Kopfzeilen, Datum, Absender und Empfänger.
Achte besonders auf die Struktur des Dokuments, Überschriften, Tagesordnungspunkte (TOPs) und thematische Abschnitte.
Berücksichtige den Gesamtkontext und -zweck des Dokuments.

Klassifizierung:

Wähle den am besten passenden Dokumenttyp basierend auf der Gesamtstruktur und dem Inhalt.
Gib eine detaillierte Begründung für deine Entscheidung an, die spezifische Elemente des Dokuments erwähnt.

Segmentierung:

Segmentiere NUR Dokumente des Typs NOT_RELATED_TOPICS.
Markiere den Beginn jeder neuen Sektion mit der entsprechenden Zeilennummer.
Segmentiere bei klaren Themenübergängen, insbesondere bei neuen Tagesordnungspunkten (TOPs).
Für OTHER und RELATED_TOPICS ist keine Segmentierung erforderlich.

Wichtige Hinweise:

Klassifiziere Einladungen zu Sitzungen, Tagesordnungen und ähnliche strukturierte Dokumente immer als OTHER, unabhängig von der Anzahl der aufgelisteten Themen.
Ergebnismitteilungen und Sitzungsprotokolle mit mehreren TOPs sind in der Regel als NOT_RELATED_TOPICS zu klassifizieren und zu segmentieren.
Bei RELATED_TOPICS ist keine Segmentierung erforderlich, auch wenn Unterthemen vorhanden sind.
Beachte, dass die ersten Zeilen oft Metadaten wie Datum, Absender oder Empfänger enthalten und nicht als separate Segmente behandelt werden sollten.
Sei konsistent in deiner Analyse und Segmentierung über verschiedene Dokumenttypen hinweg.
Berücksichtige, dass manche Dokumente mit einer Überschrift oder einem Titel beginnen können, der nicht als separates Segment gezählt werden sollte.
Beschlussvorlagen oder detaillierte Berichte zu einem einzelnen Hauptthema sollten als RELATED_TOPICS klassifiziert werden, auch wenn sie verschiedene Aspekte dieses Themas diskutieren.

<Inhalt des Dokuments>
{text}
</Inhalt des Dokuments>
Bitte klassifiziere das Dokument und, falls erforderlich, segmentiere es entsprechend den oben genannten Richtlinien. Gib eine detaillierte Begründung für deine Klassifizierung an und erkläre, warum du dich für oder gegen eine Segmentierung entschieden hast."""
# Build the chain: the prompt template piped into the Azure "gpt-4o" deployment,
# configured for structured output with FileSectionizerLlmDataModel.
prompt_template = PromptTemplate.from_template(prompt_template_string)
chain = prompt_template | AzureChatOpenAI(deployment_name="gpt-4o").with_structured_output(FileSectionizerLlmDataModel)
# Baseline experiment: invoke the chain with each example's "text_lines" input
# and score the result with the four evaluators.
result = evaluate(
    lambda x: chain.invoke(x["text_lines"]),
    data=examples,
    evaluators=[type_comparator, evaluate_splits, evaluate_split_count, compound_metric],
    experiment_prefix="example",
    client=client
)

ModuleNotFoundError: No module named 'dotenv'

In [None]:
result

In [None]:
list(result)

In [None]:
# Calculate the average of the compound metric
compound_metric_scores = []

for item in result:
    evaluation_results = item.get('evaluation_results', {}).get('results', [])
    for eval_result in evaluation_results:
        # Ignore entries without a numeric score (this may happen when an
        # evaluator failed); a None in the list would make sum() below raise
        # TypeError.
        if eval_result.key == 'compound_metric' and eval_result.score is not None:
            compound_metric_scores.append(eval_result.score)

average_compound_metric = sum(compound_metric_scores) / len(compound_metric_scores) if compound_metric_scores else 0

print(f"Average Compound Metric: {average_compound_metric:.4f}")


In [None]:
# Create a mapping of example outputs to actual outputs
output_mapping = {}

# Use a dedicated loop variable: `for result in result` would rebind the global
# `result` to the last row and break every later cell that iterates over the
# experiment results.
for row in result:
    run = row['run']
    example = row['example']

    example_id = str(example.id)

    # Map the example output to the actual output (None when the run has no output)
    output_mapping[example_id] = {
        'example_output': example.outputs,
        'actual_output': (run.outputs or {}).get('output')
    }

# Print the mapping for verification
for example_id, outputs in output_mapping.items():
    print(f"Example ID: {example_id}")
    print(f"Example Output: {outputs['example_output']}")
    print(f"Actual Output: {outputs['actual_output']}")
    print("---")


In [None]:
# NOTE(review): bare `raise` with no active exception — presumably a deliberate
# stop so that running all cells halts here; confirm.
raise

In [None]:
def analyze_deviations(results, include_splits=False):
    """Collect the evaluated rows whose prediction deviates from the reference.

    Args:
        results: Iterable of result rows, each a mapping with a ``'run'`` and
            an ``'example'`` entry. Rows missing either one, or whose run has
            no ``'output'``, are skipped.
        include_splits: When true, rows whose split line numbers differ from
            the reference are reported as well. Defaults to ``False``, i.e.
            only type mismatches are reported.

    Returns:
        list of dicts with the keys ``example_id``, ``example_output``,
        ``actual_output``, ``type_mismatch``, ``splits_mismatch`` plus the
        original ``example`` and ``run`` objects, so the records can be passed
        on to code that expects result rows.
    """
    def line_numbers(splits):
        # Reference splits are dicts while predicted splits are model objects;
        # reduce both to plain line numbers so they can be compared at all.
        return [
            split.get('line_number') if isinstance(split, dict)
            else getattr(split, 'line_number', None)
            for split in (splits or [])
        ]

    deviations = []
    for item in results:
        run = item.get('run')
        example = item.get('example')
        if not run or not example:
            continue

        example_output = example.outputs or {}
        actual_output = (run.outputs or {}).get('output')
        if not actual_output:
            continue

        # Check for deviations in 'type' and 'splits'
        type_mismatch = getattr(actual_output, 'type', None) != example_output.get('type')
        splits_mismatch = (
            line_numbers(getattr(actual_output, 'splits', None))
            != line_numbers(example_output.get('splits'))
        )

        if type_mismatch or (include_splits and splits_mismatch):
            deviations.append({
                'example_id': str(example.id),
                'example_output': example.outputs,
                'actual_output': actual_output,
                'type_mismatch': type_mismatch,
                'splits_mismatch': splits_mismatch,
                'example': example,
                'run': run,
            })
    return deviations

# Analyze deviations
# Any failure is printed and replaced by an empty list, so `deviations` is
# always defined after this cell.
try:
    deviations = analyze_deviations(list(result))
except Exception as e:
    print(f"Error analyzing deviations: {str(e)}")
    deviations = []

In [None]:
deviations

In [None]:
def generate_updated_prompt(results, current_prompt):
    """Ask the LLM to rewrite the prompt based on the mispredicted examples.

    Args:
        results: Iterable of mappings. Each is either a result row with
            ``'example'`` and ``'run'`` entries, or a deviation record with
            ``'example_id'``, ``'example_output'`` and ``'actual_output'``
            entries. Items without an actual output are skipped.
        current_prompt: The prompt text to improve.

    Returns:
        The rewritten prompt text returned by the model. When no item shows a
        type or split mismatch, ``current_prompt`` is returned unchanged and
        the model is not called.
    """
    def line_numbers(splits):
        # Reference splits are dicts while predicted splits are model objects;
        # reduce both to plain line numbers so they can be compared at all.
        return [
            split.get('line_number') if isinstance(split, dict)
            else getattr(split, 'line_number', None)
            for split in (splits or [])
        ]

    def preview(text, limit=200):
        # Short excerpt of the example input; tolerates a missing input.
        if text is None:
            return "N/A"
        return (text if isinstance(text, str) else str(text))[:limit]

    analysis = []
    for item in results:
        example = item.get('example')
        run = item.get('run')
        if example is not None and run is not None:
            example_id = str(example.id)
            example_output = example.outputs or {}
            actual_output = (run.outputs or {}).get('output')
            example_input = (example.inputs or {}).get('text_lines')
        else:
            # Deviation records carry the outputs directly instead of the
            # run/example objects; they have no input text to show.
            example_id = item.get('example_id')
            example_output = item.get('example_output') or {}
            actual_output = item.get('actual_output')
            example_input = None
        if not actual_output:
            continue

        actual_type = getattr(actual_output, 'type', None)
        type_mismatch = actual_type != example_output.get('type')
        splits_mismatch = (
            line_numbers(getattr(actual_output, 'splits', None))
            != line_numbers(example_output.get('splits'))
        )

        analysis.append({
            'example_id': example_id,
            'expected_type': example_output.get('type'),
            'actual_type': actual_type,
            'type_mismatch': type_mismatch,
            'splits_mismatch': splits_mismatch,
            'example_input': example_input,
            'example_output': example_output,
            'actual_output': actual_output
        })

    descriptions = "\n\n".join([
        f"ID: {a['example_id']}\n"
        f"Input: {preview(a['example_input'])}...\n"
        f"Expected: {a['expected_type']}\n"
        f"Actual: {a['actual_type']}\n"
        f"Type Mismatch: {a['type_mismatch']}\n"
        f"Splits Mismatch: {a['splits_mismatch']}\n"
        f"Expected Output: {a['example_output']}\n"
        f"Actual Output: {a['actual_output']}\n"
        for a in analysis if a['type_mismatch'] or a['splits_mismatch']
    ])

    if not descriptions:
        # Without a single failing example the model would rewrite the prompt
        # blindly, so keep the current prompt.
        print("No mismatching examples to learn from; keeping the current prompt.")
        return current_prompt

    update_prompt = f"""
Analyze the following examples and improve the prompt to enhance document categorization accuracy:

{descriptions}

Current Prompt:
{current_prompt}

Provide only the updated prompt:
"""
    model = AzureChatOpenAI(deployment_name="gpt-4o")
    updated_prompt = model.invoke(update_prompt)

    summary_prompt = f"""
Summarize key changes between:

Original:
{current_prompt}

Updated:
{updated_prompt.content}
"""
    summary = model.invoke(summary_prompt)

    # ANSI colours: blue for the change summary, green for the new prompt.
    print("\033[94mChange Summary:\033[0m")
    print("\033[94m" + summary.content + "\033[0m")

    print("\033[92mUpdated Prompt:\033[0m")
    print("\033[92m" + updated_prompt.content + "\033[0m")

    return updated_prompt.content

# Request one rewritten prompt from the collected deviations and print it.
current_prompt = prompt_template_string
updated_prompt = generate_updated_prompt(deviations, current_prompt)
print("Updated Prompt:")
print(updated_prompt)


In [None]:
# The loop below needs refinement: after some rounds of iteration it throws an error (see the traceback in the next cell).

In [None]:
#   File "/var/folders/_x/18_1sczx5634lnky__ljb3900000gn/T/ipykernel_13744/2123409029.py", line 6, in evaluate_split_count
#     if outputs.type != 'NOT_RELATED_TOPICS' or example_output['type'] != 'NOT_RELATED_TOPICS':
#        ^^^^^^^^^^^^
# AttributeError: 'NoneType' object has no attribute 'type'
# Error running evaluator <DynamicRunEvaluator compound_metric> on run 45bbb681-d7b6-4e2e-8587-f21e50ee5374: AttributeError("'NoneType' object has no attribute 'type'")
# Traceback (most recent call last):
#   File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/_runner.py", line 1344, in _run_evaluators
#     evaluator_response = evaluator.evaluate_run(
#                          ^^^^^^^^^^^^^^^^^^^^^^^
#   File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/evaluation/evaluator.py", line 327, in evaluate_run
#     result = self.func(
#              ^^^^^^^^^^
#   File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/run_helpers.py", line 646, in wrapper
#     raise e
#   File "/Users/pascal/neurapolis/evals/.venv/lib/python3.12/site-packages/langsmith/run_helpers.py", line 643, in wrapper
#     function_result = run_container["context"].run(func, *args, **kwargs)
#                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#   File "/var/folders/_x/18_1sczx5634lnky__ljb3900000gn/T/ipykernel_13744/3569692088.py", line 2, in compound_metric
#     type_score = type_comparator(root_run, example)['score']
#                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#   File "/var/folders/_x/18_1sczx5634lnky__ljb3900000gn/T/ipykernel_13744/4164126165.py", line 32, in type_comparator
#     if outputs['output'].type == example_output['type']:
#        ^^^^^^^^^^^^^^^^^^^^^^
# AttributeError: 'NoneType' object has no attribute 'type'
# 55it [01:03,  1.15s/it]




In [None]:
current_template = prompt_template_string
version = 1
desired_threshold = 0.95  # Set your desired compound metric threshold
max_versions = 10  # Hard stop so a prompt that never reaches the threshold cannot loop forever

while version <= max_versions:
    # Instantiate the prompt and LLM
    prompt_template = PromptTemplate.from_template(current_template)
    chain = prompt_template | AzureChatOpenAI(deployment_name="gpt-4o").with_structured_output(FileSectionizerLlmDataModel)

    # Run evaluations
    result = evaluate(
        lambda x: chain.invoke(x["text_lines"]),
        data=examples,
        evaluators=[type_comparator, evaluate_splits, evaluate_split_count, compound_metric],
        experiment_prefix="example",
        metadata={
            "version": f"1.0.{version}",
            "prompt_revision": f"revision_{version}"
        },
        client=client
    )

    # Materialise the rows once; they are used for scoring, for the deviation
    # analysis and for building the prompt update.
    rows = list(result)

    # Calculate the average compound metric, ignoring entries without a
    # numeric score (a None would make sum() raise TypeError)
    compound_metric_scores = []
    for item in rows:
        evaluation_results = item.get('evaluation_results', {}).get('results', [])
        for eval_result in evaluation_results:
            if eval_result.key == 'compound_metric' and eval_result.score is not None:
                compound_metric_scores.append(eval_result.score)
    average_compound_metric = sum(compound_metric_scores) / len(compound_metric_scores) if compound_metric_scores else 0
    print(f"Average Compound Metric (Version {version}): {average_compound_metric:.4f}")

    # Check if desired threshold is met
    if average_compound_metric >= desired_threshold:
        print("Desired performance achieved.")
        break

    # Analyze deviations
    deviations = analyze_deviations(rows)

    # If no deviations, break the loop
    if not deviations:
        print("No deviations found. Model performance is optimal.")
        break

    # generate_updated_prompt reads the 'example' and 'run' entries of each
    # item, so hand it the original rows of the deviating examples rather than
    # the deviation summaries.
    deviating_ids = {deviation['example_id'] for deviation in deviations}
    deviating_rows = [
        row for row in rows
        if row.get('example') is not None and str(row['example'].id) in deviating_ids
    ]

    # Generate updated prompt automatically
    updated_prompt = generate_updated_prompt(deviating_rows, current_template)

    # The next round fills the template with the document text, so the rewrite
    # must still be a valid template whose only variable is `text`.
    try:
        template_variables = set(PromptTemplate.from_template(updated_prompt).input_variables)
    except ValueError as template_error:
        print(f"Updated prompt is not a valid template ({template_error}). Keeping the previous prompt and stopping.")
        break
    if template_variables != {"text"}:
        print("Updated prompt does not have 'text' as its only template variable. Keeping the previous prompt and stopping.")
        break

    # Update the current template with the new prompt
    current_template = updated_prompt

    version += 1
else:
    print(f"Stopped after {max_versions} prompt versions without reaching the desired threshold.")