# Setup

In [1]:
import helper_tools.parser as parser
import importlib
import pandas as pd

# Re-import the parser module so edits to helper_tools/parser.py are picked up
# without restarting the kernel.
importlib.reload(parser)

# Load the "train" split: relations, entities and the raw documents.
# NOTE(review): judging by the recorded cell output this call may also upload
# entities/predicates to Qdrant — confirm in helper_tools.parser.
relation_df, entity_df, docs = parser.synthie_parser("train")

# Unique (label, URI) pairs; used further down to turn URIs back into labels.
entity_set = entity_df.loc[:, ["entity", "entity_uri"]].drop_duplicates()
predicate_set_df = relation_df.loc[:, ["predicate", "predicate_uri"]].drop_duplicates()

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 2970.68it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:05<00:00,  7.98it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:03<00:00,  8.91it/s]


In [7]:
from langchain_openai import ChatOpenAI
from langchain_ollama.embeddings import OllamaEmbeddings
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Credentials are read from the environment (populated from a .env file), never hard-coded.
load_dotenv()

# Langfuse callback used to trace the LLM calls.
langfuse_handler = CallbackHandler(
    host=os.getenv("LANGFUSE_HOST"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
)

# Chat model served through SambaNova's OpenAI-compatible endpoint.
model = ChatOpenAI(
    model_name="Meta-Llama-3.3-70B-Instruct",
    base_url="https://api.sambanova.ai/v1",
    api_key=os.getenv("SAMBANOVA_API_KEY"),
)

# Embedding model used by the Qdrant vector stores.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [2]:
# Work on the second document of the loaded split; its id is needed later to
# look up the gold triples during evaluation.
target_doc = docs.iloc[1]
doc_id, text = target_doc["docid"], target_doc["text"]
text

'Ricardo Lumengo is a Swiss politician. He was born in Fribourg and lives in Biel/Bienne. He works in Bern and speaks the Kongo language.'

In [17]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# One shared client for the local Qdrant instance.
client = QdrantClient("localhost", port=6333)

# Two stores over the same client and embedding model: one collection holds the
# labels, the other the descriptions.
label_vector_store, description_vector_store = (
    QdrantVectorStore(client=client, collection_name=collection, embedding=embeddings)
    for collection in ("wikidata_labels", "wikidata_descriptions")
)

In [67]:
from langgraph.types import Command
from typing import TypedDict, Literal
from langchain_core.prompts import PromptTemplate
import re
from langchain_core.messages import AIMessage
from langgraph.graph import StateGraph, MessagesState, START, END

class cIEState(TypedDict):
    """State shared by all nodes of the closed information extraction graph."""

    # Input text that should be turned into triples.
    text: str
    # One (agent_id, instruction) pair per call issued by the agent instructor.
    call_trace: list[tuple[str, str]]
    # Intermediate agent outputs in call order; the last entry is the final Turtle.
    results: list[str]
    # LLM replies of the planner and the result checker.
    comments: list[AIMessage]
    # Instruction for the agent that runs next; reset to "" once it was consumed.
    instruction: str
    
def planner(state: cIEState) -> Command[Literal["agent_instructor_agent", END]]:
    """Ask the LLM for a plan (or the next step) and hand over to the agent instructor.

    The reply is appended to ``comments``. If it contains the ``<FINISH_MAS>``
    marker the graph is ended instead; that target is now part of the return
    annotation so the declared routes match the real ones.

    NOTE(review): the prompt never asks the model to emit ``<FINISH_MAS>``, so
    the early exit only triggers if the model produces the marker on its own.

    Args:
        state: Current graph state; ``call_trace``, ``comments``, ``text`` and
            ``results`` are rendered into the prompt.

    Returns:
        Command routing to ``agent_instructor_agent`` (or ``END``) with the
        updated ``comments`` list.
    """
    prompt = PromptTemplate.from_template("""
    You are an expert in planning and executing tasks within multi-agent systems. Your role is to design and refine a detailed plan that processes a given text into a triple format, specifically for closed information extraction using an underlying Knowledge Graph. You design the plan for the agent instructor agent, which should execute your plan, call and instruct agents. It is only able to execute one step at a time. Your plan must be based on the following inputs:
    - Agent Call Trace
    - Agent Comments
    - The provided input text
    - All intermediate results produced during the process
    
    For executing the tasks, you can include the following agents in the plan:
    - **Entity Extraction Agent:** Can extract entities from the text.
    - **Relation Extraction Agent:** Can extract relations from the text.
    - **URI Detection Agent:** Based on search terms, can determine if there is an associated entity or relation in the Knowledge Graph.
    - **Result Formatting Agent:** After executing and iterating over the task, the result formatting agent should be called to summarize the results and output the final triples. Calling this agent will end the processing.
    
    Your plan should clearly outline the steps required to achieve the goal, ensuring that each phase is actionable and verifiable. The plan will be passed to the Agent Instructor, who will execute the steps through a series of Agent Calls. You will be asked to build up a plan, as long as no final result is done. Your response should be precise, structured, and demonstrate deep expertise in orchestrating complex multi-agent systems for closed Information Extraction tasks. Please line up the plan that you have, to accomplish the task. Do not include tasks that are already worked on.
    
    If you are called for the first time write down the full plan. If you are called afterwards just say what the next task is and where in your plan we are.
    
    Please base your plan on the following information:
    
    Agent Call Trace: {call_trace}
    Agent Comments: {comments}
    The provided input text: {text}
    All intermediate results produced during the process: {results}
    """)

    response = (prompt | model).invoke(state)

    # Normally the instructor executes the next step of the plan.
    next_agent = END if "<FINISH_MAS>" in response.content else "agent_instructor_agent"

    return Command(goto=next_agent, update={"comments": state["comments"] + [response]})

def agent_instructor_agent(state: cIEState) -> Command[Literal["entity_extraction_agent", "relation_extraction_agent", "uri_detection_agent", "result_formatting_agent", "agent_instructor_agent"]]:
    """Let the LLM pick the next agent and its instruction, then route to that agent.

    The reply is expected to contain ``<id>...</id>`` and optionally
    ``<instruction>...</instruction>``. Both values are stripped of surrounding
    whitespace. Every call is appended to ``call_trace``.

    If the reply names no agent, or an agent that is not callable from here,
    the instructor is invoked again. In that case a note describing the problem
    is appended to ``results`` so that ``call_trace`` and ``results`` keep the
    same length and the next attempt can see what went wrong. (Previously the
    fallback target was ``"agent_instructor"``, which is not a node of the graph.)

    Args:
        state: Current graph state; ``call_trace``, ``comments``, ``text`` and
            ``results`` are rendered into the prompt.

    Returns:
        Command routing to the selected agent with ``instruction`` and
        ``call_trace`` updated.
    """
    prompt = PromptTemplate.from_template("""
    
     You are an expert for executing plans in multi-agent-systems and instructing agents. You are embedded within such a MAS with the final goal of processing a text into relations. You will receive a plan from a planning agent within the agent comments alongside with the feedback given by the result checker. In addition, you will receive your agent call traces and the text which is being processed. Your task is then to reason, how the next agent should be called. The planner might give you a hint, which agent should be called next. 
     
    You have access on the following agents:
    Entity Extraction Agent
    - id: entity_extraction_agent
    - use of instruction: The use of an instruction is optional. It will be included in the context of the prompt of the agent and can modify the agents behaviour. Please do not include the original text in the prompt.
    - description: Can extract entities from the text.
    - state access on: text, instruction
    
    Relation Extraction Agent
    - id: relation_extraction_agent
    - use of instruction: The use of an instruction is optional. It will be included in the context of the prompt of the agent and can modify the agents behaviour. Please do not include the original text in the prompt. It can be relevant to provide the relation extraction agent with already extracted entities or entities it should focus on.
    - description: Can extract relations from the text.
    - state access on: text, instruction
    
    URI Detection Agent
    - id: uri_detection_agent
    - use of instruction: The use of an instruction is mandatory. The instruction must be a comma separated list of search terms. For each search term include the search mode - either [LABEL] for a search on rdfs:label or [DESCR] for a search on the description of an entity/relation.
    - description: Based on search terms, can determine if there is an associated entity or relation in the Knowledge Graph. The agent will respond with a mapping of search terms to URIs
    - state access on: instruction
    
    Result Formatting Agent
    - id: result_formatting_agent
    - use of instruction: None
    - description: Utilizes the whole state to output the final triples in an appropriate format
    - state access on: call_trace, comments, text, results
     
    Please include in your response exact one agent call using the following agent call structure:
    
    <agent_call>
        <id>AGENT_ID</id>
        <instruction>Put your instructions for the agents here</instruction>
    </agent_call>
    
    
    
    Agent Call Trace: {call_trace}
    Agent Comments: {comments}
    The provided input text: {text}
    All intermediate results produced during the process: {results}
    
    """)

    response = (prompt | model).invoke(state)

    # Agents this node is allowed to hand over to (must be registered node names).
    callable_agents = (
        "entity_extraction_agent",
        "relation_extraction_agent",
        "uri_detection_agent",
        "result_formatting_agent",
    )

    # Strip the parsed values: models often pad tag contents with spaces or
    # newlines, and an id like " uri_detection_agent " is not a valid node name.
    agent_id_match = re.search(r'<id>(.*?)</id>', response.content, re.DOTALL)
    agent_id = agent_id_match.group(1).strip() if agent_id_match else ""

    instruction_match = re.search(r'<instruction>(.*?)</instruction>', response.content, re.DOTALL)
    instruction = instruction_match.group(1).strip() if instruction_match else ""

    call_trace = state["call_trace"] + [(agent_id, instruction)]

    if agent_id not in callable_agents:
        # Retry the instructor instead of jumping to a node that does not exist.
        failure_note = (
            "Output of agent_instructor_agent: The last agent call was invalid "
            f"(requested id: {agent_id!r}). Valid ids are: {', '.join(callable_agents)}."
        )
        return Command(
            goto="agent_instructor_agent",
            update={
                "instruction": "",
                "call_trace": call_trace,
                "results": state["results"] + [failure_note],
            },
        )

    return Command(goto=agent_id, update={"instruction": instruction, "call_trace": call_trace})

def entity_extraction_agent(state: cIEState) -> Command[Literal["result_checker_agent"]]:
    """Extract the entities mentioned in ``state["text"]`` with the LLM.

    The content of the ``<result>`` tag of the reply (or an empty string when
    the tag is missing) is appended to ``results``, the consumed instruction is
    cleared and control moves on to the result checker.
    """
    extraction_prompt = PromptTemplate.from_template("""
    
    You are an expert for entity extraction out of text in a multi-agent-system for closed information extraction. You will receive a text out of the state from which you should extract all entities. In addition, the agent_instructor might give you an instruction, which you should follow. Your task is then to follow the optional instruction as well as this system prompt and return a comma separated list of entities that are in the text, which is enclosed in <result>insert list here</result>. 
    
    The provided input text: {text}
    Instruction: {instruction}
    
    """)

    llm_reply = (extraction_prompt | model).invoke(state)

    # Only the tagged part of the reply is kept.
    tagged = re.search(r'<result>(.*?)</result>', llm_reply.content, re.DOTALL)
    entities = tagged.group(1) if tagged else ""

    return Command(
        goto="result_checker_agent",
        update={
            "instruction": "",
            "results": state["results"] + [f"Output of entity_extraction_agent: {entities}"],
        },
    )

def relation_extraction_agent(state: cIEState) -> Command[Literal["result_checker_agent"]]:
    """Extract candidate triples from ``state["text"]`` with the LLM.

    The content of the ``<result>`` tag of the reply (or an empty string when
    the tag is missing) is appended to ``results``, the consumed instruction is
    cleared and control moves on to the result checker.
    """
    extraction_prompt = PromptTemplate.from_template("""
    
    You are an expert for relation extraction out of text in a multi-agent-system for closed information extraction. You will receive a text out of the state from which you should extract all relation. As closed information extraction uses an underlying knowledge graph, there can be different names for similar predicates. Therefore, extract also alternative predicates, when applicable (i.e. Berlin, located in, Germany -> Berlin, country, Germany). 
     
    In addition, the agent_instructor might give you an instruction, which you should follow. Your task is then to follow the optional instruction as well as this system prompt and return a list of all triples, where each triple is enclosed in <triple> tags and subject, predicate and object are comma separated from each other. Enclose your pure result in <result> tags
    
    The provided input text: {text}
    Instruction: {instruction}
    
    """)

    llm_reply = (extraction_prompt | model).invoke(state)

    # Only the tagged part of the reply is kept.
    tagged = re.search(r'<result>(.*?)</result>', llm_reply.content, re.DOTALL)
    triples = tagged.group(1) if tagged else ""

    return Command(
        goto="result_checker_agent",
        update={
            "instruction": "",
            "results": state["results"] + [f"Output of relation_extraction_agent: {triples}"],
        },
    )

def uri_detection_agent(state: cIEState) -> Command[Literal["result_checker_agent"]]:
    """Look up search terms in the vector stores and let the LLM map them to URIs.

    ``state["instruction"]`` is a comma separated list of search terms, each
    carrying a mode marker: ``[LABEL]`` terms are searched in
    ``label_vector_store``, ``[DESCR]`` terms in ``description_vector_store``
    (three nearest hits each). Terms without a marker are ignored. The raw hits
    are then summarised by the LLM and the summary is appended to ``results``.

    Fixes over the previous version:
    - description hits were announced as "rdfs:label" hits (copy-paste slip),
      hiding from the LLM which index produced them;
    - terms kept the whitespace left over from splitting on "," and were
      embedded as e.g. " Swanage Railway"; empty terms were searched as well.
    """

    def _terms_with(marker):
        # Drop the mode marker and surrounding whitespace; skip empty terms.
        stripped = (
            term.replace(marker, "").strip()
            for term in state["instruction"].split(",")
            if marker in term
        )
        return [term for term in stripped if term]

    def _search_report(index_name, content_key, store, term):
        # One paragraph per search term, listing content and URI of the hits.
        hits = [
            {content_key: doc.page_content, "uri": doc.metadata["uri"]}
            for doc in store.similarity_search(term, k=3)
        ]
        return f"Most Similar {index_name} Search Results for {term}:{hits}\n\n"

    response = "".join(
        [_search_report("rdfs:label", "label", label_vector_store, term) for term in _terms_with("[LABEL]")]
        + [_search_report("description", "description", description_vector_store, term) for term in _terms_with("[DESCR]")]
    )
    # Put every hit on its own line to make the list easier to read for the LLM.
    response = response.replace("},", "},\n")

    prompt_template = PromptTemplate.from_template(
        """
        You are a formatting agent. Your task is to check and format the output of the URI detection tool. The tool will give a response like this:
        Most Similar Detection Result for Olaf Scholz: ('label': Angela Merkel, 'uri': 'http://www.wikidata.org/entity/Q567)
        
        Your task is to check the response and output an overall mapping of search terms to URIs. If something doesn't match, please response the non mapping search term with the advise, that those might not be present in the knowledge graph. Please also leverage the text for identifying the context of the search terms.
        
        Text: {text}
        
        URI detection tool response:
        
        {response}
        """
    )

    chain = prompt_template | model
    result = chain.invoke({"response": response, "text": state["text"]})

    result = f"Output of uri_detection_agent: {result.content}"

    return Command(goto="result_checker_agent", update={"instruction": "", "results": state["results"] + [result]})

def result_checker_agent(state: cIEState) -> Command[Literal["planner"]]:
    """Let the LLM review the progress so far and pass its feedback to the planner.

    The reply is appended to ``comments``; control always returns to ``planner``.
    """
    review_prompt = PromptTemplate.from_template("""
    You are an expert in monitoring multi-agent-systems. In this case you are giving feedback on the process to the planning agent. Therefore, you can see the plans made, as well as agent calls and the history of comments. In addition, you will have access to a text, that should be transformed into triplets, which can be inserted into an underlying knowledge graph. This task often requires multiple iterations to really catch every entity and relation especially those, that are not visible first glimpse. As long as you think the result can be improved, just response with your feedback, which will be processed by the planner in the next step. Really push the result to the edge, what an LLM can do.
    
    Agent Call Trace: {call_trace}
    Agent Comments: {comments}
    The provided input text: {text}
    All intermediate results produced during the process: {results}
    """)

    feedback = (review_prompt | model).invoke(state)

    return Command(
        goto="planner",
        update={"comments": state["comments"] + [feedback]},
    )

def result_formatting_agent(state: cIEState) -> Command[Literal[END]]:
    """Turn the collected state into final Turtle triples and end the graph.

    The Turtle text is appended as the last entry of ``results``.

    The prompt only demands ``<ttl>`` tags when the model adds reasoning, so a
    reply without tags is taken to be plain Turtle and kept as is. Previously
    such a reply was silently replaced by an empty string. The prompt now also
    asks for full IRIs in angle brackets, because bare ``http://...`` tokens
    are not valid Turtle and make the downstream parser fail.

    Args:
        state: Current graph state; ``call_trace``, ``comments``, ``text`` and
            ``results`` are rendered into the prompt.

    Returns:
        Command ending the graph with the Turtle output appended to ``results``.
    """
    prompt = PromptTemplate.from_template("""
    You are an expert in formatting results of multi-agent-systems, which are used for closed information extraction. Therefore, your task is to produce triples in turtle format, that can be inserted in the underlying knowledge graph. Therefore, you will get access to the full state of the multi-agent-system including the full call trace, the comments of the planner and the result checker, the provided input text and all intermediate results. Please note, that the so called relation extraction agent will output more triples than necessary due to prompting. Please reduce the output so, that no triple is a duplicate of another. Please do not extract predicate from the rdf or rdfs namespaces. Please only use the http://www.wikidata.org/entity/ namespace. Write every IRI in full and enclose it in angle brackets (e.g. <http://www.wikidata.org/entity/Q42>), and end every triple with a dot, so that the output is valid turtle.
    
    If you want to incorporate reasoning in your output make sure that you enclose the turtle output in <ttl> tags, so that it can be extracted afterwards.
    
    Agent Call Trace: {call_trace}
    Agent Comments: {comments}
    The provided input text: {text}
    All intermediate results produced during the process: {results}
    """)

    response_chain = prompt | model

    response = response_chain.invoke(state, config={"callbacks": [langfuse_handler]})

    ttl_match = re.search(r'<ttl>(.*?)</ttl>', response.content, re.DOTALL)

    # Untagged reply: per the prompt this is a reply without reasoning, i.e. pure Turtle.
    result = ttl_match.group(1) if ttl_match else response.content

    return Command(goto=END, update={"results": state["results"] + [result.strip()]})

# Assemble the multi-agent graph. Each node is registered under its function
# name; apart from the entry edge, routing is done by the Command objects the
# nodes return.
builder = StateGraph(cIEState)

for node_fn in (
    planner,
    agent_instructor_agent,
    entity_extraction_agent,
    relation_extraction_agent,
    uri_detection_agent,
    result_checker_agent,
    result_formatting_agent,
):
    builder.add_node(node_fn)

# Processing always starts with the planner.
builder.add_edge(START, "planner")

graph = builder.compile()

In [68]:
# Seed every key declared in cIEState (including the initially empty
# instruction) so all nodes see a complete state from the first step on.
response_state = graph.invoke(
    {"text": text, "results": [], "call_trace": [], "comments": [], "instruction": ""},
    config={"callbacks": [langfuse_handler]},
)

# Save response state

In [52]:
import pickle

# Write through a context manager so the file is flushed and closed even if
# pickling fails. Only unpickle this file if it comes from a trusted source.
with open("./state_storage/final.state", "wb") as state_file:
    pickle.dump(response_state, state_file)

# Pretty Print Response State

In [69]:
# Header with the processed text.
print(f"cIE for text: {response_state['text']}\n\nResults:")

# The i-th traced agent call is shown together with the i-th stored result.
for step, (agent_id, instruction) in enumerate(response_state["call_trace"]):
    print(f"Agent ID: {agent_id}\nInstruction: {instruction}")
    print(f"Result: {response_state['results'][step]}\n\n")

# Replies of the planner and the result checker, in the order they were produced.
print("Agent Comments:")
for comment in response_state["comments"]:
    print("\n".join(["-- START OF OUTPUT --", comment.content, " -- END OF OUTPUT --\n"]))

cIE for text: Corfe Castle railway station is a station on the Swanage Railway in the village of Corfe Castle, in the United Kingdom.

Results:
Agent ID: entity_extraction_agent
Instruction: 
Result: Output of entity_extraction_agent: 


Agent ID: entity_extraction_agent
Instruction: Extract entities from the preprocessed text: "Corfe Castle railway station is a station on the Swanage Railway in the village of Corfe Castle, in the United Kingdom."
Result: Output of entity_extraction_agent: Corfe Castle railway station, Swanage Railway, Corfe Castle, United Kingdom


Agent ID: uri_detection_agent
Instruction: Corfe Castle railway station[LABEL], Swanage Railway[LABEL], Corfe Castle[LABEL], United Kingdom[LABEL], located on[DESCR], serves[DESCR]
Result: Output of uri_detection_agent: After analyzing the response from the URI detection tool, I have identified the most relevant mappings between search terms and URIs. Here is the overall mapping:

* Corfe Castle railway station: http://www.

# Evaluation

In [3]:
def get_uri_labels(df):
    """Append human-readable labels to a frame of triple URIs.

    Subject and object URIs are looked up in ``entity_set``, predicate URIs in
    ``predicate_set_df``; if a URI occurs several times in a lookup table, its
    first label is used. Object values containing ``"^^"`` (typed literals) are
    passed through unchanged. Anything that cannot be resolved, including
    missing values such as ``None``/``NaN``, is labelled ``"Unknown"``.

    Args:
        df: DataFrame with the columns ``subject_uri``, ``predicate_uri`` and
            ``object_uri``.

    Returns:
        A new DataFrame: ``df`` with a fresh 0..n-1 index plus the columns
        ``subject``, ``predicate`` and ``object``.
    """
    # Build each URI -> label mapping once instead of filtering the lookup
    # frame for every single row.
    entity_labels = (
        entity_set.drop_duplicates("entity_uri").set_index("entity_uri")["entity"].to_dict()
    )
    predicate_labels = (
        predicate_set_df.drop_duplicates("predicate_uri")
        .set_index("predicate_uri")["predicate"]
        .to_dict()
    )

    def _label(labels, uri):
        # Non-string values (None, NaN) can never name a known resource.
        if not isinstance(uri, str):
            return "Unknown"
        return labels.get(uri, "Unknown")

    def _object_label(uri):
        # Typed literals are kept verbatim; the isinstance guard prevents the
        # TypeError that `"^^" in NaN` would raise.
        if isinstance(uri, str) and "^^" in uri:
            return uri
        return _label(entity_labels, uri)

    labels_df = pd.DataFrame({
        "subject": [_label(entity_labels, uri) for uri in df["subject_uri"]],
        "predicate": [_label(predicate_labels, uri) for uri in df["predicate_uri"]],
        "object": [_object_label(uri) for uri in df["object_uri"]],
    })
    return pd.concat([df.reset_index(drop=True), labels_df], axis=1)

In [71]:
import pandas as pd
from rdflib import Graph, URIRef

TRIPLE_COLUMNS = ["subject_uri", "predicate_uri", "object_uri"]

# Parse the Turtle produced by the result formatting agent (last entry of the
# results). The RDF graph gets its own name so it does not overwrite `graph`,
# the compiled multi-agent graph that the invoke cell needs on a re-run.
rdf_graph = Graph()
rdf_graph.parse(data=response_state["results"][-1], format="turtle")

# One row per predicted triple, every term as a plain string.
final_result = [[str(subj), str(pred), str(obj)] for subj, pred, obj in rdf_graph]

pred_relation_df = pd.DataFrame(final_result, columns=TRIPLE_COLUMNS).drop_duplicates()
# Gold triples of the processed document.
doc_relation_df = relation_df[relation_df["docid"] == doc_id][TRIPLE_COLUMNS]
# Predicted triples that match a gold triple exactly.
correct_relation_df = pred_relation_df.merge(doc_relation_df, on=TRIPLE_COLUMNS, how="inner")

BadSyntax: at line 2 of <>:
Bad syntax (Prefix "http:" not bound) at ^ in:
"b'\n'^b'http://www.wikidata.org/entity/Q5170476 http://www.wikidata.'..."

In [64]:
# Predicted triples together with the labels resolved for their URIs.
get_uri_labels(pred_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q1236511,http://www.wikidata.org/prop/direct/P17,http://www.wikidata.org/entity/Q145,Corfe_Castle,Unknown,United_Kingdom
1,http://www.wikidata.org/entity/Q5170476,http://www.wikidata.org/prop/direct/P276,http://www.wikidata.org/entity/Q7653559,Corfe_Castle_railway_station,Unknown,Swanage_Railway
2,http://www.wikidata.org/entity/Q5170476,http://www.wikidata.org/prop/direct/P276,http://www.wikidata.org/entity/Q1236511,Corfe_Castle_railway_station,Unknown,Corfe_Castle
3,http://www.wikidata.org/entity/Q1236511,http://www.wikidata.org/prop/direct/P276,http://www.wikidata.org/entity/Q145,Corfe_Castle,Unknown,United_Kingdom
4,http://www.wikidata.org/entity/Q5170476,http://www.wikidata.org/prop/direct/P361,http://www.wikidata.org/entity/Q7653559,Corfe_Castle_railway_station,Unknown,Swanage_Railway
5,http://www.wikidata.org/entity/Q1236511,http://www.wikidata.org/prop/direct/P361,http://www.wikidata.org/entity/Q145,Corfe_Castle,Unknown,United_Kingdom


In [7]:
# Gold triples of the selected document together with their labels.
get_uri_labels(doc_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1321,http://www.wikidata.org/entity/Q36378,Ricardo_Lumengo,place of origin (Switzerland),Fribourg
1,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P27,http://www.wikidata.org/entity/Q39,Ricardo_Lumengo,country of citizenship,Switzerland
2,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P551,http://www.wikidata.org/entity/Q1034,Ricardo_Lumengo,residence,Biel/Bienne
3,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P937,http://www.wikidata.org/entity/Q70,Ricardo_Lumengo,work location,Bern
4,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1412,http://www.wikidata.org/entity/Q33702,Ricardo_Lumengo,"languages spoken, written or signed",Kongo_language


In [66]:
# Predicted triples that exactly match a gold triple (empty if nothing matched).
get_uri_labels(correct_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object


In [None]:
def evaluate(pred_relation_df, doc_id, verbose=False):
    """Score predicted triples against the gold triples of one document.

    Both sides are treated as sets of ``(subject_uri, predicate_uri,
    object_uri)`` triples: duplicates are removed first, otherwise the inner
    join would count a match once per duplicate pair and could push the metrics
    above 1. An empty prediction or an empty gold set yields 0.0 for the
    affected metric instead of raising ``ZeroDivisionError``.

    Args:
        pred_relation_df: DataFrame with the columns ``subject_uri``,
            ``predicate_uri`` and ``object_uri``.
        doc_id: Value of ``docid`` selecting the gold triples in ``relation_df``.
        verbose: If true, print the three metrics.

    Returns:
        Tuple ``(precision, recall, f1_score)``.
    """
    triple_cols = ["subject_uri", "predicate_uri", "object_uri"]

    gold_df = relation_df.loc[relation_df["docid"] == doc_id, triple_cols].drop_duplicates()
    pred_df = pred_relation_df[triple_cols].drop_duplicates()
    correct_df = pred_df.merge(gold_df, on=triple_cols, how="inner")

    precision = len(correct_df) / len(pred_df) if len(pred_df) else 0.0
    recall = len(correct_df) / len(gold_df) if len(gold_df) else 0.0
    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    if verbose:
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1_score}")

    return precision, recall, f1_score

In [None]:
# Print and return precision, recall and F1 of the prediction for the selected document.
evaluate(pred_relation_df, doc_id, verbose=True)