In [4]:
from langchain_core.runnables import RunnableParallel

import helper_tools.parser as parser
import importlib
import pandas as pd
# Re-import the parser module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(parser)

# Parse the data selected by the "train" argument. The call returns three
# objects, unpacked here as the relation table, the entity table and the
# documents table.
relation_df, entity_df, docs = parser.redfm_parser("train")

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 4791.30it/s]


In [5]:
# Preview the first rows of the parsed documents table.
docs.head()

Unnamed: 0,docid,text
0,1755846-1,CBS Corporation comprised the over-the-air tel...
1,1755846-2,The second merger between CBS Corporation and ...
2,1701411-0,Club Sportivo Cienciano is a professional foot...
3,1854133-1,It is the seat of a municipality with 203.30 k...
4,1602703-0,Bad Ischl is a spa town in Austria. It lies in...


In [76]:
from dotenv import load_dotenv
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.init_models import init_embedding_model
from langchain_ollama import OllamaEmbeddings



# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Chat model reached through the 'gen-ai-hub' proxy client.
proxy_client = get_proxy_client('gen-ai-hub')
model = ChatOpenAI(
    proxy_model_name='meta--llama3.1-70b-instruct',
    proxy_client=proxy_client,
)

# Embedding model used for the label similarity search.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [77]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Embed a probe string once to learn the vector dimensionality the flat L2
# index has to be created with.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Unique (label, URI) pairs for entities and for predicates.
entity_set = entity_df[['entity', 'entity_uri']].drop_duplicates()
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()

# One Document per unique pair: the label is the searchable content, the URI
# travels along as metadata. Entities come first, then predicates.
# The columns are iterated directly instead of `for index, row in ...iterrows()`,
# which rebound the global `index` (the FAISS index above) to a row label.
documents = [
    Document(page_content=label, metadata={"uri": uri})
    for label, uri in zip(entity_set["entity"], entity_set["entity_uri"])
]
documents += [
    Document(page_content=label, metadata={"uri": uri})
    for label, uri in zip(predicate_set_df["predicate"], predicate_set_df["predicate_uri"])
]

faiss_document_ids = vector_store.add_documents(documents=documents)

In [78]:
from langgraph.types import Command
from typing import Literal, TypedDict
from langgraph.graph import StateGraph, MessagesState, START, END
from langchain_core.prompts import PromptTemplate
import re
from typing import TypedDict
from langchain_core.messages import HumanMessage, AIMessage

class cIEState(TypedDict):
    """State dictionary shared by all nodes of the extraction graph."""
    # Input text; the extraction agents insert it into their prompt as `{text}`.
    text: str
    # Conversation so far; each node returns it with its own output appended.
    messages: list[HumanMessage | AIMessage]
    # Content of the supervisor's <instruction> tag; agent nodes reset it to "".
    instruction: str

# Prompt sent as the first message of every graph run. It is a plain string
# (no f-prefix) because it contains no placeholders. Option 1 names all three
# agents so that it agrees with the agent descriptions and with the routing
# targets the supervisor node can jump to.
system_prompt = """
You are the Supervisor of a conversation among multiple agents.
The conversation is about extracting information (Closed Information Extraction) from a user-provided text. The final output should only contain wikidata URIs instead of the labels of entities and relations. You can provide additional information to the agents using <instruction> tags.

Example Output: <relation>http://www.wikidata.org/entity/Q950380;http://www.wikidata.org/entity/P361;http://www.wikidata.org/entity/Q2576666</relation>

Agent Descriptions:
- entity_extraction_agent: Extracts entities from the text. Instructions can change the extraction behavior and focus of the agent. Do not instruct the agent with URIs. The agent has only access to your instruction and the text.
- relation_extraction_agent: Extracts relations from the text. Instructions can change the prompt of the called agent and can be used to input already extracted entity labels (i.e. <instruction>Please use the already extracted entities: [Olaf Scholz, Germany, Berlin]</instruction>). Do not instruct the agent with URIs. The agent has only access to your instruction and the text.
- uri_detection_agent: Please only use this, after entities and relation were extracted at least once. Returns possible wikidata URIs for entities and predicates based on similarity search. The instruction should be a list of search terms like predicate and entity labels, which the uri detection agent is searching for. For example: "Olaf Scholz, Germany, Berlin, is chancellor of, part of". It is recommended to use the agent at least once to search for the URIs for all possible entities and predicates. Do not instruct the agent with URIs. The agent has only access to your instruction and the text. 

You have two options:
1. Call an agent using <goto>agent_name</goto>. Replace agent_name with one of entity_extraction_agent, relation_extraction_agent or uri_detection_agent. I.e. <goto>entity_extraction_agent</goto>.
2. Finish the conversation using <goto>FINISH</goto>. Please output the final relations in <relation> tags alongside with the <goto> tag.


Note:
- Do not provide any information yourself, instead use the agents for this.
- The first <goto> tag in your response will be executed.
- Therefore, do include exact one agent call in your response.
- If you output nothing, this will result in a NoneType Error.
- Please do not hallucinate any URI.


"""

def supervisor(state: cIEState) -> Command[Literal["entity_extraction_agent", "relation_extraction_agent", "uri_detection_agent", END]]:
    """Ask the model which node runs next and route the graph accordingly.

    The model is invoked with the full message history. Its reply is scanned
    for the first ``<goto>…</goto>`` tag (routing target) and the first
    ``<instruction>…</instruction>`` tag (text handed to the next agent).

    Args:
        state: Current graph state; only ``state["messages"]`` is read.

    Returns:
        A ``Command`` whose ``goto`` is one of the agent node names, ``END``
        when the model answered ``FINISH``, or ``"supervisor"`` when no usable
        target was found (the model is asked again). The update appends the
        model reply to ``messages`` and stores the parsed instruction
        (``""`` if none was given).
    """
    response = model.invoke(state["messages"])

    print("-- START OF OUTPUT (supervisor) --\n\n", response.content, "\n\n-- END OF OUTPUT --\n\n")

    known_agents = ("entity_extraction_agent", "relation_extraction_agent", "uri_detection_agent")

    # DOTALL + strip(): tolerate whitespace or line breaks inside the tag,
    # which would otherwise produce a node name that does not exist.
    goto_match = re.search(r'<goto>(.*?)</goto>', response.content, re.DOTALL)
    goto = goto_match.group(1).strip() if goto_match else ""
    if goto == "FINISH":
        goto = END
    elif goto not in known_agents:
        # Missing or unknown target: give the supervisor another turn instead
        # of jumping to a node the graph does not have.
        goto = "supervisor"

    # DOTALL so that an instruction spanning several lines is not dropped.
    instruction_match = re.search(r'<instruction>(.*?)</instruction>', response.content, re.DOTALL)
    instruction = instruction_match.group(1).strip() if instruction_match else ""

    return Command(goto=goto, update={"messages": state["messages"] + [response], "instruction": instruction})

def entity_extraction_agent(state: cIEState) -> Command[Literal["supervisor"]]:
    """Run the entity-extraction prompt on the state's text and return to the supervisor.

    Args:
        state: Current graph state; ``text`` and ``instruction`` fill the
            prompt, ``messages`` is extended with the model reply.

    Returns:
        A ``Command`` routing to ``"supervisor"`` that appends the reply to
        ``messages`` and resets ``instruction`` to ``""``.
    """
    extraction_prompt = PromptTemplate.from_template("""
    You are an agent tasked with extracting entities from a given text for linking to a knowledge graph. Your job is to capture every entity—both explicit and implicit—and return them as an array. This includes composite entities with modifiers (e.g., "professional football club"). Please output the entities as an array of strings. Do not include any further information or comment in the output.
    
    Example Output: [Olaf Scholz, chancellor, Germany]
    
    Guidelines:
    - An entity is a unique real-world object or concept represented as a node with its properties and relationships.
    - Extract every entity mentioned in the text, including those that are not immediately obvious.
    - For composite entities, include the full descriptive phrase and break it into its core components when appropriate. For example, "chancellor of Germany" should yield [chancellor, Germany] and "professional football club" should capture the descriptive phrase as needed.
    - For composite entities that include a date at the beginning or end, extract the date separately, the entity without the date, and the full composite (e.g., "2022 Winter Olympics" should result in [2022, 2022 Winter Olympics, Winter Olympics]).
    - Also, ensure that dates are extracted as entities.
    
    Instruction: {instruction}
    
    Text: {text}
    """)
    prompt_inputs = {"text": state["text"], "instruction": state["instruction"]}
    # Pipe the filled prompt straight into the chat model.
    agent_reply = (extraction_prompt | model).invoke(prompt_inputs)

    print(f"-- START OF OUTPUT (entity_extraction_agent) --\n\n", agent_reply.content, "\n\n-- END OF OUTPUT --\n\n")

    history = state["messages"] + [agent_reply]
    return Command(goto="supervisor", update={"messages": history, "instruction": ""})

def relation_extraction_agent(state: cIEState) -> Command[Literal["supervisor"]]:
    """Run the relation-extraction prompt on the state's text and return to the supervisor.

    Args:
        state: Current graph state; ``text`` and ``instruction`` fill the
            prompt, ``messages`` is extended with the model reply.

    Returns:
        A ``Command`` routing to ``"supervisor"`` that appends the reply to
        ``messages`` and resets ``instruction`` to ``""``.
    """
    relation_prompt = PromptTemplate.from_template(
        """
        You are a relation extraction agent. Your task is to analyze the provided text and extract all semantic relations present. Each relation must be output in the exact format:
        
        <relation>subject;predicate;object</relation>
        
        (For example: <relation>Olaf Scholz;is chancellor of;Germany</relation>).
        
        Guidelines:
        - **Extraction Scope:** Extract only the relations explicitly mentioned in the text. Additionally, if the text implies a relation or if a relation can be inferred using the provided entity list, include that relation.
        - **Utilize Provided Entities:** Use the provided list of extracted entities to ensure that all relevant relations are captured. For example, if "technology" is in the list and the text indicates that the subject is a technology company, you must output: <relation>Apple;industry;technology</relation>.
        - **Attribute Relations:** If an entity is described by a characteristic or category (e.g., renowned film director, prestigious university), automatically extract the corresponding attribute relation. For example, if the text states "Steven Spielberg is a renowned film director", extract: <relation>Steven Spielberg;profession;film director</relation>.
        - **Formatting:** Each relation must strictly follow the format <relation>subject;predicate;object</relation> with no additional text or commentary.
        - **Accuracy:** Only include relations that are clearly supported by the text or can be confidently inferred using the provided entity list.
        
        Instruction: {instruction}
        
        Text: {text}
        """
    )
    prompt_inputs = {"text": state["text"], "instruction": state["instruction"]}
    # Pipe the filled prompt straight into the chat model.
    agent_reply = (relation_prompt | model).invoke(prompt_inputs)

    print(f"-- START OF OUTPUT (relation_extraction_agent) --\n\n", agent_reply.content, "\n\n-- END OF OUTPUT --\n\n")

    history = state["messages"] + [agent_reply]
    return Command(goto="supervisor", update={"messages": history, "instruction": ""})

def uri_detection_agent(state: cIEState) -> Command[Literal["supervisor"]]:
    """Look up the closest label/URI in the vector store for each search term.

    The supervisor's instruction is treated as a comma-separated list of
    search terms. Each non-empty term is looked up with a top-1 similarity
    search and the hit is reported as ``{"label": ..., "uri": ...}``.

    Args:
        state: Current graph state; ``instruction`` supplies the search terms,
            ``messages`` is extended with the lookup report.

    Returns:
        A ``Command`` routing to ``"supervisor"`` that appends the report to
        ``messages`` and resets ``instruction`` to ``""``.
    """
    # Strip each term: "a, b" would otherwise search for " b" with a leading
    # space. Empty terms (e.g. from an empty instruction) are skipped.
    search_terms = [term.strip() for term in state["instruction"].split(",")]
    response = ""
    for term in search_terms:
        if not term:
            continue
        hits = [
            {"label": doc.page_content, "uri": doc.metadata["uri"]}
            for doc in vector_store.similarity_search(term, search_type="similarity", k=1)
        ]
        response += f'Detection Result for {term}:{hits}\n\n'
    response = response.replace("},", "},\n")
    if not response:
        # Tell the supervisor why nothing came back instead of appending an
        # empty message to the conversation.
        response = "No search terms were provided in <instruction> tags.\n\n"
    # Wrap the report in a message object so the history matches the declared
    # type of `messages` rather than holding a bare string.
    return Command(goto="supervisor", update={"messages": state["messages"] + [HumanMessage(content=response)], "instruction": ""})
    
# Assemble the graph over the shared state. Each node is registered by passing
# the function itself; routing between nodes is done by the Command objects
# the nodes return, so the only explicit edge is the entry into the supervisor.
builder = StateGraph(cIEState)
for node_fn in (supervisor, entity_extraction_agent, relation_extraction_agent, uri_detection_agent):
    builder.add_node(node_fn)

builder.add_edge(START, "supervisor")

graph = builder.compile()

In [79]:
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

load_dotenv()

# Langfuse callback handler, configured from environment variables.
langfuse_handler = CallbackHandler(
    host=os.getenv("LANGFUSE_HOST"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
)

# Run the graph on one document (positional row 2). The conversation starts
# with the supervisor prompt followed by the document text.
target_doc = docs.iloc[2]
doc_id = target_doc["docid"]
response = graph.invoke(
    {
        "text": target_doc["text"],
        "messages": [system_prompt, target_doc["text"]],
        "instruction": "",
    },
    config={"callbacks": [langfuse_handler]},
)

-- START OF OUTPUT (supervisor) --

 <goto>entity_extraction_agent</goto> 

-- END OF OUTPUT --


-- START OF OUTPUT (entity_extraction_agent) --

 [Club Sportivo Cienciano, professional football club, football club, Cusco, Peru, Ciencias y Artes School, science, Spanish, River Plate, Argentina, Copa Sudamericana, Boca Juniors, Recopa Sudamericana, 1901, 2003, 2003 Copa Sudamericana, 2004, 2004 Recopa Sudamericana] 

-- END OF OUTPUT --


-- START OF OUTPUT (supervisor) --

 <instruction>Please use the already extracted entities: [Club Sportivo Cienciano, professional football club, Cusco, Peru, River Plate, Argentina, Copa Sudamericana, Boca Juniors, Recopa Sudamericana]</instruction>
<goto>relation_extraction_agent</goto> 

-- END OF OUTPUT --


-- START OF OUTPUT (relation_extraction_agent) --

 Here are the extracted relations:

<relation>Club Sportivo Cienciano;is;professional football club</relation>
<relation>Club Sportivo Cienciano;located in;Cusco</relation>
<relation>Club Spo

In [80]:
def get_uri_labels(df):
    """Append human-readable labels to a frame of URI triples.

    Subject and object URIs are resolved through the notebook-level
    ``entity_set``; predicate URIs through ``predicate_set_df``. When a URI
    has several labels the first one in the lookup table is used. URIs
    without a match are labelled ``"Unknown"``. Object values containing
    ``"^^"`` are copied through unchanged instead of being looked up.

    Args:
        df: DataFrame with ``subject_uri``, ``predicate_uri`` and
            ``object_uri`` columns.

    Returns:
        A new DataFrame: ``df`` with its index reset, plus the columns
        ``subject``, ``predicate`` and ``object``.
    """
    # Build URI -> label maps once instead of scanning the lookup frames with
    # a boolean mask for every row. drop_duplicates keeps the first label per
    # URI, which is the one `.values[0]` picked before. Rows without a URI are
    # dropped so a missing value can never match a key.
    entity_labels = (
        entity_set.dropna(subset=["entity_uri"])
        .drop_duplicates("entity_uri")
        .set_index("entity_uri")["entity"]
        .to_dict()
    )
    predicate_labels = (
        predicate_set_df.dropna(subset=["predicate_uri"])
        .drop_duplicates("predicate_uri")
        .set_index("predicate_uri")["predicate"]
        .to_dict()
    )

    def _object_label(uri):
        # The isinstance guard keeps non-string values (None, NaN) from raising
        # TypeError on the `in` test; they fall through to "Unknown".
        if isinstance(uri, str) and "^^" in uri:
            return uri
        return entity_labels.get(uri, "Unknown")

    labels = pd.DataFrame({
        "subject": [entity_labels.get(uri, "Unknown") for uri in df["subject_uri"]],
        "predicate": [predicate_labels.get(uri, "Unknown") for uri in df["predicate_uri"]],
        "object": [_object_label(uri) for uri in df["object_uri"]],
    })
    return pd.concat([df.reset_index(drop=True), labels], axis=1)

In [81]:
# Collect the <relation>…</relation> triples from the last message of the run
# and split each one on ";" into subject / predicate / object.
relation_list = [
    triple.split(";")
    for triple in re.findall(r"<relation>(.*?)</relation>", response["messages"][-1].content)
]
pred_relation_df = pd.DataFrame(
    relation_list,
    columns=["subject_uri", "predicate_uri", "object_uri"],
).drop_duplicates()
get_uri_labels(pred_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P276,http://www.wikidata.org/entity/Q5582862,Cienciano,location,Cusco
1,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P276,http://www.wikidata.org/entity/Q419,Cienciano,location,Unknown
2,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P127,http://www.wikidata.org/entity/Q60585,Cienciano,owned by,Copa Sudamericana
3,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P127,http://www.wikidata.org/entity/Q4603244,Cienciano,owned by,2004 Recopa Sudamericana
4,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P361,http://www.wikidata.org/entity/Q602482,Cienciano,part of,Cienciano
5,http://www.wikidata.org/entity/Q15799,http://www.wikidata.org/entity/P361,http://www.wikidata.org/entity/Q414,River Plate,part of,Argentina
6,http://www.wikidata.org/entity/Q170703,http://www.wikidata.org/entity/P361,http://www.wikidata.org/entity/Q414,Boca Juniors,part of,Argentina


In [82]:
# Gold triples of the current document, reduced to the three URI columns.
doc_relation_df = relation_df.loc[
    relation_df["docid"] == doc_id,
    ["subject_uri", "predicate_uri", "object_uri"],
]
get_uri_labels(doc_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P641,http://www.wikidata.org/entity/Q2736,Cienciano,sport,football
1,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P159,http://www.wikidata.org/entity/Q5582862,Cienciano,headquarters location,Cusco
2,http://www.wikidata.org/entity/Q602482,http://www.wikidata.org/entity/P571,1901-01-01T00:00:00Z^^http://www.w3.org/2001/X...,Cienciano,inception,1901-01-01T00:00:00Z^^http://www.w3.org/2001/X...
3,http://www.wikidata.org/entity/Q15799,http://www.wikidata.org/entity/P17,http://www.wikidata.org/entity/Q414,River Plate,country,Argentina


In [83]:
# Predicted triples that also occur in the gold set (exact match on all three URIs).
correct_relation_df = pred_relation_df.merge(
    doc_relation_df[["subject_uri", "predicate_uri", "object_uri"]],
    how="inner",
    on=["subject_uri", "predicate_uri", "object_uri"],
)
correct_relation_df

Unnamed: 0,subject_uri,predicate_uri,object_uri


In [84]:
def evaluate(pred_relation_df, doc_id, verbose=False):
    """Score predicted triples against the gold triples of one document.

    A prediction counts as correct when subject, predicate and object URI all
    match a gold row for ``doc_id`` in the notebook-level ``relation_df``.

    Args:
        pred_relation_df: DataFrame with ``subject_uri``, ``predicate_uri``
            and ``object_uri`` columns holding the predicted triples.
        doc_id: Value of ``docid`` selecting the gold triples.
        verbose: When true, print the three scores.

    Returns:
        Tuple ``(precision, recall, f1_score)`` of floats. Precision is 0.0
        when there are no predictions, recall is 0.0 when there are no gold
        triples, and F1 is 0.0 when both are 0.
    """
    triple_cols = ["subject_uri", "predicate_uri", "object_uri"]
    doc_relation_df = relation_df[relation_df["docid"] == doc_id][triple_cols]
    correct_relation_df = pred_relation_df.merge(doc_relation_df[triple_cols], on=triple_cols, how="inner")

    n_correct = len(correct_relation_df)
    # Guard the denominators: a run that extracted nothing, or a document with
    # no gold triples, would otherwise raise ZeroDivisionError.
    precision = n_correct / len(pred_relation_df) if len(pred_relation_df) else 0.0
    recall = n_correct / len(doc_relation_df) if len(doc_relation_df) else 0.0
    if precision + recall == 0:
        # Float, so all three scores share one type.
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    if verbose:
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1_score}")

    return precision, recall, f1_score

In [85]:
# Score the predictions of the single-document run and print the metrics.
evaluate(pred_relation_df, doc_id, verbose=True)

Precision: 0.0
Recall: 0.0
F1: 0


(0.0, 0.0, 0)

# Evaluation on Test

In [86]:
# Number of documents to run through the graph in this pass.
MAX_EVAL_DOCS = 4

# Rows are collected in a list and turned into a DataFrame once at the end,
# so `evaluation_df` only ever names the final table.
evaluation_rows = []

# head() limits the run by position. The earlier `if i >= 3: break` compared
# the index *label*, which only counts documents while `docs` has a default
# 0..n-1 index.
# NOTE(review): every graph.invoke issues several model calls and the recorded
# run of this cell ended in a 429 rate-limit error; consider throttling.
for _, target_doc in docs.head(MAX_EVAL_DOCS).iterrows():
    doc_id = target_doc["docid"]
    response = graph.invoke(
        {"text": target_doc["text"], "messages": [system_prompt, target_doc["text"]], "instruction": ""},
        config={"callbacks": [langfuse_handler]},
    )
    relation_list = [x.split(";") for x in re.findall(r"<relation>(.*?)</relation>", response["messages"][-1].content)]
    pred_relation_df = pd.DataFrame(relation_list, columns=["subject_uri", "predicate_uri", "object_uri"]).drop_duplicates()
    evaluation_rows.append([doc_id, *evaluate(pred_relation_df, doc_id, verbose=False)])

evaluation_df = pd.DataFrame(evaluation_rows, columns=["docid", "precision", "recall", "f1_score"])
evaluation_df

-- START OF OUTPUT (supervisor) --

 <goto>entity_extraction_agent</goto> 

-- END OF OUTPUT --


-- START OF OUTPUT (entity_extraction_agent) --

 [CBS Corporation, CBS, The CW, Viacom, over-the-air television, television production and distribution, publishing, pay-cable, recording, entertainment company, CBS Building, Midtown Manhattan, New York City] 

-- END OF OUTPUT --


-- START OF OUTPUT (supervisor) --

 <instruction>Please use the already extracted entities: [CBS Corporation, CBS, The CW, Viacom, over-the-air television, television production and distribution, publishing, pay-cable, recording, entertainment company, CBS Building, Midtown Manhattan, New York City]</instruction> 
<goto>relation_extraction_agent</goto> 

-- END OF OUTPUT --


-- START OF OUTPUT (relation_extraction_agent) --

 Here are the extracted relations:

<relation>CBS Corporation;subsumes;CBS</relation>
<relation>CBS Corporation;subsumes;The CW</relation>
<relation>CBS Corporation;industry;over-the-air t

RateLimitError: Error code: 429 - {'error': 'TooManyRequest', 'message': 'Your request has been rate limited by AI Core. Please try again later.'}