# Set up the environment

In [1]:
import helper_tools.parser as parser
import importlib
import pandas as pd

importlib.reload(parser)

relation_df, entity_df, docs = parser.redfm_parser("train")
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 5624.65it/s]


In [2]:
from gen_ai_hub.proxy.langchain import init_embedding_model
from dotenv import load_dotenv
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client

load_dotenv()

proxy_client = get_proxy_client('gen-ai-hub')
model = ChatOpenAI(proxy_model_name='meta--llama3-70b-instruct', proxy_client=proxy_client)
embeddings = init_embedding_model('text-embedding-3-large')

In [3]:
target_doc = docs.iloc[2]
doc_id = target_doc["docid"]
text = target_doc["text"]
text

'Club Sportivo Cienciano is a professional football club based in Cusco, Peru. The club was founded in 1901 and was originally the team of the students of the "Ciencias y Artes" School (Ciencias meaning science in Spanish), from which it takes its name. It gained worldwide recognition after defeating River Plate from Argentina in the finals of the 2003 Copa Sudamericana and Boca Juniors in the 2004 Recopa Sudamericana.'

# Supervisor Agent

In [14]:
import re

system_prompt = f"""
You are the Supervisor of a conversation among multiple agents.
The conversation is about extracting information (Closed Information Extraction) from a user-provided text. The final output should only contain URIs isntead of labels or descriptions of entities or relations.

Agent Descriptions:
- entity_extraction_agent: Extracts entities from the text. Can not take any instructions.
- relation_extraction_agent: Extracts relations from the text. Can not take any instructions.
- uri_detection_agent: Detects URIs for entities based on similarity search. The instruction is the search term, which the uri detection agent is searching for.

You have two options:
1. Call an agent using <goto>agent_name</goto>. Replace agent_name with either entity_extraction_agent or relation_extraction_agent. I.e. <goto>entity_extraction_agent</goto>.
2. Finish the conversation using <goto>FINISH</goto>. Please output the final relations in <relation> tags alongside with the <goto> tag.

In addition to the options you can provide additional information to the agents using the <instruction> tag. I.e. <instruction>Search additional for entities that are not obvious.</instruction>.

Note:
- Do not provide any information yourself, instead use the agents for this.
- The first <goto> tag in your response will be executed.
- Therefore, do include exact one agent call in your response.
- If you output nothing, this will result in a NoneType Error.


"""


def supervisor(state):
    # Stream the response and accumulate the tokens
    response = model.invoke(state["messages"])
          
    # Now extract the <goto> tag from the accumulated response
    goto_match = re.search(r'<goto>(.*?)</goto>', response.content)
    if goto_match:
        goto = goto_match.group(1)
    else:
        goto = "supervisor"
        
    instruction_match = re.search(r'<instruction>(.*?)</instruction>', response.content)
    if instruction_match:
        instruction = instruction_match.group(1)
    else:
        instruction = ""
        
    print(f"\n\n-- START OF OUTPUT (SUPERVISOR) --\n\n{response.content}\n\n-- END OF OUTPUT (NEXT: {goto} - INSTRUCTION: {instruction}) --\n\n")
    
    return state["messages"] + [response]

messages = supervisor({"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": text}], "text": text})
messages = supervisor({"messages": messages + [{"role": "assistant", "content": "Output of entity_extraction_agent: [Olaf Scholz, chancellor, Germany]"}], "text": text})
messages = supervisor({"messages": messages + [{"role": "assistant", "content": "Output of relation_extraction_agent: <relation>Olaf Scholz;is chancellor of; Germany</relation><relation>Olaf Scholz;has job;chancellor</relation>"}], "text": text})



-- START OF OUTPUT (SUPERVISOR) --

<goto>entity_extraction_agent</goto>

-- END OF OUTPUT (NEXT: entity_extraction_agent - INSTRCUTION: ) --




-- START OF OUTPUT (SUPERVISOR) --

<instruction>Olaf Scholz</instruction><goto>uri_detection_agent</goto>

-- END OF OUTPUT (NEXT: uri_detection_agent - INSTRCUTION: Olaf Scholz) --




-- START OF OUTPUT (SUPERVISOR) --

<instruction>chancellor</instruction><goto>uri_detection_agent</goto>

-- END OF OUTPUT (NEXT: uri_detection_agent - INSTRCUTION: chancellor) --




# URI Detection Agent

In [4]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

documents = []
entity_set = entity_df[['entity', 'entity_uri']].drop_duplicates()
for index, row in entity_set.iterrows():
    documents.append(Document(
        page_content=row["entity"],
        metadata={"uri": row["entity_uri"]},
    ))
    
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()
for index, row in predicate_set_df.iterrows():
    documents.append(Document(
        page_content=row["predicate"],
        metadata={"uri": row["predicate_uri"]},
    ))
    
faiss_document_ids = vector_store.add_documents(documents=documents)

['200f9179-a5bb-4d9d-9e3b-d365aaf45647',
 '3fff9238-8c40-4a35-a3dc-ce999e212ba6',
 '5a9df054-83b1-474a-976e-180ba877d3df',
 '0e06fe7f-2a20-4f06-a479-d1e37663f11e',
 'f168453e-c72d-4b9e-84a4-08791d41e230',
 'ed80ca3d-8bc2-4c7d-819f-07c0269315ca',
 '103f7ac7-efee-4050-82e5-4bf325bf6275',
 'aed4317e-9598-4370-b788-36f8622d0c98',
 '4581f1d0-aae1-4a74-b023-e2b2d78b501e',
 '316ee8a8-263b-4819-8995-a471526a4cd7',
 '4124c4f4-952b-4e7a-8373-bb3424853657',
 '4788f180-c8a4-436e-9c4d-b7ed769162ec',
 '104b6652-871a-45e1-9b0b-0d3c66431892',
 'c58b8fc8-61a8-4b5c-be67-cb27f18ded96',
 '0019bc92-d2fd-44b8-8dd4-e9e9d727e5ca',
 'af3445a1-e027-46f4-83c6-4d94bcd1e682',
 'e1c2c699-549b-469b-9bd4-bba8a1b06e41',
 '073f18f8-0bb5-456a-a616-ee4b90541d50',
 '61179e3b-09ef-43a3-82a0-507dfb04508d',
 '3eb3f450-2e25-485a-9640-2ea0de0ebe8b',
 '7bd18465-ea60-437b-a6a6-0252cd2e8e75',
 '709d2572-6d74-476b-81f0-3ccd6f1807ba',
 'e39e356b-39fe-4405-b28e-05990eec03a7',
 '1db81d3c-6552-4056-ab9b-9bba2a2787a8',
 'dda9e061-ed02-

In [21]:
def uri_detection_agent(state):
    search_terms = state["instructions"].split(",")
    response = ""
    for term in search_terms:
        response += f'Search Results for {term}:\n{[{"entity_label": doc.page_content, "uri": doc.metadata["uri"], "similarity_score": score} for doc, score in vector_store.similarity_search_with_score(term, search_type="similarity", k=3)]}\n\n'
    return response.replace("},", "},\n")

print(uri_detection_agent({"instructions": "Strobel, Austria, Linz"}))

Search Results for Strobel:
[{'entity_label': 'Strobl', 'uri': 'Q667278', 'similarity_score': 0.7443207},
 {'entity_label': 'Linz', 'uri': 'Q41329', 'similarity_score': 1.4175475},
 {'entity_label': 'rock', 'uri': 'Q11399', 'similarity_score': 1.4656672}]

Search Results for  Austria:
[{'entity_label': 'Austria', 'uri': 'Q40', 'similarity_score': 0.4953544},
 {'entity_label': 'Upper Austria', 'uri': 'Q41967', 'similarity_score': 0.8636428},
 {'entity_label': 'Austro-Hungarian', 'uri': 'Q28513', 'similarity_score': 0.95185095}]

Search Results for  Linz:
[{'entity_label': 'Linz', 'uri': 'Q41329', 'similarity_score': 0.22136378},
 {'entity_label': 'Upper Austria', 'uri': 'Q41967', 'similarity_score': 1.0135541},
 {'entity_label': 'Graz', 'uri': 'Q13298', 'similarity_score': 1.0828822}]




# Entity Extraction Agent

In [39]:
from langgraph.types import Command
from langchain_core.prompts import PromptTemplate


def entity_extraction_agent(state):
    prompt_template = PromptTemplate.from_template("""
    You are an agent tasked with extracting entities from a given text for linking to a knowledge graph. Your job is to capture every entity—both explicit and implicit—and return them as an array. This includes composite entities with modifiers (e.g., "professional football club"). Please output the entities as an array of strings. Do not include any further information or comment in the output.
    
    Example Output: [Olaf Scholz, chancellor, Germany]
    
    Guidelines:
    - An entity is a unique real-world object or concept represented as a node with its properties and relationships.
    - Extract every entity mentioned in the text, including those that are not immediately obvious.
    - For composite entities, include the full descriptive phrase and break it into its core components when appropriate. For example, "chancellor of Germany" should yield [chancellor, Germany] and "professional football club" should capture the descriptive phrase as needed.
    - For composite entities that include a date at the beginning or end, extract the date separately, the entity without the date, and the full composite (e.g., "2022 Winter Olympics" should result in [2022, 2022 Winter Olympics, Winter Olympics]).
    - Also, ensure that dates are extracted as entities.
    
    Instruction: {instruction}
    
    Text: {text}
    """)



    chain = prompt_template | model
    
    response = chain.invoke({"text": state["text"], "instruction": state["instruction"]})
    
    print(f"-- START OF OUTPUT (entity_extraction_agent) --\n\n", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response], "instruction": ""})

response = entity_extraction_agent({"text": text, "messages": [], "instruction": "Extract entities from the text."})

text

-- START OF OUTPUT (entity_extraction_agent) --

 [Club Sportivo Cienciano, professional football club, Cusco, Peru, 1901, Ciencias y Artes School, River Plate, Argentina, 2003, Copa Sudamericana, Boca Juniors, 2004, Recopa Sudamericana] 

-- END OF OUTPUT --




'Club Sportivo Cienciano is a professional football club based in Cusco, Peru. The club was founded in 1901 and was originally the team of the students of the "Ciencias y Artes" School (Ciencias meaning science in Spanish), from which it takes its name. It gained worldwide recognition after defeating River Plate from Argentina in the finals of the 2003 Copa Sudamericana and Boca Juniors in the 2004 Recopa Sudamericana.'

In [37]:
sorted(entity_df[entity_df["docid"] == doc_id]["entity"].to_list())

['1901',
 '2003',
 '2004',
 '2004 Recopa Sudamericana',
 'Argentina',
 'Boca Juniors',
 'Cienciano',
 'Copa Sudamericana',
 'Cusco',
 'River Plate',
 'football']

# Relation Extraction Agent

In [47]:
def relation_extraction_agent(state):
    prompt_template = PromptTemplate.from_template(
        """
        You are a relation extraction agent. Your task is to read the text of the user message and extract the relations found in the text. Each relation should be written in this exact format: <relation>subject;predicate;object</relation> (e.g.: <relation>Olaf Scholz;is chancellor of;Germany</relation>). Please return only the relations and no other information.
    
        Note: In addition to the explicit relations mentioned in the text, if an entity is described by a characteristic or category (e.g., renowned film director, prestigious university), you must also extract the corresponding attribute relation automatically. For example, if the text states that "Steven Spielberg is a renowned film director", you should extract: <relation>Steven Spielberg;profession;film director</relation>.
    
        Instruction: {instruction}
    
        Text: {text}
        """
    )

    chain = prompt_template | model
    response = chain.invoke({"text": state["text"], "instruction": state["instruction"]})
    
    print(f"-- START OF OUTPUT (relation_extraction_agent) --\n\n", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response], "instruction": ""})

response = relation_extraction_agent({"text": text, "messages": [], "instruction": "Please use the already extracted entities: ['1901', '2003','2004','2004 Recopa Sudamericana','Argentina','Boca Juniors','Cienciano','Copa Sudamericana','Cusco','River Plate','football']"})

-- START OF OUTPUT (relation_extraction_agent) --

 Here are the extracted relations:

<relation>Club Sportivo Cienciano;is based in;Cusco</relation>
<relation>Club Sportivo Cienciano;was founded in;1901</relation>
<relation>Club Sportivo Cienciano;originally was;team of the students of the "Ciencias y Artes" School</relation>
<relation>Ciencias y Artes School;name;Cienciano</relation>
<relation>Club Sportivo Cienciano;defeated;River Plate</relation>
<relation>River Plate;from;Argentina</relation>
<relation>Club Sportivo Cienciano;defeated;Boca Juniors</relation>
<relation>2003 Copa Sudamericana;year;2003</relation>
<relation>2004 Recopa Sudamericana;year;2004</relation>
<relation>Club Sportivo Cienciano;profession;football club</relation>
<relation>River Plate;profession;football</relation>
<relation>Boca Juniors;profession;football</relation> 

-- END OF OUTPUT --




In [44]:
relation_df[relation_df["docid"] == doc_id][["subject", "predicate", "object"]]

Unnamed: 0,subject,predicate,object
3,Cienciano,sport,football
4,Cienciano,headquarters location,Cusco
5,Cienciano,inception,1901
6,River Plate,country,Argentina
