# Set up the environment

In [27]:
import helper_tools.parser as parser
import importlib
import pandas as pd

importlib.reload(parser)

relation_df, entity_df, docs = parser.redfm_parser("train")
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 3187.65it/s]


In [28]:
from gen_ai_hub.proxy.langchain import init_embedding_model
from dotenv import load_dotenv
from langchain_sambanova import ChatSambaNovaCloud

load_dotenv()

model = ChatSambaNovaCloud(
    model="Meta-Llama-3.3-70B-Instruct",
    max_tokens=8192
)
embeddings = init_embedding_model('text-embedding-3-large')

In [29]:
target_doc = docs.iloc[2]
doc_id = target_doc["docid"]
text = target_doc["text"]
text

'Club Sportivo Cienciano is a professional football club based in Cusco, Peru. The club was founded in 1901 and was originally the team of the students of the "Ciencias y Artes" School (Ciencias meaning science in Spanish), from which it takes its name. It gained worldwide recognition after defeating River Plate from Argentina in the finals of the 2003 Copa Sudamericana and Boca Juniors in the 2004 Recopa Sudamericana.'

# Supervisor Agent

In [30]:
import re

system_prompt = f"""
You are the Supervisor of a conversation among multiple agents.
The conversation is about extracting information (Closed Information Extraction) from a user-provided text. The final output should only contain URIs isntead of labels or descriptions of entities or relations.

Agent Descriptions:
- entity_extraction_agent: Extracts entities from the text. Can not take any instructions.
- relation_extraction_agent: Extracts relations from the text. Can not take any instructions.
- uri_detection_agent: Detects URIs for entities based on similarity search. The instruction is the search term, which the uri detection agent is searching for.

You have two options:
1. Call an agent using <goto>agent_name</goto>. Replace agent_name with either entity_extraction_agent or relation_extraction_agent. I.e. <goto>entity_extraction_agent</goto>.
2. Finish the conversation using <goto>FINISH</goto>. Please output the final relations in <relation> tags alongside with the <goto> tag.

In addition to the options you can provide additional information to the agents using the <instruction> tag. I.e. <instruction>Search additional for entities that are not obvious.</instruction>.

Note:
- Do not provide any information yourself, instead use the agents for this.
- The first <goto> tag in your response will be executed.
- Therefore, do include exact one agent call in your response.
- If you output nothing, this will result in a NoneType Error.


"""


def supervisor(state):
    # Stream the response and accumulate the tokens
    response = model.invoke(state["messages"])
          
    # Now extract the <goto> tag from the accumulated response
    goto_match = re.search(r'<goto>(.*?)</goto>', response.content)
    if goto_match:
        goto = goto_match.group(1)
    else:
        goto = "supervisor"
        
    instruction_match = re.search(r'<instruction>(.*?)</instruction>', response.content)
    if instruction_match:
        instruction = instruction_match.group(1)
    else:
        instruction = ""
        
    print(f"\n\n-- START OF OUTPUT (SUPERVISOR) --\n\n{response.content}\n\n-- END OF OUTPUT (NEXT: {goto} - INSTRUCTION: {instruction}) --\n\n")
    
    return state["messages"] + [response]

messages = supervisor({"messages": [{"role": "system", "content": system_prompt}, {"role": "user", "content": text}], "text": text})
messages = supervisor({"messages": messages + [{"role": "assistant", "content": "Output of entity_extraction_agent: [Olaf Scholz, chancellor, Germany]"}], "text": text})
messages = supervisor({"messages": messages + [{"role": "assistant", "content": "Output of relation_extraction_agent: <relation>Olaf Scholz;is chancellor of; Germany</relation><relation>Olaf Scholz;has job;chancellor</relation>"}], "text": text})



-- START OF OUTPUT (SUPERVISOR) --

<goto>entity_extraction_agent</goto>
<instruction>Extract all entities from the provided text.</instruction>

-- END OF OUTPUT (NEXT: entity_extraction_agent - INSTRUCTION: Extract all entities from the provided text.) --




-- START OF OUTPUT (SUPERVISOR) --

<goto>uri_detection_agent</goto>
<instruction>Search for URIs of the entities: Club Sportivo Cienciano, Cusco, Peru, Ciencias y Artes, River Plate, Argentina, Copa Sudamericana, Boca Juniors, Recopa Sudamericana, 1901, 2003, 2004</instruction>

-- END OF OUTPUT (NEXT: uri_detection_agent - INSTRUCTION: Search for URIs of the entities: Club Sportivo Cienciano, Cusco, Peru, Ciencias y Artes, River Plate, Argentina, Copa Sudamericana, Boca Juniors, Recopa Sudamericana, 1901, 2003, 2004) --




-- START OF OUTPUT (SUPERVISOR) --

<goto>relation_extraction_agent</goto>
<instruction>Extract relations between entities from the provided text.</instruction>

-- END OF OUTPUT (NEXT: relation_extractio

# URI Detection Agent

In [14]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

documents = []
entity_set = entity_df[['entity', 'entity_uri']].drop_duplicates()
for index, row in entity_set.iterrows():
    documents.append(Document(
        page_content=row["entity"],
        metadata={"uri": row["entity_uri"]},
    ))
    
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()
for index, row in predicate_set_df.iterrows():
    documents.append(Document(
        page_content=row["predicate"],
        metadata={"uri": row["predicate_uri"]},
    ))
    
faiss_document_ids = vector_store.add_documents(documents=documents)

In [21]:
def uri_detection_agent(state):
    search_terms = state["instruction"].split(",")
    response = ""
    for term in search_terms:
        response += f'Detection Result for {term}:{[{"label": doc.page_content, "uri": doc.metadata["uri"], "score": score} for doc, score in vector_store.similarity_search_with_score(term, search_type="similarity", k=3)]}\n\n'
    response = response.replace("},", "},\n")
    return Command(goto="supervisor", update={"messages": state["messages"] + [response], "instruction": ""})

print(uri_detection_agent({"instruction": "Peru, Cusco", "messages": []}).update["messages"][-1])

Detection Result for Peru:[{'label': 'Cusco', 'uri': 'http://www.wikidata.org/entity/Q5582862', 'score': 0.75133276},
 {'label': 'Argentina', 'uri': 'http://www.wikidata.org/entity/Q414', 'score': 1.1657093},
 {'label': 'Austria', 'uri': 'http://www.wikidata.org/entity/Q40', 'score': 1.348447}]

Detection Result for  Cusco:[{'label': 'Cusco', 'uri': 'http://www.wikidata.org/entity/Q5582862', 'score': 0.17139363},
 {'label': 'Cienciano', 'uri': 'http://www.wikidata.org/entity/Q602482', 'score': 1.1785243},
 {'label': 'Argentina', 'uri': 'http://www.wikidata.org/entity/Q414', 'score': 1.3753572}]




# Entity Extraction Agent

In [39]:
from langgraph.types import Command
from langchain_core.prompts import PromptTemplate


def entity_extraction_agent(state):
    prompt_template = PromptTemplate.from_template("""
    You are an agent tasked with extracting entities from a given text for linking to a knowledge graph. Your job is to capture every entity—both explicit and implicit—and return them as an array. This includes composite entities with modifiers (e.g., "professional football club"). Please output the entities as an array of strings. Do not include any further information or comment in the output.
    
    Example Output: [Olaf Scholz, chancellor, Germany]
    
    Guidelines:
    - An entity is a unique real-world object or concept represented as a node with its properties and relationships.
    - Extract every entity mentioned in the text, including those that are not immediately obvious.
    - For composite entities, include the full descriptive phrase and break it into its core components when appropriate. For example, "chancellor of Germany" should yield [chancellor, Germany] and "professional football club" should capture the descriptive phrase as needed.
    - For composite entities that include a date at the beginning or end, extract the date separately, the entity without the date, and the full composite (e.g., "2022 Winter Olympics" should result in [2022, 2022 Winter Olympics, Winter Olympics]).
    - Also, ensure that dates are extracted as entities.
    
    Instruction: {instruction}
    
    Text: {text}
    """)



    chain = prompt_template | model
    
    response = chain.invoke({"text": state["text"], "instruction": state["instruction"]})
    
    print(f"-- START OF OUTPUT (entity_extraction_agent) --\n\n", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response], "instruction": ""})

response = entity_extraction_agent({"text": text, "messages": [], "instruction": "Extract entities from the text."})

text

-- START OF OUTPUT (entity_extraction_agent) --

 [Club Sportivo Cienciano, professional football club, Cusco, Peru, 1901, Ciencias y Artes School, River Plate, Argentina, 2003, Copa Sudamericana, Boca Juniors, 2004, Recopa Sudamericana] 

-- END OF OUTPUT --




'Club Sportivo Cienciano is a professional football club based in Cusco, Peru. The club was founded in 1901 and was originally the team of the students of the "Ciencias y Artes" School (Ciencias meaning science in Spanish), from which it takes its name. It gained worldwide recognition after defeating River Plate from Argentina in the finals of the 2003 Copa Sudamericana and Boca Juniors in the 2004 Recopa Sudamericana.'

In [37]:
sorted(entity_df[entity_df["docid"] == doc_id]["entity"].to_list())

['1901',
 '2003',
 '2004',
 '2004 Recopa Sudamericana',
 'Argentina',
 'Boca Juniors',
 'Cienciano',
 'Copa Sudamericana',
 'Cusco',
 'River Plate',
 'football']

# Relation Extraction Agent

In [13]:
from langgraph.types import Command
from langchain_core.prompts import PromptTemplate

def relation_extraction_agent(state):
    prompt_template = PromptTemplate.from_template(
    """
    You are a relation extraction agent. Your task is to analyze the provided text and extract all semantic relations present. Each relation must be output in the exact format:
    
    <relation>subject;predicate;object</relation>
    
    (For example: <relation>Olaf Scholz;is chancellor of;Germany</relation>).
    
    Guidelines:
    - **Extraction Scope:** Extract only the relations explicitly mentioned in the text. Additionally, if the text implies a relation or if a relation can be inferred using the provided entity list, include that relation.
    - **Utilize Provided Entities:** Use the provided list of extracted entities to ensure that all relevant relations are captured. For example, if "technology" is in the list and the text indicates that the subject is a technology company, you must output: <relation>Apple;industry;technology</relation>.
    - **Attribute Relations:** If an entity is described by a characteristic or category (e.g., renowned film director, prestigious university), automatically extract the corresponding attribute relation. For example, if the text states "Steven Spielberg is a renowned film director", extract: <relation>Steven Spielberg;profession;film director</relation>.
    - **Formatting:** Each relation must strictly follow the format <relation>subject;predicate;object</relation> with no additional text or commentary.
    - **Accuracy:** Only include relations that are clearly supported by the text or can be confidently inferred using the provided entity list.
    
    Instruction: {instruction}
    
    Text: {text}
    """
    )

    chain = prompt_template | model
    response = chain.invoke({"text": state["text"], "instruction": state["instruction"]})
    
    print(f"-- START OF OUTPUT (relation_extraction_agent) --\n\n", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response], "instruction": ""})

response = relation_extraction_agent({"text": text, "messages": [], "instruction": "Please use the already extracted entities: ['1901', '2003','2004','2004 Recopa Sudamericana','Argentina','Boca Juniors','Cienciano','Copa Sudamericana','Cusco','River Plate','football']"})

-- START OF OUTPUT (relation_extraction_agent) --

 <relation>Club Sportivo Cienciano;industry;football</relation>
<relation>Club Sportivo Cienciano;founding year;1901</relation>
<relation>Club Sportivo Cienciano;headquarters;Cusco</relation>
<relation>Club Sportivo Cienciano;location;Peru</relation>
<relation>Ciencias y Artes School;type;school</relation>
<relation>Ciencias y Artes School;science category;science</relation>
<relation>Club Sportivo Cienciano;team;students</relation>
<relation>Club Sportivo Cienciano;tournament;2003 Copa Sudamericana</relation>
<relation>Club Sportivo Cienciano;tournament;2004 Recopa Sudamericana</relation>
<relation>Club Sportivo Cienciano;opponent;River Plate</relation>
<relation>Club Sportivo Cienciano;opponent country;Argentina</relation>
<relation>Club Sportivo Cienciano;match outcome;defeated</relation>
<relation>River Plate;country;Argentina</relation>
<relation>Club Sportivo Cienciano;opponent;Boca Juniors</relation>
<relation>Boca Juniors;final

In [11]:
relation_df[relation_df["docid"] == doc_id][["subject", "predicate", "object"]]

Unnamed: 0,subject,predicate,object
3,Cienciano,sport,football
4,Cienciano,headquarters location,Cusco
5,Cienciano,inception,1901
6,River Plate,country,Argentina
