In [3]:
from langchain_core.runnables import RunnableParallel

import helper_tools.parser as parser
import importlib
import pandas as pd
''
importlib.reload(parser)

relation_df, entity_df, docs = parser.redfm_parser("train")

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 9100.25it/s]


In [4]:
docs.head()

Unnamed: 0,docid,text
0,1755846-1,CBS Corporation comprised the over-the-air tel...
1,1755846-2,The second merger between CBS Corporation and ...
2,1701411-0,Club Sportivo Cienciano is a professional foot...
3,1854133-1,It is the seat of a municipality with 203.30 k...
4,1602703-0,Bad Ischl is a spa town in Austria. It lies in...


In [5]:
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()
predicate_set_df

Unnamed: 0,predicate,predicate_uri
0,owned by,P127
1,follows,P155
2,inception,P571
3,sport,P641
4,headquarters location,P159
6,country,P17
7,shares border with,P47
8,located in or next to body of water,P206
10,location,P276
13,spouse,P26


In [6]:
from langgraph.types import Command
from typing import Literal
from langchain_ollama.chat_models import ChatOllama
from langgraph.graph import StateGraph, MessagesState, START, END
from langchain_ollama import OllamaEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

ollama_base_url = os.getenv("OLLAMA_BASE_URL")

model = ChatOllama(base_url=ollama_base_url, model="phi4")
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="nomic-embed-text")

In [7]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

documents = []
entity_set = entity_df[['entity', 'entity_uri']].drop_duplicates()
for index, row in entity_set.iterrows():
    documents.append(Document(
        page_content=row["entity"],
        metadata={"uri": row["entity_uri"]},
    ))
    
vector_store.add_documents(documents=documents)

['c8a89e4a-be40-46f5-8e50-bf71b7aa3957',
 '6d8cee5e-eb56-45e1-b893-998c2a470d17',
 '0b03bb38-f57f-4cb0-81ed-2372eba4bf43',
 'eb78dc1a-ce8f-44c9-8def-2cbddcbc92e3',
 'ef6ca036-7e5a-4797-b107-4cc58594be55',
 'f71b17cd-9b54-43e1-9dd9-2acadd8c1256',
 '9dc15bfc-cb2f-47f2-b251-cb83824122d0',
 '3f2b0f16-dcd0-4058-809a-417329e269da',
 'eef62250-849e-47e8-8927-46cf112d8bc8',
 'c2dac769-3ed5-4d4a-bc27-d4cc38a15ed7',
 '52e84430-de06-492b-835a-f7d263d7aef3',
 '4ae60ef6-e9d8-4599-ba8a-fc876cec93b8',
 'd7e07c24-71e2-4ca8-801c-1fb730da4d50',
 '09945c02-6f0f-4a32-8a6e-1d00bae9381a',
 'e2e28497-b25c-4a05-91ae-a3300709e62a',
 '455d16cd-dc1b-41f6-ae7c-786cee82dc47',
 'ab28bde7-4e73-465e-8fce-70db141f5719',
 '6516751c-e41a-4411-8d09-beec82fc9c1a',
 '929249da-dd07-4300-b946-96731f368ef8',
 '3680bdf4-7c5e-4873-a250-c32824764669',
 '46ec932a-05bf-4a86-9e2c-434f7a583e43',
 '69477235-d892-4c39-8a35-2cd6f14b6340',
 'f9196f7c-c0be-4d64-8e70-20087bd70434',
 '3e56510d-d557-440e-8036-32584192bf0a',
 '658bd91b-10bf-

In [16]:
from typing import Literal, TypedDict
from langgraph.graph import StateGraph, MessagesState, START, END
from langchain_core.prompts import PromptTemplate
import re
from typing import TypedDict
from langchain_core.messages import HumanMessage, AIMessage

class cIEState(TypedDict):
    text: str
    messages: list[HumanMessage | AIMessage]
    instruction: str

members = ["entity_extraction_agent", "relation_extraction_agent"]
# Our team supervisor is an LLM node. It just picks the next agent to process
# and decides when the work is completed
options = members + ["FINISH"]

def supervisor(state: cIEState) -> Command[Literal["entity_extraction_agent", "relation_extraction_agent", END]]:    
    system_prompt = """
You are the Supervisor of a conversation among two agents: entity_extraction_agent and relation_extraction_agent.
The conversation is about extracting information (Closed Information Extraction) from a user-provided text.

You have two options:
1. Call an agent using <goto>agent_name</goto>. Replace agent_name with either entity_extraction_agent or relation_extraction_agent. I.e. <goto>entity_extraction_agent</goto>.
2. Finish the conversation using <goto>FINISH</goto>. Please also output the final result.

Note:
- Do not provide any information yourself, instead use the agents for this.
- The first <goto> tag in your response will be executed.
- Therefore, do include exact one agent call in your response.
- If you output nothing, this will result in a NoneType Error.

"""

    messages = [{"role": "system", "content": system_prompt},] + state["messages"]
    response = model.invoke(messages)
    
    print("SUPERVISOR:", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    goto = re.search(r'<goto>(.*?)</goto>', response.content).group(1)
    if goto == "FINISH":
        goto = END

    return Command(goto=goto, update={"messages": messages + [response]})

def entity_extraction_agent(state: cIEState) -> Command[Literal["supervisor"]]:
    prompt_template = PromptTemplate.from_template("""
        You are an agent to extract entities out of a given text. The entities will be linked to a knowledge graph later. You should return a list of any explicit and implicit entities found in the text. Please output the entities in an array like this: [Olaf Scholz, Germany, Berlin]. Please return only the array.

        Text: {text}
    """)
    chain = prompt_template | model
    response = chain.invoke({"text": state["text"]})
    
    print("AGENT:", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response]})

def relation_extraction_agent(state: cIEState) -> Command[Literal["supervisor"]]:
    prompt_template = PromptTemplate.from_template(
    """
    You are a relation extraction agent. Your task is to read the text of the user message and extract the relations found in the text. Each relation should be written in this exact format: <relation>subject;predicate;object</relation> (e.g.: <relation>Olaf Scholz;is chancellor of;Germany</relation>). Please return only the relations and no other information.
    
    Text: {text}
        """)
    chain = prompt_template | model
    response = chain.invoke({"text": state["text"]})
    
    print("AGENT:", response.content, "\n\n-- END OF OUTPUT --\n\n")
    
    return Command(goto="supervisor", update={"messages": state["messages"] + [response]})
    
builder = StateGraph(cIEState)
builder.add_node(supervisor)
builder.add_node(entity_extraction_agent)
builder.add_node(relation_extraction_agent)

builder.add_edge(START, "supervisor")

graph = builder.compile()

In [17]:
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

load_dotenv()
langfuse_handler = CallbackHandler(
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    host=os.getenv("LANGFUSE_HOST"),
)

graph.invoke({"text": docs.iloc[0]["text"], "messages": [docs.iloc[0]["text"]], "instruction": ""}, config={"callbacks": [langfuse_handler]})

SUPERVISOR: <goto>entity_extraction_agent</goto>
The entity extraction agent should identify key entities such as "CBS Corporation," "over-the-air television" (including "CBS" and "The CW"), "television production and distribution," "publishing," "pay-cable," "recording assets," "first Viacom," "world's eighth largest entertainment company," "revenue," "CBS Building," "Midtown Manhattan," and "New York City." These entities are crucial for understanding the structure, ownership, and components of CBS Corporation as mentioned in the text. 

-- END OF OUTPUT --


AGENT: ["CBS Corporation", "over-the-air television", "The CW", "television production and distribution", "publishing", "pay-cable", "recording assets", "first Viacom", "eighth largest entertainment company", "CBS Building", "Midtown Manhattan", "New York City"] 

-- END OF OUTPUT --


SUPERVISOR:  

-- END OF OUTPUT --




AttributeError: 'NoneType' object has no attribute 'group'

In [18]:
from langchain_core.prompts import PromptTemplate
import re

prompt_template = PromptTemplate.from_template("""
You are an agent to extract entities out of a given text. The entities will be linked to a knowledge graph later. You should return a list of any explicit and implicit entities found in the text. Please enclose the list with <list> and </list> tags and the entities with <entity> and </entity> tags.

Text: {text}
""")

doc_id = docs.iloc[6]["docid"]
text = docs[docs["docid"] == doc_id]["text"]
doc_relation_df = relation_df[relation_df["docid"] == doc_id]
prompt = prompt_template.invoke({"text": text})
output = model.invoke(prompt)
entity_list = re.findall(r'<entity>(.*?)</entity>', output.content)
entity_list = [vector_store.similarity_search(x, k=1)[0] for x in entity_list]
entity_set_df = pd.DataFrame({"entity": [x.page_content for x in entity_list], "entity_uri": [x.metadata["uri"] for x in entity_list]}).drop_duplicates()
entity_set_df

Unnamed: 0,entity,entity_uri
0,Humiliated and Insulted,Q1130546
1,Vremya,Q1959539
5,publishing,Q1756332
6,Fyodor Dostoevsky,Q991
7,1861,1861-01-01T00:00:00Z^^http://www.w3.org/2001/X...


In [17]:
prompt_template = PromptTemplate.from_template(
    """
    
You are a relation extraction agent. Your task is to read a given text (along with a given list of possible entities) and extract the relations found in the text. Each relation should be written in this exact format: <relation>subject;predicate;object</relation>.

Example:

Entity List: 
    entity           entity_uri
0   Olaf Scholz      Q1
1   Germany          Q2

Text:
Olaf Scholz is chancellor of Germany.

Expected Output:
<relation>Olaf Scholz;chancellor of;Germany</relation>

-- Start of input --

Entity Set: 
{entity_set}

Relation Set: 
{relation_set}

Text: 
{text}
    
    """)
prompt = prompt_template.invoke({"text": text, "relation_set": predicate_set_df.to_string(), "entity_set": entity_set_df.to_string()})
output = model.invoke(prompt)
relation_list = re.findall(r'<relation>(.*?)</relation>', output.content)
relation_list = [relation.split(";") for relation in relation_list]
print(relation_list)

[['Fyodor Dostoevsky', 'author of', 'Humiliated and Insulted'], ['Humiliated and Insulted', 'inception', '1861'], ['Humiliated and Insulted', 'published in', 'Vremya']]


In [None]:
pred_relation_df = pd.DataFrame(relation_list, columns=["subject_uri", "predicate_uri", "object_uri"])
pred_relation_df

In [None]:
correct_relation_df = pred_relation_df.merge(doc_relation_df[["subject_uri", "predicate_uri", "object_uri"]], on=["subject_uri", "predicate_uri", "object_uri"], how="inner")
correct_relation_df

In [None]:
print(f"Accuracy: {len(correct_relation_df) / len(pred_relation_df)}")
print(f"Precision: {len(correct_relation_df) / len(doc_relation_df)}")

In [None]:
relation_df