# Setup

In [1]:
import helper_tools.parser as parser
import importlib
import pandas as pd
import warnings

# Silence every Python warning for the rest of the session so library noise does
# not clutter the cell outputs. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore")

# Re-import helper_tools.parser so edits to that module are picked up without
# restarting the kernel.
importlib.reload(parser)

# Parse the "train" split into relation rows, entity rows and the documents.
# The recorded output of this cell also shows entities and predicates being
# uploaded to Qdrant while it runs.
relation_df, entity_df, docs = parser.synthie_parser("train")

# De-duplicated (entity, entity_uri) and (predicate, predicate_uri) pairs.
entity_set = entity_df.loc[:, ["entity", "entity_uri"]].drop_duplicates()
predicate_set_df = relation_df.loc[:, ["predicate", "predicate_uri"]].drop_duplicates()

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 27395.85it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:06<00:00,  6.91it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:04<00:00,  6.79it/s]


In [2]:
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# CallbackHandler keyword -> environment variable that supplies its value.
# os.getenv returns None for a variable that is not set.
langfuse_settings = {
    "secret_key": os.getenv("LANGFUSE_SECRET_KEY"),
    "public_key": os.getenv("LANGFUSE_PUBLIC_KEY"),
    "host": os.getenv("LANGFUSE_HOST"),
}
langfuse_handler = CallbackHandler(**langfuse_settings)

In [3]:
from langgraph.graph import StateGraph, START, END
from approaches.Supervisor.Gen1.setup import cIEState
from approaches.Supervisor.Gen1.agents.agent_instructor import agent as agent_instructor_agent
from approaches.Supervisor.Gen1.agents.entity_extractor import agent as entity_extraction_agent
from approaches.Supervisor.Gen1.agents.relation_extractor import agent as relation_extraction_agent
from approaches.Supervisor.Gen1.agents.uri_detector import agent as uri_detection_agent
from approaches.Supervisor.Gen1.agents.result_checker import agent as result_checker_agent
from approaches.Supervisor.Gen1.agents.result_formatter import agent as result_formatting_agent
from approaches.Supervisor.Gen1.agents.planner import agent as planner

# Node name -> agent callable, listed in the order the nodes are registered.
agent_nodes = {
    "planner": planner,
    "agent_instructor_agent": agent_instructor_agent,
    "entity_extraction_agent": entity_extraction_agent,
    "relation_extraction_agent": relation_extraction_agent,
    "uri_detection_agent": uri_detection_agent,
    "result_checker_agent": result_checker_agent,
    "result_formatting_agent": result_formatting_agent,
}

builder = StateGraph(cIEState)
for node_name, node_agent in agent_nodes.items():
    builder.add_node(node_name, node_agent)

# The planner is the entry point and this is the only edge declared here.
# NOTE(review): how control reaches the other nodes is not visible in this
# cell — presumably the agents choose the next node themselves; confirm in
# approaches.Supervisor.Gen1.
builder.add_edge(START, "planner")

graph = builder.compile()

In [7]:
# Pick the document at position 3 of `docs` as the single-document example and
# keep its id and raw text for the run and the evaluation cells.
target_doc = docs.iloc[3]
doc_id, text = target_doc["docid"], target_doc["text"]
text

'Lambda Mensae is a star in the constellation Mensa. It was discovered by Nicolas-Louis de Lacaille, and named after Table Mountain in South Africa. Mensa shares borders with Chamaeleon and Hydrus.'

In [8]:
# Run the compiled graph on `text`, starting from empty result / call-trace /
# comment lists with debug off. Langfuse receives the callbacks and the
# recursion limit is raised to 100.
response_state = graph.invoke(
    {
        "text": text,
        "results": [],
        "call_trace": [],
        "comments": [],
        "debug": False,
    },
    config={
        "callbacks": [langfuse_handler],
        "recursion_limit": 100,
    },
)

# Pretty Print Response State

In [7]:
# Header: the input text, a blank line, then the "Results:" label.
print(f"cIE for text: {response_state['text']}\n\nResults:")

# Each call-trace entry holds the agent id at index 0 and its instruction at
# index 1; the result at the same position in `results` is printed with it.
for step, trace_entry in enumerate(response_state["call_trace"]):
    print(f"Agent ID: {trace_entry[0]}")
    print(f"Instruction: {trace_entry[1]}")
    print(f"Result: {response_state['results'][step]}\n\n")

# Finally list the comments stored in the state, one per line.
print("Agent Comments:")
for agent_comment in response_state["comments"]:
    print(agent_comment)

cIE for text: Ricardo Lumengo is a Swiss politician. He was born in Fribourg and lives in Biel/Bienne. He works in Bern and speaks the Kongo language.

Results:
Agent ID: entity_extraction_agent
Instruction: Extract entities from the given text.
Result: -- Entity Extraction Agent --
    
Based on the provided text, I will extract the entities as per the instruction. Here is the list of extracted entities:

1. **Person**: Ricardo Lumengo
2. **Nationality**: Swiss
3. **Location**: 
   - Fribourg
   - Biel/Bienne
   - Bern
4. **Language**: Kongo

These entities are extracted from the given text as they represent specific objects, concepts, or individuals mentioned in the text. Let me know if you need further assistance or have any additional instructions. 


Agent ID: relation_extraction_agent
Instruction: Extract relations from the text focusing on the entities: Ricardo Lumengo, Swiss, Fribourg, Biel/Bienne, Bern, and Kongo. Disambiguate any potentially ambiguous relations such as "born 

# Evaluation

In [6]:
from helper_tools.evaluation import parse_turtle, evaluate, get_uri_labels

In [14]:
# Columns that identify one triple; also the join key used below.
triple_columns = ["subject_uri", "predicate_uri", "object_uri"]

# The last entry of `results` is handed to parse_turtle as the Turtle string.
turtle_string = response_state["results"][-1]
# NOTE(review): `error` is not inspected anywhere in the visible cells.
pred_relation_df, error = parse_turtle(turtle_string)

# Gold triples of the current document.
doc_relation_df = relation_df.loc[relation_df["docid"] == doc_id, triple_columns]

# Predicted triples that also appear in the gold set (inner join on all three URIs).
correct_relation_df = pred_relation_df.merge(
    doc_relation_df,
    how="inner",
    on=triple_columns,
)

In [10]:
# Show the predicted triples; the recorded output adds subject / predicate /
# object label columns next to the three URI columns.
get_uri_labels(pred_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P937,http://www.wikidata.org/entity/Q70,Ricardo_Lumengo,work location,Bern
1,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P27,http://www.wikidata.org/entity/Q39,Ricardo_Lumengo,country of citizenship,Switzerland
2,http://www.wikidata.org/entity/Q70,http://www.wikidata.org/entity/P17,http://www.wikidata.org/entity/Q39,Bern,country,Switzerland
3,http://www.wikidata.org/entity/Q1034,http://www.wikidata.org/entity/P17,http://www.wikidata.org/entity/Q39,Biel/Bienne,country,Switzerland
4,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P19,http://www.wikidata.org/entity/Q36378,Ricardo_Lumengo,Unknown,Fribourg
5,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1412,http://www.wikidata.org/entity/Q33702,Ricardo_Lumengo,"languages spoken, written or signed",Kongo_language
6,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P551,http://www.wikidata.org/entity/Q1034,Ricardo_Lumengo,residence,Biel/Bienne
7,http://www.wikidata.org/entity/Q36378,http://www.wikidata.org/entity/P17,http://www.wikidata.org/entity/Q39,Fribourg,country,Switzerland


In [11]:
# Show the gold triples of the current document with the same label columns.
get_uri_labels(doc_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1321,http://www.wikidata.org/entity/Q36378,Ricardo_Lumengo,place of origin (Switzerland),Fribourg
1,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P27,http://www.wikidata.org/entity/Q39,Ricardo_Lumengo,country of citizenship,Switzerland
2,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P551,http://www.wikidata.org/entity/Q1034,Ricardo_Lumengo,residence,Biel/Bienne
3,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P937,http://www.wikidata.org/entity/Q70,Ricardo_Lumengo,work location,Bern
4,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1412,http://www.wikidata.org/entity/Q33702,Ricardo_Lumengo,"languages spoken, written or signed",Kongo_language


In [12]:
# Show the predicted triples that matched a gold triple (the inner-join result).
get_uri_labels(correct_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P937,http://www.wikidata.org/entity/Q70,Ricardo_Lumengo,work location,Bern
1,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P27,http://www.wikidata.org/entity/Q39,Ricardo_Lumengo,country of citizenship,Switzerland
2,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1412,http://www.wikidata.org/entity/Q33702,Ricardo_Lumengo,"languages spoken, written or signed",Kongo_language
3,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P551,http://www.wikidata.org/entity/Q1034,Ricardo_Lumengo,residence,Biel/Bienne


In [15]:
# Score the Turtle output against the gold relations of `doc_id`. The three
# returned values are used as precision, recall and f1_score later in this
# notebook.
evaluate(turtle_string, doc_id, relation_df)

(0.5, 0.8, 0.6153846153846154)

# Evaluation on Multiple Documents (train split)

In [4]:
# Accumulator for per-document scores: a plain list of
# [docid, precision, recall, f1_score] rows.
# NOTE(review): despite the `_df` suffix this is a list until a later cell
# rebinds the same name to a DataFrame.
evaluation_df = []

In [16]:
# Evaluate one document and record its scores in `evaluation_df`.
#
# This cell is meant to be re-run by hand with a different `i` for each
# document, so it has to be safe to re-run in any order:
#   * re-running it for a document that was already scored replaces that
#     document's row instead of adding a duplicate (duplicates would be counted
#     twice in the macro averages);
#   * re-running it after `evaluation_df` has been turned into a DataFrame by
#     the summary cell converts it back into a list of rows first, because
#     list-style `.append` does not work on a DataFrame.

# Position (in `docs`) of the document to evaluate.
i = 9
target_doc = docs.iloc[i]
doc_id = target_doc["docid"]
text = target_doc["text"]
print(f"doc: {doc_id} - text: {text}")

response = graph.invoke(
    {"text": text, "results": [], "call_trace": [], "comments": [], "debug": False},
    config={"callbacks": [langfuse_handler], "recursion_limit": 100},
)

# The last entry of `results` is scored; evaluate() supplies the three values
# stored as precision, recall and f1_score.
doc_scores = [doc_id, *evaluate(response["results"][-1], doc_id, relation_df)]

# Normalise the accumulator back to a list of rows if a later cell already
# converted it to a DataFrame.
if isinstance(evaluation_df, pd.DataFrame):
    evaluation_df = [list(row) for row in evaluation_df.itertuples(index=False, name=None)]

# Drop any earlier result for this document, then record the new one.
evaluation_df = [row for row in evaluation_df if row[0] != doc_id]
evaluation_df.append(doc_scores)

doc: 9 - text: Mohsin Siddiqui is a Pakistani politician who is a member of the Muttahida Qaumi Movement, a secular political party.


In [21]:
# Turn the collected [docid, precision, recall, f1_score] rows into a table.
# NOTE(review): this rebinds `evaluation_df` from a list to a DataFrame.
score_columns = ["docid", "precision", "recall", "f1_score"]
evaluation_df = pd.DataFrame(evaluation_df, columns=score_columns)
evaluation_df

Unnamed: 0,docid,precision,recall,f1_score
0,0,0.5,0.5,0.5
1,1,0.5,0.8,0.615385
2,2,0.25,0.25,0.25
3,4,0.166667,1.0,0.285714
4,5,0.0,0.0,0.0
5,6,0.0,0.0,0.0
6,7,0.2,0.25,0.222222
7,8,0.0,0.0,0.0
8,9,0.5,0.2,0.285714
9,3,0.0,0.0,0.0


In [22]:
# Macro averages: the unweighted mean of each per-document metric column.
for metric_label, metric_column in (
    ("F1", "f1_score"),
    ("Precision", "precision"),
    ("Recall", "recall"),
):
    print(f"{metric_label} (Macro Avg.): {evaluation_df[metric_column].mean()}")

F1 (Macro Avg.): 0.21590354090354089
Precision (Macro Avg.): 0.21166666666666661
Recall (Macro Avg.): 0.3
