# Setup

In [1]:
import helper_tools.parser as parser
import importlib
import pandas as pd

# Re-import the parser module so local edits are picked up without a kernel restart.
importlib.reload(parser)

# Load the "train" split. Judging by the saved cell output, this call also
# uploads entities and predicates to Qdrant.
relation_df, entity_df, docs = parser.synthie_parser("train")

# Distinct (label, URI) pairs, used further down to map URIs back to labels.
entity_set = entity_df.loc[:, ["entity", "entity_uri"]].drop_duplicates()
predicate_set_df = relation_df.loc[:, ["predicate", "predicate_uri"]].drop_duplicates()

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 10743.61it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:06<00:00,  7.42it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:03<00:00,  8.99it/s]


In [2]:
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Populate os.environ from a local .env file before the credentials are read.
load_dotenv()

# Each CallbackHandler option is taken from the environment variable paired with it.
langfuse_handler = CallbackHandler(
    **{
        option: os.getenv(env_var)
        for option, env_var in (
            ("secret_key", "LANGFUSE_SECRET_KEY"),
            ("public_key", "LANGFUSE_PUBLIC_KEY"),
            ("host", "LANGFUSE_HOST"),
        )
    }
)

In [3]:
from langgraph.graph import StateGraph, START, END
from approaches.full_sentence.Gen1.setup import cIEState
from approaches.full_sentence.Gen1.agents.agent_instructor import agent as agent_instructor_agent
from approaches.full_sentence.Gen1.agents.entity_extractor import agent as entity_extraction_agent
from approaches.full_sentence.Gen1.agents.relation_extractor import agent as relation_extraction_agent
from approaches.full_sentence.Gen1.agents.uri_detector import agent as uri_detection_agent
from approaches.full_sentence.Gen1.agents.result_checker import agent as result_checker_agent
from approaches.full_sentence.Gen1.agents.result_formatter import agent as result_formatting_agent
from approaches.full_sentence.Gen1.agents.planner import agent as planner

builder = StateGraph(cIEState)

# Register every agent under its node name, in the same order as before.
for node_name, node_agent in (
    ("planner", planner),
    ("agent_instructor_agent", agent_instructor_agent),
    ("entity_extraction_agent", entity_extraction_agent),
    ("relation_extraction_agent", relation_extraction_agent),
    ("uri_detection_agent", uri_detection_agent),
    ("result_checker_agent", result_checker_agent),
    ("result_formatting_agent", result_formatting_agent),
):
    builder.add_node(node_name, node_agent)

# The planner is the entry point and this is the only static edge declared.
# NOTE(review): hand-off between the other nodes is presumably decided by the
# agents themselves at run time — confirm in the agent modules.
builder.add_edge(START, "planner")

graph = builder.compile()

In [4]:
# Use the row at positional index 1 of `docs` as the running example.
target_doc = docs.iloc[1]
doc_id, text = target_doc["docid"], target_doc["text"]
text

'Ricardo Lumengo is a Swiss politician. He was born in Fribourg and lives in Biel/Bienne. He works in Bern and speaks the Kongo language.'

In [5]:
# Run the agent graph once on the example text, starting from an empty state.
response_state = graph.invoke(
    {
        "text": text,
        "results": [],
        "call_trace": [],
        "comments": [],
        "debug": False,
    },
    config={
        "callbacks": [langfuse_handler],  # trace the run in Langfuse
        "recursion_limit": 100,
    },
)

# Pretty Print Response State

In [6]:
# Header: the input text followed by the per-agent results.
print(f'cIE for text: {response_state["text"]}\n\nResults:')

# call_trace entries are indexable: [0] is the agent id, [1] the instruction.
# The result at the same position in `results` belongs to that call.
for step_no, step in enumerate(response_state["call_trace"]):
    print(f"Agent ID: {step[0]}")
    print(f"Instruction: {step[1]}")
    print(f"Result: {response_state['results'][step_no]}\n\n")

print("Agent Comments:")
for note in response_state["comments"]:
    print(note)

cIE for text: Ricardo Lumengo is a Swiss politician. He was born in Fribourg and lives in Biel/Bienne. He works in Bern and speaks the Kongo language.

Results:
Agent ID: entity_extraction_agent
Instruction: 
Result: -- Entity Extraction Agent --
    
Since there is no specific instruction provided, I will proceed to extract all entities from the given text. Here is the list of extracted entities:

1. **Person**: Ricardo Lumengo
2. **Nationality**: Swiss
3. **Location**: 
   - Fribourg
   - Biel/Bienne
   - Bern
4. **Language**: Kongo

Please let me know if there are any further instructions or if you would like me to proceed with any additional tasks. 


Agent ID: relation_extraction_agent
Instruction: Ricardo Lumengo, Swiss, Fribourg, Biel/Bienne, Bern, Kongo. Consider disambiguating complex relationships such as "born in," "lives in," and "works in."
Result: -- Relation Extraction Agent --
    
To extract the relations from the provided text and follow the given instruction, we need

# Evaluation

In [11]:
def get_uri_labels(df, entities=None, predicates=None):
    """Append ``subject``/``predicate``/``object`` label columns to a triple frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with ``subject_uri``, ``predicate_uri`` and ``object_uri`` columns.
    entities : pandas.DataFrame, optional
        Lookup table with ``entity`` and ``entity_uri`` columns. Defaults to the
        notebook-level ``entity_set``.
    predicates : pandas.DataFrame, optional
        Lookup table with ``predicate`` and ``predicate_uri`` columns. Defaults
        to the notebook-level ``predicate_set_df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with its index reset to 0..n-1, followed by the three label
        columns. A URI with no entry in the lookup table is labelled
        ``"Unknown"``. Object values that are strings containing ``"^^"`` are
        copied through unchanged instead of being looked up.
    """
    if entities is None:
        entities = entity_set
    if predicates is None:
        predicates = predicate_set_df

    def _first_label_by_uri(table, uri_col, label_col):
        # Build the lookup once instead of scanning the table for every row.
        # Keeping the first row per URI matches the former `.values[0]` choice;
        # missing URIs are dropped so that NaN/None can never match a key.
        unique = table.dropna(subset=[uri_col]).drop_duplicates(subset=uri_col)
        return dict(zip(unique[uri_col], unique[label_col]))

    entity_label = _first_label_by_uri(entities, "entity_uri", "entity")
    predicate_label = _first_label_by_uri(predicates, "predicate_uri", "predicate")

    def _object_label(uri):
        # The isinstance guard keeps NaN/None objects from raising TypeError
        # on the substring test; they fall through to "Unknown".
        if isinstance(uri, str) and "^^" in uri:
            return uri
        return entity_label.get(uri, "Unknown")

    subjects = [entity_label.get(uri, "Unknown") for uri in df["subject_uri"]]
    predicate_names = [predicate_label.get(uri, "Unknown") for uri in df["predicate_uri"]]
    objects = [_object_label(uri) for uri in df["object_uri"]]

    labels = pd.DataFrame({"subject": subjects, "predicate": predicate_names, "object": objects})
    return pd.concat([df.reset_index(drop=True), labels], axis=1)

In [12]:
import pandas as pd
from rdflib import Graph, URIRef

# Parse the last entry of the agent results as Turtle into an RDF graph.
result_graph = Graph()
result_graph.parse(data=response_state["results"][-1], format="turtle")

TRIPLE_COLUMNS = ["subject_uri", "predicate_uri", "object_uri"]

# Flatten every (subject, predicate, object) term to its plain string form.
# NOTE(review): str() on a literal keeps only its lexical value, so predicted
# literal objects will not carry a "^^datatype" suffix — confirm this matches
# the object_uri format used in relation_df.
final_result = [[str(subj), str(pred), str(obj)] for subj, pred, obj in result_graph]

# Predicted triples, gold triples of the selected document, and their overlap.
pred_relation_df = pd.DataFrame(final_result, columns=TRIPLE_COLUMNS).drop_duplicates()
doc_relation_df = relation_df[relation_df["docid"] == doc_id][TRIPLE_COLUMNS]
correct_relation_df = pred_relation_df.merge(doc_relation_df, on=TRIPLE_COLUMNS, how="inner")

In [13]:
# Show the predicted triples with label columns appended.
get_uri_labels(pred_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object


In [10]:
# Show the gold triples of the selected document with label columns appended.
get_uri_labels(doc_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1321,http://www.wikidata.org/entity/Q36378,Ricardo_Lumengo,place of origin (Switzerland),Fribourg
1,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P27,http://www.wikidata.org/entity/Q39,Ricardo_Lumengo,country of citizenship,Switzerland
2,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P551,http://www.wikidata.org/entity/Q1034,Ricardo_Lumengo,residence,Biel/Bienne
3,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P937,http://www.wikidata.org/entity/Q70,Ricardo_Lumengo,work location,Bern
4,http://www.wikidata.org/entity/Q677663,http://www.wikidata.org/entity/P1412,http://www.wikidata.org/entity/Q33702,Ricardo_Lumengo,"languages spoken, written or signed",Kongo_language


In [34]:
# Show the predicted triples that also occur in the gold set (inner merge).
# NOTE(review): the saved output below lists a triple that is not among the
# gold triples displayed for this document, and the execution count is out of
# sequence — it appears to stem from an earlier run. Re-run from a fresh kernel.
get_uri_labels(correct_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P682,http://www.wikidata.org/entity/Q4941581,Beta-2_adrenergic_receptor,biological process,Bone_resorption


In [35]:
def evaluate(pred_relation_df, doc_id, verbose=False, gold_relations=None):
    """Score predicted triples against the gold triples of one document.

    Parameters
    ----------
    pred_relation_df : pandas.DataFrame
        Predicted triples with ``subject_uri``, ``predicate_uri`` and
        ``object_uri`` columns.
    doc_id
        Value matched against the ``docid`` column of the gold relations.
    verbose : bool, default False
        Print precision, recall and F1 when true.
    gold_relations : pandas.DataFrame, optional
        Gold relations with ``docid`` plus the three triple columns. Defaults
        to the notebook-level ``relation_df``.

    Returns
    -------
    tuple of (precision, recall, f1_score)
        Each metric is 0.0 when its denominator would be zero (no predictions,
        no gold triples, or no overlap) instead of raising ZeroDivisionError.
    """
    triple_cols = ["subject_uri", "predicate_uri", "object_uri"]
    if gold_relations is None:
        gold_relations = relation_df

    # Compare sets of triples: repeated rows on either side would otherwise be
    # multiplied by the merge and could push precision/recall above 1.
    gold = gold_relations.loc[gold_relations["docid"] == doc_id, triple_cols].drop_duplicates()
    predicted = pred_relation_df[triple_cols].drop_duplicates()
    correct = predicted.merge(gold, on=triple_cols, how="inner")

    precision = len(correct) / len(predicted) if len(predicted) else 0.0
    recall = len(correct) / len(gold) if len(gold) else 0.0
    denominator = precision + recall
    f1_score = 2 * (precision * recall) / denominator if denominator else 0.0

    if verbose:
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1_score}")

    return precision, recall, f1_score

In [36]:
# Score the predictions for the selected document and print the metrics.
# NOTE(review): the saved output below (precision 0.2) does not fit the empty
# prediction table displayed earlier in this notebook; it looks stale — re-run
# from a fresh kernel.
evaluate(pred_relation_df, doc_id, verbose=True)

Precision: 0.2
Recall: 0.3333333333333333
F1: 0.25


(0.2, 0.3333333333333333, 0.25)