# Setup

In [8]:
import helper_tools.parser as parser
import importlib
import pandas as pd

# Re-import the parser module so local edits are picked up without a kernel restart.
importlib.reload(parser)

# Load the "train" split. Per the cell output, this call also uploads entities
# and predicates to Qdrant as a side effect.
relation_df, entity_df, docs = parser.synthie_parser("train")

# Unique (label, URI) pairs; later cells use them to turn URIs into readable labels.
entity_set = entity_df.loc[:, ["entity", "entity_uri"]].drop_duplicates()
predicate_set_df = relation_df.loc[:, ["predicate", "predicate_uri"]].drop_duplicates()

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 2576.67it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:05<00:00,  7.79it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:02<00:00, 10.24it/s]


In [9]:
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Load variables from the dotenv file before the Langfuse settings are looked up.
load_dotenv()

# Tracing callback passed to graph runs; each argument is None when its
# environment variable is unset.
langfuse_handler = CallbackHandler(
    host=os.environ.get("LANGFUSE_HOST"),
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY"),
)

In [26]:
# Select one document (positional row 5) to run through the pipeline.
target_doc = docs.iloc[5]
doc_id, text = target_doc["docid"], target_doc["text"]

# Echo the sentence so the input is visible next to the results below.
text

'Beta2-adrenergic agonists are agonists of the beta-2 adrenergic receptor, which is found in early modern humans and is involved in bone resorption.'

In [28]:
from langgraph.graph import StateGraph, START, END
from approaches.full_sentence.Gen1.setup import cIEState
from approaches.full_sentence.Gen1.agents.agent_instructor import agent as agent_instructor_agent
from approaches.full_sentence.Gen1.agents.entity_extractor import agent as entity_extraction_agent
from approaches.full_sentence.Gen1.agents.relation_extractor import agent as relation_extraction_agent
from approaches.full_sentence.Gen1.agents.uri_detector import agent as uri_detection_agent
from approaches.full_sentence.Gen1.agents.result_checker import agent as result_checker_agent
from approaches.full_sentence.Gen1.agents.result_formatter import agent as result_formatting_agent
from approaches.full_sentence.Gen1.agents.planner import agent as planner

# Assemble the workflow graph over the shared cIEState.
builder = StateGraph(cIEState)

# Register every agent under the node name it is addressed by in the graph.
for node_name, node_fn in (
    ("planner", planner),
    ("agent_instructor_agent", agent_instructor_agent),
    ("entity_extraction_agent", entity_extraction_agent),
    ("relation_extraction_agent", relation_extraction_agent),
    ("uri_detection_agent", uri_detection_agent),
    ("result_checker_agent", result_checker_agent),
    ("result_formatting_agent", result_formatting_agent),
):
    builder.add_node(node_name, node_fn)

# The planner is the entry point. No other static edges are declared in this
# cell; presumably the nodes route to each other at runtime — TODO confirm.
builder.add_edge(START, "planner")

graph = builder.compile()

In [29]:
# Run the workflow on the selected text, starting from empty accumulators and
# passing the Langfuse handler as a callback.
response_state = graph.invoke(
    {
        "text": text,
        "results": [],
        "call_trace": [],
        "comments": [],
        "debug": False,
    },
    config={"callbacks": [langfuse_handler]},
)

  Expected `int` but got `float` with value `1742369741.4696937` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369742.1808693` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369742.8804078` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369743.7012672` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369744.58361` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369745.972598` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `int` but got `float` with value `1742369747.3394923` - 

# Pretty Print Response State

In [30]:
# Header: the input text the pipeline was run on.
print(f"""cIE for text: {response_state["text"]}

Results:""")

# call_trace entries are indexed as (agent id, instruction); the result of the
# n-th call is read from the same position in `results`.
for step, trace_entry in enumerate(response_state["call_trace"]):
    print(f"Agent ID: {trace_entry[0]}")
    print(f"Instruction: {trace_entry[1]}")
    print(f"Result: {response_state['results'][step]}\n\n")

print("Agent Comments:")
for agent_comment in response_state["comments"]:
    print(agent_comment)

cIE for text: Beta2-adrenergic agonists are agonists of the beta-2 adrenergic receptor, which is found in early modern humans and is involved in bone resorption.

Results:
Agent ID: entity_extraction_agent
Instruction: 
Result: -- Entity Extraction Agent --
    
Since there is no specific instruction provided by the agent_instructor, I will proceed with extracting entities from the given text.

Here is the list of extracted entities:

1. Beta2-adrenergic agonists (chemical compound)
2. Beta-2 adrenergic receptor (biological receptor)
3. Early modern humans (species/historical group)
4. Bone (anatomical structure)

Please let me know if I should proceed with any further instructions or if the agent_instructor has any additional guidance. 


Agent ID: entity_extraction_agent
Instruction: Disambiguate the extracted entities, especially "Bone", and identify their specific types to facilitate accurate mapping to the knowledge graph.
Result: -- Entity Extraction Agent --
    
To extract enti

# Evaluation

In [3]:
def get_uri_labels(df, entities=None, predicates=None):
    """Append human-readable subject/predicate/object labels to a triple frame.

    Args:
        df: DataFrame with ``subject_uri``, ``predicate_uri`` and
            ``object_uri`` columns.
        entities: DataFrame with ``entity`` and ``entity_uri`` columns.
            Defaults to the notebook-level ``entity_set``.
        predicates: DataFrame with ``predicate`` and ``predicate_uri``
            columns. Defaults to the notebook-level ``predicate_set_df``.

    Returns:
        A new DataFrame: ``df`` with its index reset, plus ``subject``,
        ``predicate`` and ``object`` columns. URIs missing from the lookup
        tables are labelled ``"Unknown"``. Object values containing ``"^^"``
        are passed through unchanged instead of being looked up.
    """
    # Fall back to the notebook-level lookup tables when none are supplied.
    if entities is None:
        entities = entity_set
    if predicates is None:
        predicates = predicate_set_df

    # Build URI -> label maps once instead of scanning the frames for every row.
    # Missing URIs are dropped (they can never match), and the first label per
    # URI wins, matching the previous `.values[0]` behaviour.
    entity_labels = (
        entities.dropna(subset=["entity_uri"])
        .drop_duplicates(subset="entity_uri")
        .set_index("entity_uri")["entity"]
        .to_dict()
    )
    predicate_labels = (
        predicates.dropna(subset=["predicate_uri"])
        .drop_duplicates(subset="predicate_uri")
        .set_index("predicate_uri")["predicate"]
        .to_dict()
    )

    def _object_label(uri):
        # Only strings can carry the "^^" marker; None/NaN fall through to the
        # lookup and end up as "Unknown" instead of raising a TypeError.
        if isinstance(uri, str) and "^^" in uri:
            return uri
        return entity_labels.get(uri, "Unknown")

    labels = pd.DataFrame(
        {
            "subject": [entity_labels.get(uri, "Unknown") for uri in df["subject_uri"]],
            "predicate": [predicate_labels.get(uri, "Unknown") for uri in df["predicate_uri"]],
            "object": [_object_label(uri) for uri in df["object_uri"]],
        }
    )
    # Reset the index so the positional label frame lines up row by row.
    return pd.concat([df.reset_index(drop=True), labels], axis=1)

In [31]:
import pandas as pd
from rdflib import Graph, URIRef

# Parse the last pipeline result as Turtle into an RDF graph.
result_graph = Graph()
result_graph.parse(data=response_state["results"][-1], format="turtle")

TRIPLE_COLUMNS = ["subject_uri", "predicate_uri", "object_uri"]

# Flatten every triple to plain strings so it can be compared with the gold data.
# NOTE(review): str() on an rdflib Literal keeps only the lexical value, so typed
# literals would not match gold objects written as `value^^datatype` — confirm
# how the gold data encodes literals.
final_result = [[str(subj), str(pred), str(obj)] for subj, pred, obj in result_graph]

# Predicted triples, gold triples for the selected document, and their overlap.
pred_relation_df = pd.DataFrame(final_result, columns=TRIPLE_COLUMNS).drop_duplicates()
doc_relation_df = relation_df[relation_df["docid"] == doc_id][TRIPLE_COLUMNS]
correct_relation_df = pred_relation_df.merge(doc_relation_df, on=TRIPLE_COLUMNS, how="inner")

In [32]:
# Predicted triples with readable labels ("Unknown" = URI not in the lookup tables).
get_uri_labels(pred_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P682,http://www.wikidata.org/entity/Q4941581,Beta-2_adrenergic_receptor,biological process,Bone_resorption
1,http://www.wikidata.org/entity/Q423482,http://www.wikidata.org/entity/P1289,http://www.wikidata.org/entity/Q4941581,Beta2-adrenergic_agonist,Unknown,Bone_resorption
2,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P361,http://www.wikidata.org/entity/Q5,Beta-2_adrenergic_receptor,Unknown,Human
3,http://www.wikidata.org/entity/Q423482,http://www.wikidata.org/entity/P129,http://www.wikidata.org/entity/Q287961,Beta2-adrenergic_agonist,Unknown,Beta-2_adrenergic_receptor
4,http://www.wikidata.org/entity/Q4941581,http://www.wikidata.org/entity/P582,http://www.wikidata.org/entity/Q4941581,Bone_resorption,Unknown,Bone_resorption


In [33]:
# Gold triples for the selected document, with readable labels.
get_uri_labels(doc_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q423482,http://www.wikidata.org/entity/P3772,http://www.wikidata.org/entity/Q287961,Beta2-adrenergic_agonist,agonist of,Beta-2_adrenergic_receptor
1,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P682,http://www.wikidata.org/entity/Q4941581,Beta-2_adrenergic_receptor,biological process,Bone_resorption
2,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P703,http://www.wikidata.org/entity/Q15978631,Beta-2_adrenergic_receptor,found in taxon,Early_modern_human


In [34]:
# Predicted triples that exactly match a gold triple.
get_uri_labels(correct_relation_df)

Unnamed: 0,subject_uri,predicate_uri,object_uri,subject,predicate,object
0,http://www.wikidata.org/entity/Q287961,http://www.wikidata.org/entity/P682,http://www.wikidata.org/entity/Q4941581,Beta-2_adrenergic_receptor,biological process,Bone_resorption


In [35]:
def evaluate(pred_relation_df, doc_id, verbose=False, relations=None):
    """Score predicted triples for one document against its gold triples.

    A prediction counts as correct only when its ``subject_uri``,
    ``predicate_uri`` and ``object_uri`` all equal those of a gold triple.
    Both sides are de-duplicated first, so repeated rows cannot inflate the
    match count and push precision above 1.

    Args:
        pred_relation_df: DataFrame holding at least the three URI columns.
        doc_id: Value matched against the ``docid`` column of the gold data.
        verbose: When True, print precision, recall and F1.
        relations: Gold relation DataFrame with ``docid`` and the three URI
            columns. Defaults to the notebook-level ``relation_df``.

    Returns:
        Tuple ``(precision, recall, f1_score)``. Precision is 0.0 when there
        are no predictions, recall is 0.0 when the document has no gold
        triples, and F1 is 0.0 when both are zero.
    """
    triple_cols = ["subject_uri", "predicate_uri", "object_uri"]

    # Fall back to the notebook-level gold relations when none are supplied.
    if relations is None:
        relations = relation_df

    predicted = pred_relation_df[triple_cols].drop_duplicates()
    gold = relations.loc[relations["docid"] == doc_id, triple_cols].drop_duplicates()

    # Skip the merge when either side is empty; there is nothing to match.
    if len(predicted) and len(gold):
        n_correct = len(predicted.merge(gold, on=triple_cols, how="inner"))
    else:
        n_correct = 0

    # Guard both ratios against empty denominators instead of raising
    # ZeroDivisionError.
    precision = n_correct / len(predicted) if len(predicted) else 0.0
    recall = n_correct / len(gold) if len(gold) else 0.0
    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    if verbose:
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1_score}")

    return precision, recall, f1_score

In [36]:
# Score the predictions for the selected document and print the metrics.
evaluate(pred_relation_df, doc_id, verbose=True)

Precision: 0.2
Recall: 0.3333333333333333
F1: 0.25


(0.2, 0.3333333333333333, 0.25)