In [1]:
import jsonlines
import pandas as pd
import tqdm


def rebel_parser(path, dataset_size):
    data = []

    i = 0
    
    with jsonlines.open(path) as reader:
        with tqdm.tqdm(total=dataset_size) as pbar:
            for obj in reader:
                data.append(obj)
                i += 1
                pbar.update(1)
                if i == dataset_size:
                    break
    
    relation_df = pd.DataFrame([
        {
            "docid": datapoint["docid"],
            "text": datapoint["text"],
            "subject": triple["subject"]["surfaceform"],
            "subject_uri": triple["subject"]["uri"],
            "predicate": triple["predicate"]["surfaceform"],
            "predicate_uri": triple["predicate"]["uri"],
            "object": triple["object"]["surfaceform"],
            "object_uri": triple["object"]["uri"]
        }
        for datapoint in data
        for triple in datapoint["triples"]
    ])
    
    entity_df = pd.DataFrame([
        {
            "docid": datapoint["docid"],
            "text": datapoint["text"],
            "entity": entity["surfaceform"],
            "entity_uri": entity["uri"]
        }
        for datapoint in data
        for entity in datapoint["entities"]
    ])

    return relation_df, entity_df

relation_df, entity_df = rebel_parser('./datasets/REBEL/en_train.jsonl', 10)

100%|██████████| 10/10 [00:00<00:00, 9929.70it/s]


In [2]:
relation_df.head()

Unnamed: 0,docid,text,subject,subject_uri,predicate,predicate_uri,object,object_uri
0,30111982,The Philippine one hundred-peso note (Filipino...,Philippine one hundred-peso note,Q7185360,face value,P3934,100,+100^^http://www.w3.org/2001/XMLSchema#decimal
1,30111982,The Philippine one hundred-peso note (Filipino...,Manuel A. Roxas,Q223419,position held,P39,Philippine president,Q1209571
2,30111996,The Philippine five hundred-peso note (Filipin...,Philippine five hundred-peso note,Q7185306,face value,P3934,500,+500^^http://www.w3.org/2001/XMLSchema#decimal
3,30111996,The Philippine five hundred-peso note (Filipin...,Corazon Aquino,Q1480,position held,P39,President,Q1209571
4,30112002,The Philippine one thousand-peso note (Filipin...,Philippine one thousand-peso note,Q7185363,face value,P3934,1000,+1000^^http://www.w3.org/2001/XMLSchema#decimal


In [3]:
entity_df.head()

Unnamed: 0,docid,text,entity,entity_uri
0,30111982,The Philippine one hundred-peso note (Filipino...,Filipino,Q33298
1,30111982,The Philippine one hundred-peso note (Filipino...,denomination,Q918448
2,30111982,The Philippine one hundred-peso note (Filipino...,Philippine currency,Q17193
3,30111982,The Philippine one hundred-peso note (Filipino...,Philippine president,Q1209571
4,30111982,The Philippine one hundred-peso note (Filipino...,Manuel A. Roxas,Q223419


In [4]:
predicate_set_df = relation_df[["predicate", "predicate_uri"]].drop_duplicates()
predicate_set_df

Unnamed: 0,predicate,predicate_uri
0,face value,P3934
1,position held,P39
5,country,P17
7,has list,P2354
10,country of origin,P495
11,genre,P136
12,location of formation,P740
13,inception,P571
15,work period (start),P2031
16,start time,P580


In [23]:
from langgraph.types import Command
from typing import Literal
from langchain_ollama.chat_models import ChatOllama
from langgraph.graph import StateGraph, MessagesState, START, END
from langchain_ollama import OllamaEmbeddings

model = ChatOllama(model="phi4")
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [24]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

documents = []
entity_set = entity_df[['entity', 'entity_uri']].drop_duplicates()
for index, row in entity_set.iterrows():
    documents.append(Document(
        page_content=row["entity"],
        metadata={"uri": row["entity_uri"]},
    ))
    
vector_store.add_documents(documents=documents)

['3f2d94a0-02a0-450d-917a-40e21a8d754d',
 '411a6589-adee-4cef-b8f2-8fbe4bb2f716',
 '31558032-0b58-4317-b611-f2231648e74d',
 '5ec09fd4-ab74-4622-b598-247a1f172b5d',
 '445077c6-296c-4583-ad31-e0e8eec80a0d',
 'ca5985dc-b4c5-4511-91c6-53165f8d68d1',
 '8fa85b3a-20d3-4f68-8852-b1d023b5d3e9',
 '89efd20b-46d5-4c25-a325-9b0559c16028',
 'a2db8aa0-6409-46db-b52e-73415c214253',
 'b5524f4e-dee6-45fb-844c-28245f96b028',
 'b2eea585-975d-49db-b26e-c5ca47b5c83d',
 '0a291a0b-4cc3-424f-9d9a-7be50addac52',
 '4a6f0737-ffe5-4f37-aaee-72a97e06513a',
 '620ab2f8-44aa-4c5e-ac6b-5952e151e562',
 'f792cce2-77d7-4b83-b9c5-8da0b6499068',
 'e668c4f0-d51e-4d4c-b516-d2c7a94e3724',
 '92912eb0-c6f2-4aae-9322-49c74bf40648',
 '1a176b46-474c-4acb-92b6-d17df4f1f004',
 '15d7702a-5689-4b97-aec9-dde6397644c3',
 '67372493-8ee8-44ab-bc6f-ee09e266018e',
 '96681a3e-08a7-405c-b89c-67cf634910d2',
 '268db40e-8e41-4715-8cbd-d53b768dc617',
 'ba53b992-bb23-4458-a63f-3b175cc94668',
 'b98cd471-01d2-4666-8674-855e0189ea54',
 'e6e322c4-4ade-

In [25]:
from langchain_core.prompts import PromptTemplate
import re

prompt_template = PromptTemplate.from_template("""
You are an agent to extract entities out of a given text. The entities will be linked to a knowledge graph later. You should return a list of any explicit and implicit entities found in the text. Please enclose the list with <list> and </list> tags and the entities with <entity> and </entity> tags.

Text: {text}
""")

doc_id = relation_df["docid"].iloc[0]
doc_relation_df = relation_df[relation_df["docid"] == doc_id]
prompt = prompt_template.invoke({"text": doc_relation_df["text"].iloc[0]})
print(prompt)
output = model.invoke(prompt)
print(output.content)
entity_list = re.findall(r'<entity>(.*?)</entity>', output.content)
entity_list = [vector_store.similarity_search(x, k=1)[0] for x in entity_list]
entity_set_df = pd.DataFrame({"entity": [x.page_content for x in entity_list], "entity_uri": [x.metadata["uri"] for x in entity_list]})
entity_set_df

text='\nYou are an agent to extract entities out of a given text. The entities will be linked to a knowledge graph later. You should return a list of any explicit and implicit entities found in the text. Please enclose the list with <list> and </list> tags and the entities with <entity> and </entity> tags.\n\nText: The Philippine one hundred-peso note (Filipino: "Sandaang Piso") (₱100) is a denomination of Philippine currency. Philippine president Manuel A. Roxas is currently featured on the front side of the bill, while the Mayon Volcano and the whale shark (locally known as "butanding") are featured on the reverse side.\n'
<list>
  <entity>The Philippine one hundred-peso note</entity>
  <entity>Sandaang Piso</entity>
  <entity>₱100</entity>
  <entity>Philippine currency</entity>
  <entity>Philippines</entity>
  <entity>Manuel A. Roxas</entity>
  <entity>president</entity>
  <entity>front side of the bill</entity>
  <entity>Mayon Volcano</entity>
  <entity>reverse side of the bill</en

Unnamed: 0,entity,entity_uri
0,Philippine one hundred-peso note,Q7185360
1,Philippine tarsier,Q536862
2,Philippine one hundred-peso note,Q7185360
3,Philippine currency,Q17193
4,Filipino,Q33298
5,Manuel A. Roxas,Q223419
6,President,Q1209571
7,Philippine one hundred-peso note,Q7185360
8,Mayon Volcano,Q1484
9,Philippine one hundred-peso note,Q7185360


In [26]:
prompt_template = PromptTemplate.from_template(
    """
    
You are a relation extraction agent. Your task is to read a given text (along with a given list of possible entities and relations) and extract the relations found in the text. Each relation should be written in this exact format: <relation>subject;predicate;object</relation>. Please output just the relations using the URIs, nothing else.

Example:

Entity List: 
    entity           entity_uri
0   Olaf Scholz      Q1
1   Germany          Q2

Relation Set: 
    predicate           predicate_uri
0   is chancellor of    P1

Text:
Olaf Scholz is chancellor of Germany.

Expected Output:
<relation>Q1;P1;Q2</relation>

-- Start of input --

Entity Set: 
{entity_set}

Relation Set: 
{relation_set}

Text: 
{text}
    
    """)
prompt = prompt_template.invoke({"text": relation_df["text"].iloc[0], "relation_set": predicate_set_df.to_string(), "entity_set": entity_set_df.to_string()})
print(prompt.text)
output = model.invoke(prompt)
print(output.content)
relation_list = re.findall(r'<relation>(.*?)</relation>', output.content)
relation_list = [relation.split(";") for relation in relation_list]
print(relation_list)


    
You are a relation extraction agent. Your task is to read a given text (along with a given list of possible entities and relations) and extract the relations found in the text. Each relation should be written in this exact format: <relation>subject;predicate;object</relation>. Please output just the relations using the URIs, nothing else.

Example:

Entity List: 
    entity           entity_uri
0   Olaf Scholz      Q1
1   Germany          Q2

Relation Set: 
    predicate           predicate_uri
0   is chancellor of    P1

Text:
Olaf Scholz is chancellor of Germany.

Expected Output:
<relation>Q1;P1;Q2</relation>

-- Start of input --

Entity Set: 
                              entity entity_uri
0   Philippine one hundred-peso note   Q7185360
1                 Philippine tarsier    Q536862
2   Philippine one hundred-peso note   Q7185360
3                Philippine currency     Q17193
4                           Filipino     Q33298
5                    Manuel A. Roxas    Q223419
6 

In [27]:
pred_relation_df = pd.DataFrame(relation_list, columns=["subject_uri", "predicate_uri", "object_uri"])
pred_relation_df

Unnamed: 0,subject_uri,predicate_uri,object_uri
0,Q7185360,P3934,Philippine peso
1,Q7185360,P17,Q2
2,Q7185360,P2354,Q3
3,Q223419,P39,Q6
4,Q1484,P361,Philippines
5,Q80378,P495,Philippines
6,Q7185360,P3934,Philippine peso
7,Q7185360,P17,Q2
8,Q7185360,P2354,Q3
9,Q223419,P39,Q6


In [32]:
correct_relation_df = pred_relation_df.merge(doc_relation_df[["subject_uri", "predicate_uri", "object_uri"]], on=["subject_uri", "predicate_uri", "object_uri"], how="inner")
correct_relation_df

Unnamed: 0,subject_uri,predicate_uri,object_uri


In [33]:
print(f"Accuracy: {len(correct_relation_df) / len(pred_relation_df)}")
print(f"Precision: {len(correct_relation_df) / len(doc_relation_df)}")

Accuracy: 0.0
Precision: 0.0


In [35]:
relation_df

Unnamed: 0,docid,text,subject,subject_uri,predicate,predicate_uri,object,object_uri
0,30111982,The Philippine one hundred-peso note (Filipino...,Philippine one hundred-peso note,Q7185360,face value,P3934,100,+100^^http://www.w3.org/2001/XMLSchema#decimal
1,30111982,The Philippine one hundred-peso note (Filipino...,Manuel A. Roxas,Q223419,position held,P39,Philippine president,Q1209571
2,30111996,The Philippine five hundred-peso note (Filipin...,Philippine five hundred-peso note,Q7185306,face value,P3934,500,+500^^http://www.w3.org/2001/XMLSchema#decimal
3,30111996,The Philippine five hundred-peso note (Filipin...,Corazon Aquino,Q1480,position held,P39,President,Q1209571
4,30112002,The Philippine one thousand-peso note (Filipin...,Philippine one thousand-peso note,Q7185363,face value,P3934,1000,+1000^^http://www.w3.org/2001/XMLSchema#decimal
5,30112004,Havnar Róðrarfelag is a Faroese rowing club in...,Tórshavn,Q10704,country,P17,Faroese,Q4628
6,30111989,The Philippine two hundred-peso note (Filipino...,Philippine two hundred-peso note,Q7185399,face value,P3934,200,+200^^http://www.w3.org/2001/XMLSchema#decimal
7,30111989,The Philippine two hundred-peso note (Filipino...,Philippine president,Q1209571,has list,P2354,14th President of the Philippines,Q269860
8,30111989,The Philippine two hundred-peso note (Filipino...,Diosdado Macapagal,Q312539,position held,P39,Philippine president,Q1209571
9,30111989,The Philippine two hundred-peso note (Filipino...,President Gloria Macapagal-Arroyo,Q123665,position held,P39,Philippine president,Q1209571
