# URI Detection Agent

In [1]:
import helper_tools.parser as parser
import importlib
import pandas as pd

# Re-execute the already-imported parser module so that edits made to it
# since the kernel started take effect without a kernel restart.
importlib.reload(parser)

# Parse the "train" split into relation, entity and document frames.
# NOTE: per the recorded cell output, this call also uploads entities and
# predicates to Qdrant, so re-running the cell repeats that upload.
relation_df, entity_df, docs = parser.synthie_parser("train")

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 12164.45it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:06<00:00,  7.52it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:03<00:00,  8.84it/s]


In [2]:
from langchain_openai import ChatOpenAI
from langchain_ollama.embeddings import OllamaEmbeddings
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Load variables from a .env file into the environment so the os.getenv
# lookups below can resolve them.
load_dotenv()
# Langfuse tracing callback. os.getenv returns None for an unset variable, so
# a missing key is passed through as None rather than raising here.
# NOTE(review): in the cells visible here the handler is never passed to a
# chain invocation — confirm whether tracing is meant to be attached.
langfuse_handler = CallbackHandler(
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    host=os.getenv("LANGFUSE_HOST"),
)

# Chat model reached through the SambaNova endpoint using the OpenAI client class.
model = ChatOpenAI(model_name="Meta-Llama-3.3-70B-Instruct", base_url="https://api.sambanova.ai/v1", api_key=os.getenv("SAMBANOVA_API_KEY"))
# Embedding model used by the vector store for similarity search
# (presumably requires a running Ollama instance with this model — confirm).
embeddings = OllamaEmbeddings(model='nomic-embed-text')

In [3]:
# Pick the first parsed document as the working example and pull out the
# two fields used later on.
target_doc = docs.iloc[0]
doc_id, text = target_doc["docid"], target_doc["text"]
# Bare last expression so the notebook renders the document text.
text

'Corfe Castle railway station is a station on the Swanage Railway in the village of Corfe Castle, in the United Kingdom.'

# Development Space

In [4]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Connect to the Qdrant instance expected on localhost:6333.
client = QdrantClient("localhost", port=6333)
# Wrap the "wikidata_labels" collection as a LangChain vector store, using
# the `embeddings` object from the model-setup cell to embed queries.
# NOTE(review): presumably this collection is the one filled by the parser
# upload in the first cell — confirm the collection name matches.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="wikidata_labels",
    embedding=embeddings
)

In [10]:
from langchain_core.prompts import PromptTemplate

def uri_detection_agent(state):
    """Search the vector store for each term and let the model map terms to URIs.

    The comma-separated ``state["instruction"]`` is split into search terms.
    For every term the three most similar entries of the module-level
    ``vector_store`` are collected (label and URI), rendered as text, and
    handed to the module-level ``model`` with a formatting prompt.

    Args:
        state: Mapping with an ``"instruction"`` key holding a comma-separated
            string of search terms.

    Returns:
        Whatever ``(prompt_template | model).invoke(...)`` returns; callers in
        this notebook read its ``.content`` attribute.

    Raises:
        KeyError: If ``state`` has no ``"instruction"`` key, or a search hit
            has no ``"uri"`` entry in its metadata.
    """
    # Strip surrounding whitespace so "a, b" searches for "b" rather than " b",
    # and drop empty entries produced by stray or trailing commas.
    search_terms = [
        term.strip() for term in state["instruction"].split(",") if term.strip()
    ]

    sections = []
    for term in search_terms:
        matches = [
            {"label": doc.page_content, "uri": doc.metadata["uri"]}
            for doc in vector_store.similarity_search(term, k=3)
        ]
        sections.append(f"Most Similar Detection Results for {term}:{matches}\n\n")
    # Put each candidate on its own line to make the list easier to read.
    tool_output = "".join(sections).replace("},", "},\n")

    prompt_template = PromptTemplate.from_template(
        """
        You are a formatting agent. Your task is to check and format the output of the URI detection tool. The tool will give a response like this:
        Most Similar Detection Result for Olaf Scholz: ('label': Angela Merkel, 'uri': 'http://www.wikidata.org/entity/Q567)
        
        Your task is to check the response and output an overall mapping of search terms to URIs. If something doesn't match, please response the non mapping search term with the advise, that those might not be present in the knowledge graph.
        
        URI detection tool response:
        
        {response}
        """
    )

    chain = prompt_template | model
    # Keep the model reply under its own name instead of reusing the tool text's.
    model_reply = chain.invoke({"response": tool_output})

    return model_reply

In [12]:
# Comma-separated search terms taken from the example document's entities.
mock_instruction = "Corfe Castle railway station, Swanage Railway, Corfe Castle, United Kingdom"
mock_state = {"instruction": mock_instruction}

# Run the agent once on the mock state and show the model's text answer.
response = uri_detection_agent(mock_state)
print(response.content)

After checking the responses from the URI detection tool, I've compiled an overall mapping of search terms to URIs. Here are the results:

* Corfe Castle railway station: http://www.wikidata.org/entity/Q5170476
* Swanage Railway: http://www.wikidata.org/entity/Q7653559
* Corfe Castle: http://www.wikidata.org/entity/Q1236511
* United Kingdom: http://www.wikidata.org/entity/Q145

The following search terms have multiple possible URIs, but I've selected the one that seems to be the most relevant based on the label:
* Corfe Castle railway station: The tool also returned URIs for Corfe Castle (http://www.wikidata.org/entity/Q1236511) and Corfe Castle (village) (http://www.wikidata.org/entity/Q13341461), but the most relevant one seems to be http://www.wikidata.org/entity/Q5170476.
* Corfe Castle: The tool also returned URIs for Corfe Castle (village) (http://www.wikidata.org/entity/Q13341461) and Corfe Castle railway station (http://www.wikidata.org/entity/Q5170476), but the most relevant o