# Entity Extraction Agent

In [21]:
import helper_tools.parser as parser
import importlib
import pandas as pd

# Re-import helper_tools.parser so edits made to the module since it was first
# imported are picked up without restarting the kernel.
importlib.reload(parser)

# Parse the "train" split and unpack the three results. The recorded output
# below shows this cell fetching files and uploading entities and predicates
# to Qdrant, so re-running it is not free.
train_split = parser.synthie_parser("train")
relation_df, entity_df, docs = train_split

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 4549.63it/s]


Uploading Entities to Qdrant.


100%|██████████| 46/46 [00:06<00:00,  7.46it/s]


Uploading Predicates to Qdrant.


100%|██████████| 29/29 [00:03<00:00,  8.27it/s]


In [17]:
from langchain_openai import ChatOpenAI
from langchain_ollama.embeddings import OllamaEmbeddings
from langfuse.callback import CallbackHandler
from dotenv import load_dotenv
import os

# Read settings from a .env file before any of the keys below are looked up.
load_dotenv()

# Langfuse callback handler; it is handed to chain invocations as a callback.
langfuse_handler = CallbackHandler(
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY"),
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY"),
    host=os.environ.get("LANGFUSE_HOST"),
)

# Chat model: Llama 3.3 70B Instruct reached through the SambaNova endpoint
# using the ChatOpenAI client.
model = ChatOpenAI(
    model_name="Meta-Llama-3.3-70B-Instruct",
    base_url="https://api.sambanova.ai/v1",
    api_key=os.environ.get("SAMBANOVA_API_KEY"),
)

# Ollama embedding model (not referenced by the other cells visible here).
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [18]:
# Use the first parsed document as the working example for the cells below.
target_doc = docs.iloc[0]
doc_id, text = target_doc["docid"], target_doc["text"]

# Display the example text as the cell output.
text

'Corfe Castle railway station is a station on the Swanage Railway in the village of Corfe Castle, in the United Kingdom.'

# Development Space

In [19]:
from langchain_core.prompts import PromptTemplate
import re


def entity_extraction_agent(state):
    """Extract the entities mentioned in a text by prompting the chat model.

    Fills the extraction prompt from ``state``, runs it through the
    module-level ``model`` with ``langfuse_handler`` attached as a callback,
    prints the raw model reply, and returns the part of the reply that the
    model wrapped in ``<result>`` tags.

    Args:
        state: Mapping holding the prompt inputs. ``"text"`` is the text to
            extract entities from and is required by the prompt template.
            ``"instruction"`` is optional; when it is missing or ``None`` an
            empty instruction is sent instead. Any other keys are passed
            through to the chain unchanged.

    Returns:
        str: The contents of the first ``<result>...</result>`` block in the
        reply with surrounding whitespace removed, or ``""`` when the reply
        contains no such block.
    """
    prompt = PromptTemplate.from_template("""
    
    You are an expert for entity extraction out of text in a multi-agent-system for closed information extraction. You will receive a text out of the state from which you should extract all entities. In addition, the agent_instructor might give you an instruction, which you should follow. Your task is then to follow the optional instruction as well as this system prompt and return a comma separated list of entities that are in the text, which is enclosed in <result>insert list here</result>. 
    
    The provided input text: {text}
    Instruction: {instruction}
    
    """)

    # The prompt describes the instruction as optional, but the template still
    # needs a value for it: default to an empty string so a state without an
    # instruction (or with instruction=None) does not fail or render "None".
    inputs = {**state, "instruction": state.get("instruction") or ""}

    response_chain = prompt | model
    response = response_chain.invoke(inputs, config={"callbacks": [langfuse_handler]})

    # Echo the full reply so the model's reasoning is visible in the cell output.
    print(response.content)

    # DOTALL lets the result block span several lines; strip the whitespace
    # that such a block carries around the actual list.
    result_match = re.search(r'<result>(.*?)</result>', response.content, re.DOTALL)
    if result_match is None:
        return ""
    return result_match.group(1).strip()

In [20]:
# Stand-in for the instruction that the agent_instructor would supply.
mock_instruction = "\nExtract entities from the provided preprocessed text.\n"

# Run the agent on the example text and show the extracted entity list.
agent_state = {"text": text, "instruction": mock_instruction}
response = entity_extraction_agent(agent_state)
response

To extract entities from the provided text, I will identify and list the names of locations, organizations, and other relevant entities mentioned in the text. 

The entities extracted from the text are: Corfe Castle railway station, Swanage Railway, Corfe Castle, United Kingdom.

<result>Corfe Castle railway station, Swanage Railway, Corfe Castle, United Kingdom</result>


'Corfe Castle railway station, Swanage Railway, Corfe Castle, United Kingdom'