## Disambiguation

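In this notebook, we disambiguate a single entity mentioned in annotated text: we ask an OpenAI model with web search to identify the entity's Wikidata ID, parse the JSON it returns, and render the text with displaCy so that the entity is labelled with a link to its Wikidata entry.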


In [1]:
import sys
# Add the parent directory to the module search path (presumably so that
# project-local modules can be imported; none are imported in the visible cells).
sys.path.append("..")

In [2]:
from dotenv import load_dotenv
import os
from openai import OpenAI
from pydantic import BaseModel
import json

In [3]:
# load_dotenv() runs before the lookup so the key can be read from the
# environment instead of being hard-coded in the notebook.
# OPENAI_API_KEY is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [20]:
import re
import spacy
from spacy.tokens import Doc, Span


def _parse_annotated_text(text):
    """
    Split ``[Entity](LABEL)`` markup into words, trailing-space flags and entity spans.

    Args:
        text (str): Text with annotations like "[Tom](PERSON) worked for [Microsoft](ORGANIZATION)"

    Returns:
        tuple: ``(words, spaces, entity_spans)`` where ``words`` are the
        whitespace-delimited tokens with the markup removed, ``spaces[i]`` is True
        when ``words[i]`` is followed by whitespace in the markup-free text, and
        ``entity_spans`` is a list of ``(start_token_idx, end_token_idx, label)``
        tuples with an exclusive end index.
    """
    # Pattern to match [text](LABEL) format
    pattern = r'\[([^\]]+)\]\(([^)]+)\)'

    # Cut the input into (segment_text, label) pieces; label is None for plain text.
    segments = []
    last_end = 0
    for match in re.finditer(pattern, text):
        segments.append((text[last_end:match.start()], None))
        segments.append((match.group(1), match.group(2)))
        last_end = match.end()
    segments.append((text[last_end:], None))

    # The text as it reads once the markup is stripped; used to decide which
    # tokens are really followed by whitespace.
    plain_text = "".join(segment for segment, _ in segments)

    words = []
    word_ends = []  # end offset of each word inside plain_text
    entity_spans = []
    offset = 0
    for segment, label in segments:
        first_word_idx = len(words)
        # Tokenizing per segment keeps an entity separate from adjacent
        # punctuation, e.g. "[Rome](LOCATION)." -> "Rome", ".".
        for word in re.finditer(r'\S+', segment):
            words.append(word.group())
            word_ends.append(offset + word.end())
        # An annotation containing only whitespace produces no tokens and
        # therefore no (empty) span.
        if label is not None and len(words) > first_word_idx:
            entity_spans.append((first_word_idx, len(words), label))
        offset += len(segment)

    spaces = [
        end < len(plain_text) and plain_text[end].isspace()
        for end in word_ends
    ]
    if spaces:
        # Trailing whitespace of the input is not carried into the Doc.
        spaces[-1] = False

    return words, spaces, entity_spans


def annotated_text_to_spacy_doc(text, nlp=None):
    """
    Converts annotated text in format [Entity](LABEL) to a spaCy Doc with entity spans.

    Tokens are split on whitespace and at annotation boundaries. A token is
    followed by a space in the Doc only if it is followed by whitespace in the
    input (runs of whitespace collapse to a single space), so punctuation that
    touches an entity stays attached in ``doc.text``.

    Args:
        text (str): Text with annotations like "[Tom](PERSON) worked for [Microsoft](ORGANIZATION)"
        nlp (spacy.Language, optional): spaCy language model. If None, uses blank English model.
            Only ``nlp.vocab`` is used; the pipeline itself is not modified.

    Returns:
        spacy.tokens.Doc: spaCy document with entity spans set

    Example:
        >>> text = "[Tom](PERSON) worked for [Microsoft](ORGANIZATION) in 2020 before he lived in [Rome](LOCATION)."
        >>> doc = annotated_text_to_spacy_doc(text)
        >>> spacy.displacy.render(doc, style="ent")
    """
    if nlp is None:
        nlp = spacy.blank("en")

    words, spaces, entity_spans = _parse_annotated_text(text)

    # Create the Doc from tokens
    doc = Doc(nlp.vocab, words=words, spaces=spaces)

    # Labels are passed as plain strings on the spans, so no "ner" component
    # has to be added to (or changed on) the caller's pipeline.
    doc.ents = [
        Span(doc, start_idx, end_idx, label=label)
        for start_idx, end_idx, label in entity_spans
    ]

    return doc


def visualize_annotated_text(text, nlp=None, style="ent", jupyter=True):
    """
    Convenience function to convert annotated text and visualize it with displaCy.

    Args:
        text (str): Text with annotations like "[Tom](PERSON) worked for [Microsoft](ORGANIZATION)"
        nlp (spacy.Language, optional): spaCy language model. If None, uses blank English model.
        style (str): displaCy style ("ent" or "dep")
        jupyter (bool): Whether to render for Jupyter notebook

    Returns:
        The return value of ``displacy.render`` (HTML string if not in Jupyter),
        or None (after printing an installation hint) if spaCy cannot be imported.
    """
    # Import displaCy explicitly and before any other work: the conversion below
    # already needs spaCy, so this is the only point where the installation hint
    # can describe the actual problem.
    try:
        from spacy import displacy
    except ImportError:
        print("spaCy not installed. Please install with: pip install spacy")
        return None

    doc = annotated_text_to_spacy_doc(text, nlp)
    return displacy.render(doc, style=style, jupyter=jupyter)


In [4]:
# API client; the key is the value read from the environment in an earlier cell.
client = OpenAI(api_key=OPENAI_API_KEY)

In [5]:
# NOTE(review): MODEL is not referenced in the visible cells -- the API call
# further down passes model="gpt-4o" directly. Confirm which model is intended.
MODEL = "gpt-4o-mini"
# Input text with entities marked up as [surface form](LABEL).
TEXT = "They marched from [Alexandria](LOCATION) through [Memphis](LOCATION) via the [Nile](LOCATION) to [Thebes](LOCATION)."
# Surface form of the entity to look up; later compared against ent.text.
ENTITY_TO_IDENTIFY = "Memphis"

In [6]:
# Prompt template; {entity} and {text} are filled in with str.format.
# The schema is given to the model as plain text only -- nothing in the visible
# cells validates the response against it.
prompt = """
Query the web to identify this entity in Wikidata.

{entity}

It is within the context of the following text:

{text}

Only return the JSON output, nothing else. Do so with the following schema:

class Entity(BaseModel):
    entity_text: str
    label: str
    wikidata_id: str
    sources: list[str]
"""

In [7]:
# Fill the template placeholders with the entity and the annotated text it occurs in.
formatted_prompt = prompt.format(entity=ENTITY_TO_IDENTIFY, text=TEXT)

In [8]:
# print() so the multi-line prompt is shown with real line breaks.
print(formatted_prompt)


Query the web to identify this entity in Wikidata.

Memphis

It is within the context of the following text:

They marched from [Alexandria](LOCATION) through [Memphis](LOCATION) via the [Nile](LOCATION) to [Thebes](LOCATION).

Only return the JSON output, nothing else. Do so with the following schema:

class Entity(BaseModel):
    entity_text: str
    label: str
    wikidata_id: str
    sources: list[str]



In [9]:

# Ask the model to identify the entity, with the web search tool enabled.
# NOTE(review): the model name is hard-coded here although MODEL is defined in
# the configuration cell -- confirm which one is intended.
response = client.responses.create(
    model="gpt-4o",
    input=formatted_prompt,
    tools=[{"type": "web_search"}],
)

# Text of the model's answer; it is parsed in a later cell.
output_text = response.output_text

In [10]:
# Show the raw answer; in the recorded run the JSON is wrapped in a Markdown code fence.
print(output_text)

```json
{
  "entity_text": "Memphis",
  "label": "Memphis",
  "wikidata_id": "Q5715",
  "sources": [
    "Wikidata entry for ancient capital of Inebu-hedj, Egypt",
    "UNT Digital Library ”Egypt – Giza Governorate – Memphis\" for Wikidata link Q5715"
  ]
}
```


In [None]:
def parse_json_with_sources(text):
    """
    Extract the JSON payload from a model response.

    The payload is expected inside a Markdown code fence tagged "json". If no
    such fence is present, the whole text is parsed as JSON instead.

    Args:
        text (str): Raw model output.

    Returns:
        tuple: ``(json_data, sources)`` where ``json_data`` is the parsed JSON
        value and ``sources`` is whatever text follows the closing fence
        (an empty string when there is no fence or nothing after it).

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    _, opening_fence, fenced = text.partition("```json")
    if not opening_fence:
        # No fenced block: treat the whole response as the JSON document.
        return json.loads(text), ""

    # partition() rather than a two-value split so that additional fences
    # after the payload do not break the unpacking.
    payload, _, sources = fenced.partition("```")
    return json.loads(payload), sources

# Parse the model's answer; json_output["wikidata_id"] is used further down
# to build the link shown in the entity label.
json_output, sources = parse_json_with_sources(output_text)
print(json_output)

{'entity_text': 'Memphis', 'label': 'Memphis', 'wikidata_id': 'Q5715', 'sources': ['Wikidata entry for ancient capital of Inebu-hedj, Egypt', 'UNT Digital Library ”Egypt – Giza Governorate – Memphis" for Wikidata link Q5715']}


In [16]:
from spacy import displacy
import spacy

In [19]:
# Display the annotated input text again for reference.
TEXT

'They marched from [Alexandria](LOCATION) through [Memphis](LOCATION) via the [Nile](LOCATION) to [Thebes](LOCATION).'

In [None]:
# NOTE(review): this runs the en_core_web_sm pipeline on TEXT with the
# [..](LABEL) markup still in it, and `doc` is reassigned in the next cell --
# confirm whether this cell is still needed.
nlp = spacy.load("en_core_web_sm")
doc = nlp(TEXT)

W0912 09:27:14.353000 43069 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [21]:
# Build the Doc from the markup itself (no nlp argument, so a blank English
# pipeline is used); this replaces any earlier `doc`.
doc = annotated_text_to_spacy_doc(TEXT)

In [22]:
# Render the entities taken from the markup.
displacy.render(doc, style="ent")

In [42]:
# One record per entity span whose text equals the entity that was looked up.
# Each record carries the span's character offsets and a label made of the
# original entity label followed by a link to the Wikidata page.
output_ents = [
    {
        "start": ent.start_char,
        "end": ent.end_char,
        "label": f'{ent.label_} <a href="https://www.wikidata.org/wiki/{json_output["wikidata_id"]}">{json_output["wikidata_id"]}</a>',
    }
    for ent in doc.ents
    if ent.text == ENTITY_TO_IDENTIFY
]


In [43]:
# Input for displaCy's manual mode: the document text plus the entity records
# (character offsets and label) collected in the previous cell.
dic_ents = {
    "text": doc.text,
    "ents": output_ents,
    "title": None
}

# manual=True: render from the dict above rather than from a Doc.
displacy.render(dic_ents, manual=True, style="ent")