# Named Entity Recognition
A simple example of how to use the vector database to link named entities to Wikidata items.

In [26]:
import requests

# Request settings used by the API calls in this notebook.
HEADERS = {"User-Agent": "NER/1.0 (embeddings@wikimedia.de)"}
LIMIT = 50   # sent as the `limit` parameter of the keyword search
LANG = "en"  # language code sent with the keyword search and the vector database requests

# The entity to link to Wikidata, along with the statement in which it appears.
# NOTE(review): the title inside `statement` has a closing double quote but no
# opening one; confirm whether that is intended before changing the query text.
named_entity = "John Smith"
statement = 'Man Playing Flute" is a mezzotint engraving created by John Smith in the late 17th century'

### Get potential items from Wikidata that refer to the named entity

In [27]:
# Search for candidates with keyword search.
# `candidates` ends up holding the parsed JSON reply of the `wbsearchentities` call.

params = {
    "action": "wbsearchentities",
    "format": "json",
    "type": "item",
    "language": LANG,
    "search": named_entity,
    "limit": LIMIT
}
search_response = requests.get(
    "https://www.wikidata.org/w/api.php",
    params=params,
    headers=HEADERS,
    timeout=30,  # seconds; requests applies no timeout by default and could otherwise wait indefinitely
)
# Fail here on an HTTP error status rather than on a confusing JSON decoding error below.
search_response.raise_for_status()
candidates = search_response.json()

# The MediaWiki Action API can report a failed request inside the JSON body
# (under an "error" key) even when the HTTP status is 200.
if "error" in candidates:
    raise RuntimeError(f"Wikidata keyword search failed: {candidates['error']}")

### Score each candidate with vector similarity

In [28]:
# Use vector similarity score with the original statement to rank candidates.
# `candidate_scores` ends up holding the parsed JSON reply of the similarity-score endpoint.

candidate_ids = [candidate['id'] for candidate in candidates.get('search', [])]
if not candidate_ids:
    # Without this guard an empty `qid` parameter would be sent to the service.
    raise ValueError("Keyword search returned no candidates to score.")

params = {
    "query": statement,
    "qid": ','.join(candidate_ids),  # all candidate QIDs in one comma-separated parameter
    "lang": LANG
}
score_response = requests.get(
    'https://wd-vectordb.wmcloud.org/similarity-score',
    params=params,
    headers=HEADERS,
    timeout=30,  # seconds; requests applies no timeout by default and could otherwise wait indefinitely
)
# Fail here on an HTTP error status rather than on a confusing JSON decoding error below.
score_response.raise_for_status()
candidate_scores = score_response.json()

In [29]:
# Look up the keyword-search candidate that matches the first entry of `candidate_scores`.
# NOTE(review): this assumes the scores are returned best-first; confirm against the API.
if not candidate_scores:
    raise ValueError("The similarity-score endpoint returned no scores.")
highest_score_qid = candidate_scores[0]['QID']
highest_score_candidate = next((c for c in candidates['search'] if c["id"] == highest_score_qid), None)
if highest_score_candidate is None:
    # Explicit error instead of the opaque "'NoneType' object is not subscriptable" below.
    raise LookupError(f"{highest_score_qid} is not among the keyword-search candidates.")

print(highest_score_candidate['label'])
print(highest_score_candidate['id'])

John Smith
Q6258328


### Why do we need keyword search?

In [None]:
# When the full vector search is queried with the whole statement, the results
# focus on the context rather than on the entity name.
# In this example, we get items relating to the art piece rather than the artist "John Smith".
# Keyword search is therefore necessary to narrow the candidates down to those that match
# the entity name; we can then rank them by vector similarity while considering the
# context of the full statement.

items_response = requests.get(
    'https://wd-vectordb.wmcloud.org/item/query',
    params={
        'query': statement,
        'lang': LANG,
        'rerank': True
    },
    headers=HEADERS,
    timeout=30,  # seconds; requests applies no timeout by default and could otherwise wait indefinitely
)
# Fail here on an HTTP error status rather than on a confusing JSON decoding error below.
items_response.raise_for_status()
items = items_response.json()
items

[{'QID': 'Q26693267',
  'similarity_score': 0.74336195,
  'rrf_score': 0.0196078431372549,
  'source': 'Vector Search',
  'reranker_score': 0.45132649},
 {'QID': 'Q15994751',
  'similarity_score': 0.67061293,
  'rrf_score': 0.011363636363636364,
  'source': 'Vector Search',
  'reranker_score': 0.30074561},
 {'QID': 'Q6258270',
  'similarity_score': 0.6875923,
  'rrf_score': 0.016666666666666666,
  'source': 'Vector Search',
  'reranker_score': 0.22405544},
 {'QID': 'Q6232556',
  'similarity_score': 0.6992055,
  'rrf_score': 0.017857142857142856,
  'source': 'Vector Search',
  'reranker_score': 0.22270013},
 {'QID': 'Q21460534',
  'similarity_score': 0.6740238,
  'rrf_score': 0.012658227848101266,
  'source': 'Vector Search',
  'reranker_score': 0.22135067},
 {'QID': 'Q16239552',
  'similarity_score': 0.66782826,
  'rrf_score': 0.010752688172043012,
  'source': 'Vector Search',
  'reranker_score': 0.20181322},
 {'QID': 'Q29789017',
  'similarity_score': 0.67584056,
  'rrf_score': 0.0133