# NER vs LLM

---

### Configure OLLama

Install OLLama

Open terminal and type:

`ollama run phi3:3.8b`



In [1]:
import json
import requests

# This code returns an empty string...

url = "http://localhost:11434/api/chat" 

question = "Hi. Can you help me?"

payload = {
    "model": "phi3:3.8b",
    "message":[{"role":"user","content": question}],
    "stream": False,
}

response = requests.post(url, json=payload)

# Parse the response
if response.status_code == 200:
    print("Response:\n", response.json()["message"]["content"])
else:
    print("Error:\n", response.status_code)

Response:
 


In [2]:
url = "http://localhost:11434/api/generate" 

def prompt_ollama(text):
    payload = {
        "model": "phi3:3.8b",
        "prompt": text,
        "context": [1],
        "options":{
            "top_k": 10,
            "temperature": 0
        },
        "stream": False,
    }

    response = requests.post(url, data=json.dumps(payload))
    
    # Parse the response
    if response.status_code == 200:
        return response.json()["response"]

    return None

In [3]:
prompt_ollama("Identify so called named entities in this sentence: \'Kraków jest największym miastem w Polsce\'. After that list out the entities in a form of strings in a python list. I want your response to only include the python list without any additional code or \"```\" characters. Be careful to use the polish form of the words.")

"['Kraków', 'Polska']\n"

### Take 1000 passages from fiqa corpus

In [4]:
from datasets import load_dataset
from numpy.random import choice

fiqa_corpus = load_dataset("clarin-knext/fiqa-pl", "corpus")["corpus"]

fiqa_corpus = fiqa_corpus["text"]
fiqa_idx = choice(len(fiqa_corpus), 100, replace=False)
fiqa_corpus = [fiqa_corpus[i] for i in fiqa_idx]

---

### NER baseline

In [5]:
import spacy

nlp = spacy.load("pl_core_news_sm")

def get_ents(text):
    doc = nlp(text)
    entity_dict = {}
    for ent in doc.ents:
        text, label = ent.text, ent.label_
        if (text, label) not in entity_dict.keys():
            entity_dict[(text, label)] = 0
        entity_dict[(text, label)] += 1
    return entity_dict

In [6]:
prompt = (
    "Named entities are real-world objects, such as a people, locations, organizations, products, etc., that can be denoted with a proper name.\n"
    "List out the named entities in the Polish text that I'm providing.\n"
    "Precise categories of named entities that you should recognize:\n"
    "- date\n"
    "- geogName\n"
    "- orgName\n"
    "- persName\n"
    "- placeName\n"
    "- time\n\n"
    "The text is written in Polish so do not translate the entities to english.\n"
    "Use the neutral polish nominative case to list out entities\n"
    "The output should be a list of tuples consisting of named entity and it's category\n"
    "The output should look as follows:\n"
    "[('Name1', 'categoryName1'), ('Name2', 'categoryName1')] \n\n"
    "Output the list string only. Do not add any additional characters, translation or information.\n"
    "In case of no entities found, return an empty list and nothing else.\n\n"
    "List out the named entities in the privded text:\n"
)

prompt_few_shot = (
    "Named entities are real-world objects, such as a people, locations, organizations, products, etc., that can be denoted with a proper name.\n"
    "List out the named entities in the Polish text that I'm providing.\n"
    "Precise categories of named entities that you should recognize:\n"
    "- date\n"
    "- geogName\n"
    "- orgName\n"
    "- persName\n"
    "- placeName\n"
    "- time\n\n"
    "The text is written in Polish so do not translate the entities to english.\n"
    "Use the neutral polish nominative case to list out entities\n"
    "The output should be a list of tuples consisting of named entity and it's category\n"
    "The output should look as follows:\n"
    "[('Name1', 'categoryName1'), ('Name2', 'categoryName1')]\n\n"
    "Output the list string only. Do not add any additional characters, translation or information.\n"
    "In case of no entities found, return an empty list and nothing else.\n\n"

    "For a given sentence 'George Washington był prezydentem Stanów Zjednoczonych' the output should be:\n"
    "[('George Washington','persName'),('Stany Zjednoczone','geogName')]\n"
    "For a given sentence 'Artur Rojek nie jest już wokalistą Myslovitz' the output should be:\n"
    "[('Artur Rojek', 'persName'),('Myslovitz', 'orgName')] \n\n"
    
    "List out the named entities in the privded text:\n"
)

example_text = "Wczoraj w Krakowie miało miejsce spotkanie prezydentów Polski i Stanów Zjednoczonych."

In [7]:
from ast import literal_eval

# We use spacy to lemmatize if possible as the llm seems to be oblivious to the intricacy of case declension
def get_llm_ents(text, few_shot = False):
    p = prompt_few_shot if few_shot else prompt
    response = prompt_ollama(p + text)
    
    entity_dict = {}
    for entity, label in literal_eval(response):
        ents_tmp = list(nlp(entity).sents)
        if len(ents_tmp):
            lemma = " ".join([l.lemma_ for l in ents_tmp[0]])
        else:
            lemma = entity
        if (lemma, label) not in entity_dict.keys():
            entity_dict[(lemma, label)] = 0
        entity_dict[(lemma, label)] += 1
    return entity_dict

In [8]:
get_llm_ents(example_text)

{('wczoraj', 'time'): 1,
 ('Krakowie', 'placeName'): 1,
 ('prezydent', 'orgName'): 1,
 ('Polska', 'country'): 1,
 ('Stany Zjednoczone', 'orgName'): 1}

---

### Compare spaCy to PHI-3

In [9]:
from tqdm import tqdm

fiqa_spacy = [get_ents(text) for text in fiqa_corpus]

In [10]:
fiqa_llm = [get_llm_ents(text) for text in fiqa_corpus]

SyntaxError: unterminated string literal (detected at line 1) (<unknown>, line 1)

In [None]:
fiqa_llm_few_shot = [get_llm_ents(text,few_shot=True) for text in tqdm(fiqa_corpus)]

In [None]:
from pathlib import Path
import pickle

save_path = Path("data")
save_path.mkdir(exist_ok=True,parents=True)

with open(save_path / 'fiqa_spacy.pkl', 'wb') as fp:
    pickle.dump(fiqa_spacy, fp)

with open(save_path / 'fiqa_llm.pkl', 'wb') as fp:
    pickle.dump(fiqa_llm, fp)

with open(save_path / 'fiqa_llm_few_shot.pkl', 'wb') as fp:
    pickle.dump(fiqa_llm_few_shot, fp)

In [None]:
def get_ent_cat_count(entity_dict_list):
    entities, categories = {}, {}
    for d in entity_dict_list:
        for (entity,category), count in d.items():
            if entity not in entities.keys():
                entities[entity] = 0
            if category not in categories.keys():
                categories[category] = 0
            entities[entity] += count
            categories[category] += count
    entities = sorted(list(entities.items()),key=lambda x: x[1],reverse=True)
    categories = sorted(list(categories.items()),key=lambda x: x[1],reverse=True)
    return entities, categories


In [None]:
spacy_entities, spacy_categories = get_ent_cat_count(fiqa_spacy)
llm_entities, llm_categories = get_ent_cat_count(fiqa_llm)
few_shot_entities, few_shot_categories = get_ent_cat_count(fiqa_llm_few_shot)

In [None]:
spacy_categories

In [None]:
llm_categories

In [None]:
llm_entities