In [1]:
import pandas as pd

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Name of the pretrained checkpoint handed to both `from_pretrained` calls below
# (judging by the name, an Icelandic NER model based on RoBERTa).
model_name_or_path = "m3hrdadfi/icelandic-ner-roberta"
# Load the tokenizer and the token-classification model from the same checkpoint
# so that the two stay in sync.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the NER pipeline from the model and tokenizer loaded in the previous cell.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

example = "Mörður hét maður er kallaður var gígja. Hann var sonur Sighvats hins rauða. Hann bjó á Velli á Rangárvöllum. Hann var ríkur höfðingi og málafylgjumaður mikill og svo mikill lögmaður að engir þóttu löglegir dómar dæmdir nema hann væri við."

# Tag the trial sentence; every item of the result carries an "entity" label,
# a "score" and the "start"/"end" character offsets into `example`.
ner_results = nlp(example)

# Keep only the items labelled as the beginning or the inside of a person name.
person_entities = [
    tagged for tagged in ner_results
    if tagged["entity"] in ("I-Person", "B-Person")
]

# Project the kept items into one list per output column.
starts = [tagged["start"] for tagged in person_entities]
ends = [tagged["end"] for tagged in person_entities]
# The surface text is sliced out of the input with the character offsets.
person_names = [example[begin:stop] for begin, stop in zip(starts, ends)]
person_types = [tagged["entity"] for tagged in person_entities]
scores = [tagged["score"] for tagged in person_entities]

df = pd.DataFrame(
    {
        "person_name": person_names,
        "types": person_types,
        "score": scores,
        "start": starts,
        "end": ends,
    }
)
df

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,person_name,types,score,start,end
0,Mörður,B-Person,0.999842,0,6
1,gí,B-Person,0.990288,33,35
2,gja,B-Person,0.573237,35,38
3,Sighv,B-Person,0.999778,55,60
4,ats,I-Person,0.98895,60,63
5,rauða,I-Person,0.715451,69,74


In [4]:
# Concatenate the name pieces that were split into several consecutive rows.
#
# Two rows are treated as pieces of the same word when the "end" offset of one
# equals the "start" offset of the next. A whole run of such rows is merged at
# once, so a name split into three or more pieces is rebuilt completely instead
# of producing overlapping two-piece fragments.


def _join_adjacent_pieces(names, starts, ends):
    """Merge runs of directly adjacent name pieces.

    Parameters
    ----------
    names : list of str
        Surface text of each piece, in document order.
    starts, ends : list of int
        Character offsets of each piece; piece ``i`` and piece ``i + 1`` are
        adjacent when ``ends[i] == starts[i + 1]``.

    Returns
    -------
    (extra, full) : tuple of two lists aligned with ``names``
        For the first piece of every run of two or more adjacent pieces,
        ``extra`` holds the concatenation of the remaining pieces of the run
        and ``full`` holds the concatenation of the whole run. Every other
        position holds ``pd.NA``.
    """
    count = len(names)
    extra = [pd.NA] * count
    full = [pd.NA] * count

    pos = 0
    while pos < count:
        # Extend the run for as long as each piece touches the next one.
        run_end = pos
        while run_end + 1 < count and ends[run_end] == starts[run_end + 1]:
            run_end += 1

        if run_end > pos:
            extra[pos] = "".join(names[pos + 1:run_end + 1])
            full[pos] = names[pos] + extra[pos]

        # Continue after the run so its pieces are not merged a second time.
        pos = run_end + 1

    return extra, full


# Work on plain lists so the scan is positional and does not depend on the
# labels of df's index.
extra_names, full_names = _join_adjacent_pieces(
    df["person_name"].tolist(),
    df["start"].tolist(),
    df["end"].tolist(),
)

extraname = pd.Series(extra_names, index=df.index, dtype="string")

df["extra_name"] = extraname
df["full_name"] = pd.Series(full_names, index=df.index, dtype="string")

df

  extraname = pd.Series()


Unnamed: 0,person_name,types,score,start,end,extra_name,full_name
0,Mörður,B-Person,0.999842,0,6,,
1,gí,B-Person,0.990288,33,35,gja,gígja
2,gja,B-Person,0.573237,35,38,,
3,Sighv,B-Person,0.999778,55,60,ats,Sighvats
4,ats,I-Person,0.98895,60,63,,
5,rauða,I-Person,0.715451,69,74,,
