In [25]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import json
import pandas as pd
import re

In [6]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [127]:
text = "RGU is a university located in Aberdeen, south east part of Scotland"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

In [143]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "RGU is a university located in Aberdeen, south east part of Scotland"

In [332]:
ner_results = nlp(example)
counter = 0
for index in range(len(ner_results)):
    
    ner_results[index]["parentEntity"] = ner_results[index]["entity"][-3:]
    if ner_results[index]["entity"][0] == "B":
        ner_results[index]["beginning"] = True
        counter += 1
    else:
        ner_results[index]["beginning"] = False
    ner_results[index]["groupNER"] = counter

df = pd.DataFrame(ner_results)
df

Unnamed: 0,entity,score,index,word,start,end,parentEntity,beginning,groupNER
0,B-ORG,0.997546,1,R,0,1,ORG,True,1
1,I-ORG,0.973958,2,##G,1,2,ORG,False,1
2,I-ORG,0.978134,3,##U,2,3,ORG,False,1
3,B-LOC,0.995686,9,Aberdeen,31,39,LOC,True,2
4,B-LOC,0.999581,15,Scotland,60,68,LOC,True,3


In [333]:
# df = df.groupby(['groupNER']).aggregate({'start':'min','end':'max'}).reset_index()
# df = df.sort_values(by=['start']).reset_index()
df['word'] = df.groupby(['groupNER'])['word'].transform(lambda x: ' '.join(x))
df['word'] = df['word'].apply(lambda x: x.replace('#',''))
df['word'] = df['word'].apply(lambda x: x.replace(' ',''))
# df = df.groupby(['groupNER','parentEntity','word','beginning']).aggregate({'start':'min','end':'max'}).reset_index()
df = df[df['beginning']==True]
df = df[["parentEntity","word"]]
df

Unnamed: 0,parentEntity,word
0,ORG,RGU
3,LOC,Aberdeen
4,LOC,Scotland


In [20]:
def postProcessing(input):
        tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

        languageModel = pipeline("ner", model=model, tokenizer=tokenizer)
        ner_results = languageModel(input)

        counter = 0
        for index in range(len(ner_results)):
            ner_results[index]["parentEntity"] = ner_results[index]["entity"][-3:]
            if ner_results[index]["entity"][0] == "B":
                ner_results[index]["beginning"] = True
                counter += 1
            else:
                ner_results[index]["beginning"] = False
            ner_results[index]["groupNER"] = counter

        df = pd.DataFrame(ner_results)
        df['word'] = df.groupby(['groupNER'])['word'].transform(lambda x: ' '.join(x))
        df['word'] = df['word'].apply(lambda x: x.replace('#',''))
        df['word'] = df['word'].apply(lambda x: x.replace(' ',''))
        
        df = df[df['beginning']==True]
        df = df[["parentEntity","word"]]
        js = df.to_json(orient = "records")
        return js

In [34]:
def highlightWord(js,text):
    
    json_data = json.loads(js)
    html = '<p>'
    for word in text.split():
        highlighted = False
        for item in json_data:
            if item['word'] == word:
                html += f'<mark>{word}</mark> '
                highlighted = True
                break
        if not highlighted:
            html += f'{word} '
    html += '</p>'
    return html

In [35]:
input = "RGU is a university located in Aberdeen, south east part of Scotland"
js = postProcessing(input)
html = highlightWord(js,input)

In [36]:
js

'[{"parentEntity":"ORG","word":"RGU"},{"parentEntity":"LOC","word":"Aberdeen"},{"parentEntity":"LOC","word":"Scotland"}]'

In [37]:
html

'<p><mark>RGU</mark> is a university located in Aberdeen, south east part of <mark>Scotland</mark> </p>'

In [119]:
# df["groupStart"] =  df["start"]
df["beginning"] = df["entity"].apply(lambda x: True if x[0] == "B" else False)
# df["groupStart"] = df.apply(lambda x: groupStart(x["start"],x["beginning"],x["groupStart"]),axis=1)
df["parentEntity"] = df["entity"].apply(lambda x:x[-3:])
df["score"]=df["score"].astype("float64")
df["entity"]=df["entity"].astype("string")
df["word"]=df["parentEntity"].astype("string")
df["parentEntity"]=df["parentEntity"].astype("string")
df.dtypes

Unnamed: 0,entity,score,index,word,start,end,beginning,parentEntity
0,B-ORG,0.997546,1,ORG,0,1,True,ORG
1,I-ORG,0.973958,2,ORG,1,2,False,ORG
2,I-ORG,0.978134,3,ORG,2,3,False,ORG
3,B-LOC,0.995686,9,LOC,31,39,True,LOC
4,B-LOC,0.999581,15,LOC,60,68,True,LOC


In [120]:
js = df.to_json(orient = "records")

In [123]:
type(js)

str