In [3]:
import pandas as pd

In [4]:
# Load the test set: ';'-separated, decoded as latin1. Column names are
# supplied here, so the first line of the file is read as data, not a header.
# Missing cells are replaced with empty strings right away.
df = pd.read_csv(
    "./data/Test2NER.csv",
    sep=";",
    names=["Sentence #", "Word"],
    encoding="latin1",
).fillna('')

In [5]:
def fill_sentence_numbers(df):
    """Add a ``Sentence_num`` column with each row's integer sentence number.

    The ``"Sentence #"`` column carries a label such as ``"Sentence: 9901"``
    only on the first row of a sentence; the following rows are blank. This
    walks the column top to bottom and carries the most recent label forward.

    Args:
        df: DataFrame with a ``"Sentence #"`` column. It is modified in place:
            a ``Sentence_num`` column (int64) is added or overwritten.

    Returns:
        None.

    Raises:
        ValueError: if a blank row appears before any label, or a label
            cannot be parsed as an integer.
    """
    new_sentence_nums = []
    current_num = None
    for label in df["Sentence #"].values:
        # Treat NaN/None the same as '' so the function also works when the
        # caller has not replaced missing values beforehand.
        if pd.isna(label) or label == '':
            if current_num is None:
                raise ValueError(
                    "the first row has no sentence label to carry forward"
                )
        elif isinstance(label, str):
            # Parse once per new label rather than once per row.
            current_num = int(label.replace("Sentence: ", ""))
        else:
            # Label was already read as a number (e.g. a purely numeric column).
            current_num = int(label)
        new_sentence_nums.append(current_num)
    # Explicit dtype keeps the column int64 even for an empty frame.
    df["Sentence_num"] = pd.Series(new_sentence_nums, dtype="int64").values
        
# Add the Sentence_num column to df (the function mutates its argument),
# then preview the first 20 rows.
fill_sentence_numbers(df)
df.head(n=20)

Unnamed: 0,Sentence #,Word,Sentence_num
0,Sentence: 9901,The,9901
1,,International,9901
2,,Committee,9901
3,,of,9901
4,,the,9901
5,,Red,9901
6,,Cross,9901
7,,says,9901
8,,84,9901
9,,people,9901


In [6]:
# Rebuild one space-joined string per sentence from the one-word-per-row frame.
# Only the "Word" column is aggregated, so the group function never touches
# the grouping column (pandas deprecated applying over the grouping columns).
# sort=False keeps sentences in order of first appearance in df; the later
# cell that writes predictions back walks df by row position, so the sentence
# order must follow the row order even if the numbers are not ascending.
sentences = df.groupby('Sentence_num', sort=False)["Word"].agg(" ".join)
sentences

Sentence_num
9901     The International Committee of the Red Cross s...
9902     Another 18 people have been killed in the sout...
9903     The violence has forced scores of northern Mus...
9904     Mr. Garang led the mostly Christian and animis...
9905     He was sworn in as vice president just three w...
                               ...                        
10336    U.S. private employers added 71 jobs in July ,...
10337    A report Friday from the Labor Department show...
10338    Economists closely watch the private payrolls ...
10339    Employment is also a big indicator of consumer...
10340    The country lost jobs overall in July , due in...
Length: 440, dtype: object

In [7]:
from flair.models import SequenceTagger
from flair.data import Sentence

# Load the trained tagger from the local checkpoint file.
model = SequenceTagger.load('final-model.pt')


def _tag_sentence(text):
    """Run the tagger on one sentence string and return its tagged-string form."""
    # use_tokenizer=False: presumably the text is treated as already split on
    # spaces -- confirm against the installed flair version.
    flair_sentence = Sentence(text=text, use_tokenizer=False)
    # The return value is not used; the result is read back from the same
    # Sentence object on the next line.
    model.predict(flair_sentence)
    return flair_sentence.to_tagged_string()


# One tagged string per reconstructed sentence, in the same order as `sentences`.
tagged_sentences = [_tag_sentence(text) for text in sentences]

2020-10-06 22:59:17,632 loading file final-model.pt


In [8]:
import re
# A tag token in the tagged strings looks like "<B-org>" or "<I-org>".
# [BI] is the intended character class ("[B|I]" also accepted a literal '|'),
# and fullmatch requires the whole token to be a tag, not just its prefix.
ner_regex = re.compile(r'<[BI]-.+>')

# set all predictions to O (this also resets the column when the cell is re-run)
df["Predicted"] = "O"
# Look the column up by name instead of hard-coding its position, so the
# write below stays correct if columns are added or reordered.
predicted_col = df.columns.get_loc("Predicted")

# row_index counts the words consumed so far; a tag token always annotates the
# word just before it, i.e. row (row_index - 1).
row_index = 0
for tagged_sentence in tagged_sentences:
    for token in tagged_sentence.split(" "):
        if ner_regex.fullmatch(token) is None:
            # plain word: move on to the next DataFrame row
            row_index += 1
        elif row_index == 0:
            # Without this guard, index -1 would silently tag the LAST row.
            raise ValueError(f"tag {token!r} appears before any word")
        else:
            # set previous word tag (strip the surrounding '<' and '>')
            df.iat[row_index - 1, predicted_col] = token[1:-1]
print(row_index)
# Every row must have been visited exactly once, otherwise tags are misaligned.
assert row_index == len(df), (
    f"consumed {row_index} words but df has {len(df)} rows; predictions are misaligned"
)

9878


In [9]:
# Number of rows in df; presumably compared by eye with the word count printed
# by the alignment cell above.
print(df.shape[0])

9878


In [11]:
# Preview the first 50 rows, now including the Predicted column.
df.head(n=50)

Unnamed: 0,Sentence #,Word,Sentence_num,Predicted
0,Sentence: 9901,The,9901,B-org
1,,International,9901,I-org
2,,Committee,9901,I-org
3,,of,9901,O
4,,the,9901,O
5,,Red,9901,B-org
6,,Cross,9901,I-org
7,,says,9901,O
8,,84,9901,O
9,,people,9901,O


In [12]:
# Export the predictions as a ';'-separated file without the index.
# Only three columns are written (Sentence_num is left out), and
# "Sentence #" is written under the header "Sentences".
# NOTE(review): the input was read as latin1, but no encoding is passed here,
# so pandas' default is used -- confirm that is what consumers expect.
df.to_csv(
    "Test2NER_Prediction.csv",
    sep=";",
    index=False,
    columns=["Sentence #", "Word", "Predicted"],
    header=["Sentences", "Word", "Predicted"],
)