In [1]:
import pandas as pd

In [8]:
# Load the token-per-row NER file: semicolon-separated, no header row, latin1.
# keep_default_na=False stops pandas from turning real word tokens such as
# "None", "NA", "null" or "nan" into missing values; without it those words
# would be blanked out below and the token count of their sentence would no
# longer match the number of rows. Empty fields are read as '' directly.
# The trailing fillna('') is kept as a backstop so no NaN can reach the
# string handling in later cells.
df = pd.read_csv(
    "./data/Test1NER.csv",
    sep=";",
    names=["Sentence #", "Word", "Pos"],
    encoding="latin1",
    keep_default_na=False,
).fillna('')

In [9]:
def fill_sentence_numbers(df):
    """Add an integer ``Sentence_num`` column to ``df`` (modified in place).

    The ``Sentence #`` column carries a label of the form ``"Sentence: <n>"``
    on some rows and is blank on the others. Every row receives the number of
    the most recent label at or above it.

    Args:
        df: DataFrame with a ``Sentence #`` column. Blank cells may be either
            the empty string or a missing value (NaN/None).

    Returns:
        None. The result is written to ``df["Sentence_num"]``.

    Raises:
        ValueError: If a label does not reduce to an integer once the
            ``"Sentence: "`` prefix is removed, or if blank rows appear
            before the first label.
    """
    sentence_nums = []
    current_num = None
    for label in df["Sentence #"].values:
        # Check for missing values first so the string comparison is never
        # evaluated on NaN/NA.
        if not pd.isna(label) and label != '':
            # Parse only when a new label starts; blank rows reuse the result.
            current_num = int(label.replace("Sentence: ", ""))
        elif current_num is None:
            raise ValueError(
                "'Sentence #' is blank before the first sentence label"
            )
        sentence_nums.append(current_num)
    df["Sentence_num"] = sentence_nums
        
# Populate df["Sentence_num"] in place, then preview the first 20 rows to
# confirm that the blank "Sentence #" cells inherited the right number.
fill_sentence_numbers(df)
df.iloc[:20]

Unnamed: 0,Sentence #,Word,Pos,Sentence_num
0,Sentence: 9001,In,IN,9001
1,,2005,CD,9001
2,,",",",",9001
3,,Zambia,NNP,9001
4,,qualified,VBD,9001
5,,for,IN,9001
6,,debt,NN,9001
7,,relief,NN,9001
8,,under,IN,9001
9,,the,DT,9001


In [10]:
# Rebuild one space-joined string per sentence from its token rows.
# - Only the "Word" column is aggregated, so the grouping column is never
#   handed to the callback (avoids the groupby.apply deprecation warning in
#   recent pandas).
# - sort=False keeps sentences in order of first appearance in the file
#   instead of sorting by number; the prediction write-back walks the rows in
#   file order and needs the sentences in that same order.
sentences = df.groupby('Sentence_num', sort=False)["Word"].agg(" ".join)
sentences

Sentence_num
9001    In 2005 , Zambia qualified for debt relief und...
9002    Poverty remains a significant problem in Zambi...
9003    Zambia 's dependency on copper makes it vulner...
9004    A high birth rate , relatively high HIV / AIDS...
9005    Slovakia 's roots can be traced to the 9th cen...
                              ...                        
9896    Oil is priced in dollars , so a lower value te...
9897    Soaring oil prices have pushed U.S. retail gas...
9898    The latest survey puts the average price at ne...
9899    That is an increase of 30 cents a liter in the...
9900    The death toll from violence in Sudan followin...
Length: 900, dtype: object

In [14]:
from flair.models import SequenceTagger
from flair.data import Sentence

# Load the trained tagger from the checkpoint in the working directory.
model = SequenceTagger.load('final-model.pt')


def _tag_text(text):
    """Tag one space-joined sentence and return flair's tagged-string form."""
    # The text is already tokenised (one token per data row), so flair's own
    # tokenizer is switched off.
    flair_sentence = Sentence(text=text, use_tokenizer=False)
    # predict() annotates the Sentence object in place.
    model.predict(flair_sentence)
    return flair_sentence.to_tagged_string()


# One tagged string per input sentence, in the same order as `sentences`.
tagged_sentences = [_tag_text(text) for text in sentences]

2020-10-06 22:14:52,542 loading file final-model.pt


In [15]:
import re
# A tag token looks like <B-geo> or <I-org>. The class is [BI] (the original
# [B|I] also accepted a literal '|') and the pattern is applied with
# fullmatch so the whole token has to be a tag.
ner_regex = re.compile(r'<[BI]-.+>')

# Start with every row predicted as "O" (outside any entity).
df["Predicted"] = "O"
# Look the column position up by name rather than hard-coding it, so the
# write below still hits "Predicted" if columns are added or reordered.
predicted_col = df.columns.get_loc("Predicted")

# row_index counts the word tokens seen so far across all sentences; it is
# assumed to line up with df's row order (one row per word token).
row_index = 0
for tagged_sentence in tagged_sentences:
    for token in tagged_sentence.split(" "):
        if ner_regex.fullmatch(token) is None:
            # Ordinary word: move on to the next data row.
            row_index += 1
        elif row_index > 0:
            # A tag follows the word it labels, so it belongs to the previous
            # row. The guard keeps a leading tag from wrapping around to the
            # last row through index -1.
            df.iat[row_index - 1, predicted_col] = token[1:-1]
# Should equal len(df); a different value means tokens and rows drifted apart.
print(row_index)

['In', '2005', '<B-time>', ',', 'Zambia', '<B-gpe>', 'qualified', 'for', 'debt', 'relief', 'under', 'the', 'Highly', '<B-org>', 'Indebted', '<I-org>', 'Poor', '<I-org>', 'Country', '<I-org>', 'Initiative', '<I-org>', ',', 'consisting', 'of', 'approximately', 'USD', '6', 'billion', 'in', 'debt', 'relief', '.']
['Poverty', 'remains', 'a', 'significant', 'problem', 'in', 'Zambia', '<B-geo>', ',', 'despite', 'a', 'stronger', 'economy', '.']
['Zambia', '<B-geo>', "'s", 'dependency', 'on', 'copper', 'makes', 'it', 'vulnerable', 'to', 'depressed', 'commodity', 'prices', ',', 'but', 'record', 'high', 'copper', 'prices', 'and', 'a', 'bumper', 'maize', 'crop', 'in', '2010', '<B-time>', 'helped', 'Zambia', '<B-geo>', 'rebound', 'quickly', 'from', 'the', 'world', 'economic', 'slowdown', 'that', 'began', 'in', '2008', '<B-time>', '.']
['A', 'high', 'birth', 'rate', ',', 'relatively', 'high', 'HIV', '/', 'AIDS', 'burden', ',', 'and', 'market', 'distorting', 'agricultural', 'policies', 'have', 'meant

['Croatia', '<B-geo>', "'s", 'EU', '<B-org>', 'membership', 'negotiations', 'began', 'in', 'October', '<B-time>', 'after', 'an', 'eight-month', '<B-time>', 'delay', 'due', 'to', 'the', 'country', "'s", 'failure', 'to', 'capture', 'a', 'top', 'suspect', 'wanted', 'by', 'the', 'United', '<B-org>', 'Nations', '<I-org>', 'war', 'crimes', 'tribunal', '.']
['Mr.', '<B-per>', 'Chirac', '<I-per>', 'says', 'Croatia', '<B-geo>', "'s", 'rapid', 'economic', 'development', 'and', 'progress', 'in', 'reforms', 'justified', 'the', 'start', 'of', 'the', 'country', "'s", 'membership', 'negotiations', 'with', 'the', 'bloc', '.']
['Italian', '<B-gpe>', 'police', 'have', 'arrested', '36', 'people', 'during', 'protests', 'in', 'Rome', '<B-geo>', 'ahead', 'of', 'the', 'summit', 'of', 'leaders', 'of', 'the', 'world', "'s", 'eight', 'major', 'industrial', 'countries', '.']
['Hooded', 'demonstrators', 'clashed', 'with', 'police', 'and', 'set', 'fires', 'on', 'streets', 'near', 'one', 'of', 'the', 'capital', "'s

In [16]:
# Number of data rows, for comparison with the row_index printed by the
# alignment loop.
print(df.shape[0])

19402


In [17]:
# Inspect the last 50 rows to check that predictions reach the end of the data.
df.iloc[-50:]

Unnamed: 0,Sentence #,Word,Pos,Sentence_num,Predicted
19352,,at,IN,9898,O
19353,,nearly,RB,9898,O
19354,,$,$,9898,O
19355,,42743,CD,9898,O
19356,,a,DT,9898,O
19357,,liter,NN,9898,O
19358,,(,LRB,9898,O
19359,,$,$,9898,O
19360,,42826,CD,9898,O
19361,,a,DT,9898,O


In [19]:
# Write the token rows with their predicted tags as a semicolon-separated
# file; the helper column Sentence_num is left out and two columns are renamed
# for the output header.
# NOTE(review): the input was read as latin1 but this file is written with
# the pandas default encoding — confirm that is what downstream expects.
(
    df[["Sentence #", "Word", "Pos", "Predicted"]]
    .rename(columns={"Sentence #": "Sentences", "Pos": "POS"})
    .to_csv("Test1NER_Prediction.csv", sep=";", index=False)
)