In [1]:
from os import getcwd, path
import sys
import matplotlib.pyplot as plt

BASE_PATH = path.dirname(getcwd())
sys.path.append(BASE_PATH)

from common.utils import wordpunct_space_tokenize
from config import START_TAG, STOP_TAG, EMPTY_TAG

In [2]:
TRAIN_PATH = path.join(BASE_PATH, 'data/CoNLL-2003/eng.train')
print(TRAIN_PATH)

/Users/2359media/Documents/botbot-nlp/data/CoNLL-2003/eng.train


In [3]:
import io
import string

def read_conll_2003(filename, tag_idx=-1, space_token=EMPTY_TAG):
    all_data = []

    current_txt = []
    current_tags = []
    tagset = []

    fin = io.open(filename, 'r', encoding='utf-8', newline='\n', errors='ignore')
    for line in fin:
        line = line.strip()
        if len(line) > 0: # skip blank lines
            tmp = line.split(' ')
            if tmp[0] != '-DOCSTART-':
                current_txt.append(tmp[0])
                current_tags.append(tmp[tag_idx])
                tagset.append(tmp[tag_idx])
        else:
            if len(current_txt) > 0:
                line_txt = ''
                line_tags = []
                for idx in range(len(current_txt)):
                    tokens = wordpunct_space_tokenize(current_txt[idx])
                    if idx > 0:
                        line_txt += ' ' + current_txt[idx]
                        if current_tags[idx - 1] == current_tags[idx]:
                            line_tags.extend([current_tags[idx]] * (1 + len(tokens)))
                        else:
                            line_tags.append(space_token)
                            tagset.append(space_token)
                            line_tags.extend([current_tags[idx]] * len(tokens))
                    else:
                        line_txt += current_txt[idx]
                        line_tags.extend([current_tags[idx]] * len(tokens))
                all_data.append((line_txt, ' '.join(line_tags)))
                current_txt = []
                current_tags = []
    fin.close()

    tagset = list(set(tagset))
    tag_to_ix = {tag: key for key, tag in enumerate(tagset)}
    tag_to_ix[START_TAG] = len(tagset)
    tag_to_ix[STOP_TAG] = len(tagset) + 1

    print(tag_to_ix)
    print('Loaded %s sentences' % len(all_data))
    
    return tag_to_ix, all_data

In [4]:
tag_to_ix, training_data = read_conll_2003(TRAIN_PATH, 1)
result = []
for sentence, tag_seq in training_data:
    tokens_in = wordpunct_space_tokenize(sentence)
    assert len(tokens_in) == len(tag_seq.split(' '))
#     print(read_tags(tokens_in, tag_seq.split(' ')))

{'RP': 0, "''": 1, 'PRP': 2, '-': 3, 'POS': 21, 'NNPS': 4, 'NNS': 26, 'CD': 5, 'UH': 6, '<START>': 46, 'RBS': 7, 'RB': 8, 'JJ': 9, 'PRP$': 11, 'MD': 12, 'WDT': 14, 'VBP': 15, 'TO': 23, '(': 16, 'JJS': 17, 'NN': 18, ')': 19, 'WP$': 34, 'VBZ': 20, 'JJR': 22, 'FW': 24, '<STOP>': 47, 'RBR': 25, ',': 27, 'LS': 28, 'NN|SYM': 29, '$': 30, '"': 31, 'VBD': 32, 'DT': 33, '.': 36, 'SYM': 35, 'WP': 37, 'EX': 38, 'VBG': 13, 'VBN': 39, 'CC': 40, 'IN': 41, 'PDT': 42, 'WRB': 43, ':': 10, 'NNP': 44, 'VB': 45}
Loaded 14041 sentences


In [5]:
from entities_recognition.bilstm.model import SequenceTaggerWrapper
from entities_recognition.bilstm.train import SequenceTaggerLearner
from common.callbacks import PrintLoggerCallback, EarlyStoppingCallback

model = SequenceTaggerWrapper({'tag_to_ix': tag_to_ix, 'task': 'pos'})
learner = SequenceTaggerLearner(model)

In [None]:
learner.fit(
    training_data=training_data,
    epochs=50,
    callbacks=[
        PrintLoggerCallback(log_every=5),
        EarlyStoppingCallback()
    ]
)

In [None]:
SAVE_PATH = 
learner.save('pos_model_en.bin')

In [None]:
test_data = [
    'I live in Ho Chi Minh City, nice place, though my hometown is in Hanoi. I do miss it sometimes',
    'Trump’s role in midterm elections roils Republicans',
    'Kenya bans film about 2 girls in love because it’s ‘too hopeful’',
    'G.O.P. leaders and White House aides are trying to prepare President Trump for trouble in House and Senate races.'
]
for line in test_data:
    print('\n---\n' + line)
    print(model([line]))