# Training an entity recognition model

Importing the required code files

In [1]:
from os import getcwd, path
import sys

# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a sub-directory of it).
BASE_PATH = path.dirname(getcwd())

# Put the project root on the import path. The membership check keeps
# `sys.path` free of duplicate entries when this cell is re-run.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

# Must stay below the `sys.path` update: `config` is expected to be found
# through BASE_PATH.
from config import START_TAG, STOP_TAG

In [2]:
# Show the directory that was appended to the import path.
print(f"{BASE_PATH}")

/Users/2359media/Documents/botbot-nlp


The training data must be an array that:
- Contains tuples of (sentence, tags)
- Sentences will be split using nltk.wordpunct_tokenize
- Tags will be split using `.split()`, i.e. on whitespace by default

Each entity must be tagged using 3 kinds of tag: B- (Begin), I- (Inside) and O (Outside; written as `-` in the sample data below for readability)

_This is to help with separation in the case of consecutive entities_

A `dictionary` that translates these tags into consecutive indices must be defined.
This dictionary will contain:
- The empty token
- `START_TAG` and `STOP_TAG` tokens (imported from global configs - used internally to indicate start and end of sentence)
- Entities B-, I-, O- tokens

**Sample training data for email recognition:**

In [3]:
# Sample data for e-mail recognition. Every sample pairs a sentence with a
# space-separated tag string: '-' marks a token outside any entity, 'B-EMAIL'
# the first token of an address and 'I-EMAIL' each following token of it.
#
# A smaller illustration of the same format, for a `name` entity:
#     ('hi thanh', '- - B-name')

email_sentences = [
    'My email address is at luungoc2005@gmail.com.',
    'Contact me at contact@2359media.net.',
    'test.email@microsoft.com is a testing email address',
    'Any inquiries email thesloth_197@gmail.com for assistance',
    'Email addresses include test.noreply@gmail.com hello.vietnam@hallo.org contact@rocket.net',
    'Contact: tester@github.com at any hours',
]

# Tag strings, in the same order as `email_sentences`.
email_tags = [
    '- - - - - - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    'B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - - - - - -',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - -',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL',
    '- - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - -',
]

# List of (sentence, tags) tuples.
training_data = list(zip(email_sentences, email_tags))

# Tag -> index lookup; indices start at 1.
# '-' plays the role of the O tag and is used for readability.
tag_to_ix = {
    tag: index
    for index, tag in enumerate(('-', 'B-EMAIL', 'I-EMAIL'), start=1)
}

In [4]:
from entities_recognition.transformer.model import TransformerSequenceTaggerWrapper
from entities_recognition.transformer.train import TransformerSequenceTaggerLearner
from entities_recognition.transformer.data import TransformerEntitiesRecognitionDataset
from common.callbacks import PrintLoggerCallback, EarlyStoppingCallback, ReduceLROnPlateau
from common.modules import BertAdam

# Build the tagger from the tag vocabulary and hand it to a learner, which
# drives training in the next cell.
model = TransformerSequenceTaggerWrapper({'tag_to_ix': tag_to_ix})
learner = TransformerSequenceTaggerLearner(model)

# Wrap the raw (sentence, tags) samples in the dataset type. The name
# `training_data` is rebound because the training cell reads it; the guard
# keeps this cell idempotent, so re-running it does not wrap an
# already-built dataset in a second dataset.
if not isinstance(training_data, TransformerEntitiesRecognitionDataset):
    training_data = TransformerEntitiesRecognitionDataset(training_data, tag_to_ix)

In [5]:
# Callbacks used during training:
# - PrintLoggerCallback reports loss/accuracy every 5 epochs.
# - ReduceLROnPlateau lowers the learning rate once the monitored value
#   stops improving (factor 4, patience 10).
# EarlyStoppingCallback() can be appended to this list to stop training early.
fit_callbacks = [
    PrintLoggerCallback(log_every=5),
    ReduceLROnPlateau(reduce_factor=4, patience=10),
]

learner.fit(
    training_data=training_data,
    epochs=500,
    batch_size=2,
    callbacks=fit_callbacks,
)

0m 3s (- 5m 51s) (5 1%) - loss: 15.3277 - accuracy: 0.4459
0m 5s (- 4m 21s) (10 2%) - loss: 8.6164 - accuracy: 0.5490
0m 7s (- 3m 48s) (15 3%) - loss: 6.5600 - accuracy: 0.5356
0m 8s (- 3m 30s) (20 4%) - loss: 7.1209 - accuracy: 0.6956
0m 10s (- 3m 21s) (25 5%) - loss: 8.7328 - accuracy: 0.7066
0m 12s (- 3m 15s) (30 6%) - loss: 3.3468 - accuracy: 0.6362
0m 14s (- 3m 7s) (35 7%) - loss: 2.7418 - accuracy: 0.7275
0m 15s (- 3m 2s) (40 8%) - loss: 0.4241 - accuracy: 0.7415
0m 17s (- 2m 57s) (45 9%) - loss: 0.7996 - accuracy: 0.6558
0m 19s (- 2m 52s) (50 10%) - loss: -3.0843 - accuracy: 0.7146
0m 20s (- 2m 48s) (55 11%) - loss: -2.5515 - accuracy: 0.7048
0m 22s (- 2m 44s) (60 12%) - loss: -3.1614 - accuracy: 0.6692
Monitor value plateaued at `loss` == 0.653759. Applying new learning rate: 0.001000 -> 0.000250
0m 24s (- 2m 40s) (65 13%) - loss: -3.7602 - accuracy: 0.6961
0m 25s (- 2m 37s) (70 14%) - loss: -4.4008 - accuracy: 0.6961
0m 27s (- 2m 34s) (75 15%) - loss: -5.3935 - accuracy: 0.704

In [6]:
from common.utils import wordpunct_space_tokenize

# Tag a single sentence: tokenize it, then pass a one-element batch to the model.
# Another sentence to try:
#     'Any inquiries email thesloth_197@gmail.com for assistance'
sample_sentence = 'test.email@microsoft.com is a testing email address'
sample_tokens = wordpunct_space_tokenize(sample_sentence)
model([sample_tokens])

(tensor([[2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 [[{'name': 'EMAIL', 'values': ['test.email@microsoft.com']}]])

Evaluate model accuracy by using