# Training an entities recognition model

Importing the required code files

In [1]:
from os import getcwd, path
import sys

# Parent of the kernel's current working directory.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the project root — confirm, since getcwd() depends on where the kernel starts.
BASE_PATH = path.dirname(getcwd())
# Register the path only once, so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

from config import START_TAG, STOP_TAG

In [2]:
# Sanity check: show the directory that was appended to sys.path above.
print(BASE_PATH)

/Users/2359media/Documents/botbot-nlp


The training data must be a list of `(sentence, tags)` tuples, where:
- `sentence` is split into tokens using `nltk.wordpunct_tokenize`
- `tags` is split using `.split()` — i.e. on whitespace by default — so it must be a single space-separated string

Each entity is labelled with three kinds of tags: `B-` (Begin), `I-` (Inside) and `O` (Outside).

_This is to help with separation in the case of consecutive entities_

A `dictionary` that translates these tags into consecutive indices must be defined.
This dictionary will contain:
- The empty token
- `START_TAG` and `STOP_TAG` tokens (imported from the global config — used internally to mark the start and end of a sentence)
- The entities' `B-`, `I-` and `O` tokens

**Sample training data for email recognition:**

In [3]:
# Disabled alternative sample (name recognition), kept for reference:
# training_data = [('hi thanh', '- - B-name'), ('hello duc, how are you?', '- - B-name - - - - - - - -')]

# tag_to_ix = {'-': 0, '<START>': 1, '<STOP>': 2, 'B-name': 3, 'I-name': 4}

# Each sample is a (sentence, tags) tuple; `tags` is one space-separated string.
# '-' labels a token outside any entity, B-EMAIL the first token of an email
# address and I-EMAIL every following token of that same address.
# NOTE(review): the tag counts appear to include one tag per whitespace token
# (e.g. 17 tags for the third sentence) — confirm against the tokenizer used
# by the dataset class before adding new samples.
training_data = [(
    'My email address is at luungoc2005@gmail.com.',
    '- - - - - - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -'
), (
    'Contact me at contact@2359media.net.',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -'
), (
    'test.email@microsoft.com is a testing email address',
    'B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - - - - - -'
), (
    'Any inquiries email thesloth_197@gmail.com for assistance',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - -'
), (
    # Several addresses in one sentence: each one starts with its own B-EMAIL.
    'Email addresses include test.noreply@gmail.com hello.vietnam@hallo.org contact@rocket.net',
    '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL'
), (
    'Contact: tester@github.com at any hours',
    '- - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - -'
)]

# Tag -> index lookup with consecutive indices starting at 1.
# '-' stands in for the usual O (outside) tag for readability.
# NOTE(review): index 0 is left unassigned here — presumably reserved by the
# model (e.g. for padding); confirm.
tag_to_ix = {
    tag: index
    for index, tag in enumerate(('-', 'B-EMAIL', 'I-EMAIL'), start=1)
}

In [4]:
from entities_recognition.transformer.model import TransformerSequenceTaggerWrapper
from entities_recognition.transformer.train import TransformerSequenceTaggerLearner
from entities_recognition.transformer.data import TransformerEntitiesRecognitionDataset
from common.callbacks import PrintLoggerCallback, EarlyStoppingCallback, ReduceLROnPlateau
from common.modules import BertAdam

# Build the tagger from the tag vocabulary and hand it to its learner.
model = TransformerSequenceTaggerWrapper({'tag_to_ix': tag_to_ix})
learner = TransformerSequenceTaggerLearner(model)
# Convert the raw (sentence, tags) tuples into the dataset object passed to
# learner.fit. The isinstance guard keeps this cell idempotent: re-running it
# no longer wraps an already-converted dataset a second time.
if not isinstance(training_data, TransformerEntitiesRecognitionDataset):
    training_data = TransformerEntitiesRecognitionDataset(training_data, tag_to_ix)

In [5]:
# Callbacks are created in the same order in which they are passed to fit().
progress_logger = PrintLoggerCallback(log_every=5)
# NOTE(review): presumably divides the learning rate by `reduce_factor` once
# the monitored value has not improved for `patience` steps — confirm in
# common.callbacks.
lr_scheduler = ReduceLROnPlateau(reduce_factor=2, patience=10)
# EarlyStoppingCallback() is currently disabled.

learner.fit(
    training_data=training_data,
    epochs=500,
    batch_size=2,
    callbacks=[progress_logger, lr_scheduler]
)

0m 3s (- 6m 12s) (5 1%) - loss: 14.8268 - accuracy: 0.5841
0m 5s (- 4m 28s) (10 2%) - loss: 11.4850 - accuracy: 0.5830
0m 7s (- 3m 46s) (15 3%) - loss: 5.4484 - accuracy: 0.6670
0m 8s (- 3m 24s) (20 4%) - loss: 5.9744 - accuracy: 0.7136
0m 9s (- 3m 9s) (25 5%) - loss: 4.6831 - accuracy: 0.6365
0m 11s (- 3m 0s) (30 6%) - loss: 2.5189 - accuracy: 0.7088
0m 13s (- 2m 53s) (35 7%) - loss: 7.4187 - accuracy: 0.7490
0m 14s (- 2m 48s) (40 8%) - loss: -0.6111 - accuracy: 0.8388
0m 16s (- 2m 43s) (45 9%) - loss: -0.7899 - accuracy: 0.8388
0m 17s (- 2m 39s) (50 10%) - loss: -0.3407 - accuracy: 0.7357
Monitor value plateaued at `loss` == -3.304410. Applying new learning rate: 0.001000 -> 0.000500
Monitor value plateaued at `loss` == -2.247701. Applying new learning rate: 0.000500 -> 0.000250
0m 19s (- 2m 36s) (55 11%) - loss: 7.6279 - accuracy: 0.7207
Monitor value plateaued at `loss` == 7.627852. Applying new learning rate: 0.000250 -> 0.000125
Monitor value plateaued at `loss` == -2.001086. App

In [6]:
from common.utils import wordpunct_space_tokenize
# Tag one sentence: the model is called with a one-element batch, i.e. a list
# holding the tokenizer output for that sentence.
sample_sentence = 'test.email@microsoft.com is a testing email address'
# sample_sentence = 'Any inquiries email thesloth_197@gmail.com for assistance'
sample_tokens = wordpunct_space_tokenize(sample_sentence)
model([sample_tokens])

(tensor([[2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 [[{'name': 'EMAIL', 'values': ['test.email@microsoft.com']}]])

Evaluate model accuracy by using