# Training an entity recognition model

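Make the project's code importable: add the parent of the current working directory to `sys.path`, then import the shared tag constants from `config`.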

In [1]:
from os import getcwd, path
import sys

# BASE_PATH is the parent of the current working directory; it is put on
# sys.path so that modules living there (such as `config`) can be imported.
BASE_PATH = path.dirname(getcwd())

# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

from config import START_TAG, STOP_TAG

In [2]:
# Sanity check: show which directory BASE_PATH resolved to.
print(BASE_PATH)

/Users/2359media/Documents/botbot-nlp


The training data must be a list that:
- Contains tuples of (sentence, tags)
- Sentence will be split into tokens using nltk.wordpunct_tokenize
- Tags will be split using .split() - hence on whitespace by default

Each token must be labelled with one of 3 kinds of tag: B- (Begin: the first token of an entity), I- (Inside: a following token of the same entity) and O (Outside: not part of any entity)

_This is to help with separation in the case of consecutive entities_

A `dictionary` that translates these tags into consecutive indices must be defined.
This dictionary will contain:
- The empty token
- `START_TAG` and `STOP_TAG` tokens (imported from global configs - used internally to indicate start and end of sentence)
- Entities B-, I-, O- tokens

**Sample training data for email recognition:**

In [3]:
# Email-recognition samples. Every entry is a (sentence, tags) tuple, where
# `tags` holds one space-separated tag per token of the sentence:
#   'B-EMAIL' marks the first token of an email address,
#   'I-EMAIL' marks each following token of the same address,
#   '-'       marks a token outside any entity.
# NOTE(review): the tag counts below only line up with the sentences if
# whitespace gets its own token and letter / digit / '_' runs are split apart
# (e.g. 'luungoc2005' -> 2 tokens) - confirm against the dataset's tokenizer.
training_data = [
    (
        'My email address is at luungoc2005@gmail.com.',
        '- - - - - - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    ),
    (
        'Contact me at contact@2359media.net.',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    ),
    (
        'test.email@microsoft.com is a testing email address',
        'B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - - - - - -',
    ),
    (
        'Any inquiries email thesloth_197@gmail.com for assistance',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - -',
    ),
    (
        # Three consecutive addresses, each opened by its own B-EMAIL tag.
        'Email addresses include test.noreply@gmail.com hello.vietnam@hallo.org contact@rocket.net',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL',
    ),
    (
        'Contact: tester@github.com at any hours',
        '- - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - -',
    ),
]

# Tag -> index lookup. '-' stands in for the usual 'O' tag for readability.
# Numbering starts at 1; index 0 is not assigned here
# (presumably reserved, e.g. for padding - TODO confirm).
tag_to_ix = {
    tag: ix
    for ix, tag in enumerate(('-', 'B-EMAIL', 'I-EMAIL'), start=1)
}

In [4]:
from entities_recognition.transformer.model import TransformerSequenceTaggerWrapper
from entities_recognition.transformer.train import TransformerSequenceTaggerLearner
from entities_recognition.transformer.data import TransformerEntitiesRecognitionDataset
from common.callbacks import (
    PrintLoggerCallback,
    EarlyStoppingCallback,
    ReduceLROnPlateau,
)
from common.modules import BertAdam

# The tagger wrapper is configured with the tag vocabulary, and the learner
# is built around that wrapper.
model_config = {'tag_to_ix': tag_to_ix}
model = TransformerSequenceTaggerWrapper(model_config)
learner = TransformerSequenceTaggerLearner(model)

# NOTE: `training_data` is rebound here from the raw list of (sentence, tags)
# tuples to the dataset object. Re-running this cell on its own would pass the
# dataset (not the raw tuples) back into the constructor, so re-run the cell
# that defines the raw samples first.
training_data = TransformerEntitiesRecognitionDataset(training_data, tag_to_ix)

In [5]:
# Callbacks used for this run:
# - PrintLoggerCallback(log_every=5): progress lines appear every 5 epochs in
#   the output below.
# - ReduceLROnPlateau(reduce_factor=4, ...): on a plateau of `loss` the logged
#   learning rate is divided by 4 (0.001 -> 0.00025 in the output below).
# EarlyStoppingCallback is left out of this run.
fit_callbacks = [
    PrintLoggerCallback(log_every=5),
    ReduceLROnPlateau(reduce_factor=4, patience=10),
]

learner.fit(
    training_data=training_data,
    epochs=500,
    batch_size=2,
    callbacks=fit_callbacks,
)

0m 3s (- 5m 0s) (5 1%) - loss: 6.6717 - accuracy: 0.7095
0m 4s (- 3m 56s) (10 2%) - loss: 9.3869 - accuracy: 0.6569
0m 6s (- 3m 26s) (15 3%) - loss: 5.3104 - accuracy: 0.7583
0m 7s (- 3m 10s) (20 4%) - loss: 3.1559 - accuracy: 0.7415
0m 9s (- 2m 59s) (25 5%) - loss: 1.5279 - accuracy: 0.6852
0m 10s (- 2m 51s) (30 6%) - loss: -0.6603 - accuracy: 0.7611
0m 12s (- 2m 44s) (35 7%) - loss: 2.9143 - accuracy: 0.6939
0m 13s (- 2m 38s) (40 8%) - loss: 1.8788 - accuracy: 0.6950
Monitor value plateaued at `loss` == 1.878804. Applying new learning rate: 0.001000 -> 0.000250
0m 15s (- 2m 34s) (45 9%) - loss: 4.3126 - accuracy: 0.7381
0m 16s (- 2m 29s) (50 10%) - loss: 0.3009 - accuracy: 0.6728
0m 18s (- 2m 26s) (55 11%) - loss: 3.4660 - accuracy: 0.6966
0m 19s (- 2m 23s) (60 12%) - loss: -0.9092 - accuracy: 0.7611
0m 21s (- 2m 21s) (65 13%) - loss: -0.7478 - accuracy: 0.7807
0m 22s (- 2m 19s) (70 14%) - loss: -0.1688 - accuracy: 0.8280
Monitor value plateaued at `loss` == -0.968981. Applying new l

In [18]:
from common.utils import wordpunct_space_tokenize

# Run the model on one tokenized sentence; the model takes a batch (a list of
# token lists), so the single sentence is wrapped in a list.
# Other sentences that can be tried here (both are part of the training set):
#   'test.email@microsoft.com is a testing email address'
#   'Any inquiries email thesloth_197@gmail.com for assistance'
sample_sentence = 'My first email address is actually luungoc2005@yahoo.com'
sample_tokens = wordpunct_space_tokenize(sample_sentence)
model([sample_tokens])

[[{'name': 'EMAIL', 'values': ['luungoc2005@yahoo', 'com']}]]

Evaluate model accuracy by using