# Training an entity recognition model

Importing the required code files

In [1]:
from os import getcwd, path
import sys
import matplotlib.pyplot as plt

# Parent of the notebook's working directory; it has to be on sys.path so the
# project-local imports below (e.g. `config`) can be resolved.
BASE_PATH = path.dirname(getcwd())
# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

from config import START_TAG, STOP_TAG

In [2]:
# Sanity check: show the directory that the first cell put on sys.path.
print(BASE_PATH)

/Users/2359media/Documents/botbot-nlp


The training data must be a list that:
- Contains tuples of (sentence, tags)
- Sentence will be split using nltk.wordpunct_tokenize
- Tags will be split using .split() - hence on whitespace by default

Each token must be labelled with one of 3 kinds of tag: B- (Begin), I- (Inside) and O (Outside; written as `-` in the sample below)

_This is to help with separation in the case of consecutive entities_

A `dictionary` that translates these tags into consecutive indices must be defined.
This dictionary will contain:
- The outside (empty) tag `-`
- `START_TAG` and `STOP_TAG` tokens (imported from global configs - used internally to indicate start and end of sentence)
- The B- and I- tags of each entity

**Sample training data for email recognition:**

In [3]:
# Each sample is a (sentence, tags) tuple. Tags are space-separated:
# '-' marks a token outside any entity, B-EMAIL opens an e-mail address and
# I-EMAIL continues it.
# NOTE(review): the number of '-' tags suggests whitespace is labelled as its
# own token (e.g. 6 tags for 'Contact me at ') - confirm against the tokenizer
# the learner actually uses.
training_data = [
    (
        'My email address is at luungoc2005@gmail.com.',
        '- - - - - - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    ),
    (
        'Contact me at contact@2359media.net.',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL -',
    ),
    (
        'test.email@microsoft.com is a testing email address',
        'B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - - - - - -',
    ),
    (
        'Any inquiries email thesloth_197@gmail.com for assistance',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - -',
    ),
    (
        # Three consecutive addresses, each restarted with its own B-EMAIL tag.
        'Email addresses include test.noreply@gmail.com hello.vietnam@hallo.org contact@rocket.net',
        '- - - - - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL',
    ),
    (
        'Contact: tester@github.com at any hours',
        '- - - B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL - - - - - -',
    ),
]

# Tag -> consecutive index, numbered in the order listed:
#   '-' (the O tag, written '-' for readability) = 0, 'B-EMAIL' = 1,
#   'I-EMAIL' = 2, START_TAG = 3, STOP_TAG = 4.
tag_to_ix = {
    tag: index
    for index, tag in enumerate(('-', 'B-EMAIL', 'I-EMAIL', START_TAG, STOP_TAG))
}

In [4]:
from entities_recognition.bilstm.model import SequenceTaggerWrapper
from entities_recognition.bilstm.train import SequenceTaggerLearner
from common.callbacks import PrintLoggerCallback

# Build the tagger from a config dict whose only entry is the tag vocabulary,
# then hand the wrapped model to its learner.
model = SequenceTaggerWrapper(dict(tag_to_ix=tag_to_ix))
learner = SequenceTaggerLearner(model)

In [5]:
# Train on the sample data for 50 epochs with a logger callback configured
# with log_every=5.
# NOTE(review): the saved output of this cell reports "5 5%", "10 10%", ...,
# which matches a 100-epoch run rather than epochs=50 - re-run the cell or
# confirm the intended epoch count.
progress_logger = PrintLoggerCallback(log_every=5)
learner.fit(training_data=training_data, epochs=50, callbacks=[progress_logger])

0m 2s (- 0m 52s) (5 5%) - loss: 11.6596 - accuracy: 0.5714
0m 5s (- 0m 46s) (10 10%) - loss: 5.5873 - accuracy: 0.7857
0m 7s (- 0m 43s) (15 15%) - loss: 1.2319 - accuracy: 1.0000
0m 9s (- 0m 39s) (20 20%) - loss: 0.2861 - accuracy: 1.0000
0m 12s (- 0m 37s) (25 25%) - loss: 0.2877 - accuracy: 1.0000
0m 14s (- 0m 34s) (30 30%) - loss: 0.1520 - accuracy: 1.0000
0m 17s (- 0m 32s) (35 35%) - loss: 0.0794 - accuracy: 1.0000
0m 19s (- 0m 29s) (40 40%) - loss: 0.0920 - accuracy: 1.0000
0m 22s (- 0m 27s) (45 45%) - loss: 0.1552 - accuracy: 1.0000
0m 24s (- 0m 24s) (50 50%) - loss: 0.0622 - accuracy: 1.0000
0m 27s (- 0m 22s) (55 55%) - loss: 0.1708 - accuracy: 1.0000
0m 29s (- 0m 19s) (60 60%) - loss: 0.0166 - accuracy: 1.0000
0m 31s (- 0m 17s) (65 65%) - loss: 0.0102 - accuracy: 1.0000
0m 34s (- 0m 14s) (70 70%) - loss: 0.0095 - accuracy: 1.0000
0m 36s (- 0m 12s) (75 75%) - loss: 0.0163 - accuracy: 1.0000
0m 39s (- 0m 9s) (80 80%) - loss: 0.0249 - accuracy: 1.0000
0m 41s (- 0m 7s) (85 85%) - lo

In [6]:
# Smoke-test the trained tagger on a sentence that is not in the training set;
# the cell displays whatever the model call returns.
# NOTE(review): the saved output is a (tensor, list of 33 tag indices) pair,
# and 33 is also the character count of this sentence - confirm which
# tokenization the model applies at inference time.
sample_sentence = 'my email is luungoc2005@gmail.com'
model(sample_sentence)

(tensor(106.5679),
 [2,
  2,
  0,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  0,
  1,
  1,
  2,
  2,
  2,
  2])

Evaluate model accuracy by using