In [1]:
import sys
sys.path.append("src")

In [2]:
import torch

from ner.infer import NERInfer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_path = "logs/train/runs/2024-12-17/14-51-37/checkpoint/ner-epoch=01-val_loss=0.29.ckpt"
token_file = "data/vocab.pth"

In [4]:
vocab = torch.load(token_file)

vocab["vocab_size"]

23625

In [5]:
vocab["output_vocab"]

{0: '<pad>',
 1: 'O',
 2: 'B-LOC',
 3: 'B-PER',
 4: 'B-ORG',
 5: 'I-PER',
 6: 'I-ORG',
 7: 'B-MISC',
 8: 'I-LOC',
 9: 'I-MISC'}

In [6]:
infer = NERInfer(
    model_path = model_path,
    token_file = token_file,
    device = "cpu"
)

In [7]:
infer.vocab_size

23625

In [8]:
infer.token_vocab

{'<unk>': 0,
 '<pad>': 1,
 '.': 2,
 ',': 3,
 'the': 4,
 'of': 5,
 'in': 6,
 'to': 7,
 'a': 8,
 '(': 9,
 ')': 10,
 'and': 11,
 '"': 12,
 'on': 13,
 'said': 14,
 "'s": 15,
 '1': 16,
 'for': 17,
 '-': 18,
 'The': 19,
 'was': 20,
 '2': 21,
 '0': 22,
 '3': 23,
 'at': 24,
 'with': 25,
 'that': 26,
 'from': 27,
 'by': 28,
 ':': 29,
 'is': 30,
 'as': 31,
 'he': 32,
 '4': 33,
 'had': 34,
 'has': 35,
 'it': 36,
 'his': 37,
 'not': 38,
 'were': 39,
 'be': 40,
 'an': 41,
 'have': 42,
 'after': 43,
 'who': 44,
 'will': 45,
 '5': 46,
 'but': 47,
 'first': 48,
 'U.S.': 49,
 'been': 50,
 '$': 51,
 '--': 52,
 'two': 53,
 'are': 54,
 'their': 55,
 '6': 56,
 'beat': 57,
 'which': 58,
 'would': 59,
 'up': 60,
 'I': 61,
 'its': 62,
 'they': 63,
 'percent': 64,
 'year': 65,
 'out': 66,
 'Thursday': 67,
 'this': 68,
 'last': 69,
 'million': 70,
 'over': 71,
 'Wednesday': 72,
 'one': 73,
 '7': 74,
 'government': 75,
 'against': 76,
 '/': 77,
 'police': 78,
 'when': 79,
 'second': 80,
 'also': 81,
 'Tuesday': 

In [9]:
infer.model

GruNER(
  (emb): Embedding(23625, 256)
  (gru): GRU(256, 128, num_layers=2, batch_first=True, bidirectional=True)
  (out_layer): Linear(in_features=256, out_features=10, bias=True)
)

In [10]:
infer.class_dict

{0: '<pad>',
 1: 'O',
 2: 'B-LOC',
 3: 'B-PER',
 4: 'B-ORG',
 5: 'I-PER',
 6: 'I-ORG',
 7: 'B-MISC',
 8: 'I-LOC',
 9: 'I-MISC'}

In [11]:
text = "I am superman"
text_2 = ["Indonesia is beautiful", "How are you?"]

In [12]:
tokens = text.split() if isinstance(text, str) else [word for sentence in text for word in sentence.split()]
tokens

['I', 'am', 'superman']

In [13]:
tokens = text_2.split() if isinstance(text_2, str) else [word.split() for word in text_2]
tokens

[['Indonesia', 'is', 'beautiful'], ['How', 'are', 'you?']]

In [14]:
infer._tokenize(text)

[61, 1522, 23056]

In [15]:
infer.preprocess(text).shape

torch.Size([1, 3])

In [16]:
test = infer.preprocess(text_2)
test

tensor([[ 1016,    30, 19540],
        [ 9423,    54,     0]])

In [17]:
res = infer.forward(test)
res

tensor([[-1.6698,  0.9379,  2.3227, -0.6825,  0.1408, -0.8804, -1.0139, -0.4062,
         -0.3091, -1.4901],
        [-1.8821,  4.8053, -1.6945, -2.2487, -1.3119, -0.9299, -0.7507, -2.3760,
         -1.3562, -1.7751],
        [-1.5014,  3.8283, -1.7746, -0.6473, -0.2962, -1.5041, -0.7825, -1.2823,
         -2.8455, -2.3085],
        [-1.3262,  1.4667, -1.0180,  0.1720,  0.7854, -1.3220, -0.6112,  0.8863,
         -1.8669, -1.3474],
        [-1.4321,  2.9012, -2.4392, -0.1595, -0.4343, -0.8318, -0.6084, -0.1114,
         -2.5309, -1.4364],
        [-1.0711,  1.9604, -2.5304,  0.1553, -0.2315, -0.8657, -0.8520,  1.3299,
         -2.8278, -0.5405]])

In [18]:
res.view(*test.shape, 10)

tensor([[[-1.6698,  0.9379,  2.3227, -0.6825,  0.1408, -0.8804, -1.0139,
          -0.4062, -0.3091, -1.4901],
         [-1.8821,  4.8053, -1.6945, -2.2487, -1.3119, -0.9299, -0.7507,
          -2.3760, -1.3562, -1.7751],
         [-1.5014,  3.8283, -1.7746, -0.6473, -0.2962, -1.5041, -0.7825,
          -1.2823, -2.8455, -2.3085]],

        [[-1.3262,  1.4667, -1.0180,  0.1720,  0.7854, -1.3220, -0.6112,
           0.8863, -1.8669, -1.3474],
         [-1.4321,  2.9012, -2.4392, -0.1595, -0.4343, -0.8318, -0.6084,
          -0.1114, -2.5309, -1.4364],
         [-1.0711,  1.9604, -2.5304,  0.1553, -0.2315, -0.8657, -0.8520,
           1.3299, -2.8278, -0.5405]]])

In [19]:
torch.softmax(res.view(*test.shape, 10), dim=1)

tensor([[[0.3342, 0.0150, 0.9665, 0.4455, 0.5319, 0.4020, 0.2808, 0.6427,
          0.6992, 0.4560],
         [0.2703, 0.7156, 0.0174, 0.0930, 0.1244, 0.3826, 0.3653, 0.0897,
          0.2454, 0.3429],
         [0.3955, 0.2694, 0.0161, 0.4615, 0.3436, 0.2154, 0.3539, 0.2676,
          0.0553, 0.2011]],

        [[0.3135, 0.1463, 0.6841, 0.3702, 0.6035, 0.2375, 0.3586, 0.3417,
          0.5271, 0.2406],
         [0.2820, 0.6141, 0.1652, 0.2657, 0.1782, 0.3877, 0.3596, 0.1260,
          0.2713, 0.2201],
         [0.4046, 0.2397, 0.1508, 0.3641, 0.2183, 0.3748, 0.2818, 0.5324,
          0.2016, 0.5392]]])

In [20]:
# infer.post_process(res, tokens)

In [21]:
infer.predict(text)

[{'entity': 'O', 'score': 0.6076, 'index': 0, 'word': 'I'},
 {'entity': 'B-MISC', 'score': 0.4674, 'index': 1, 'word': 'am'},
 {'entity': 'B-PER', 'score': 0.5778, 'index': 2, 'word': 'superman'}]

In [22]:
infer.predict(text_2)

[[{'entity': 'B-LOC', 'score': 0.9665, 'index': 0, 'word': 'Indonesia'},
  {'entity': 'O', 'score': 0.7156, 'index': 1, 'word': 'is'},
  {'entity': 'B-PER', 'score': 0.4615, 'index': 2, 'word': 'beautiful'}],
 [{'entity': 'B-LOC', 'score': 0.6841, 'index': 0, 'word': 'How'},
  {'entity': 'O', 'score': 0.6141, 'index': 1, 'word': 'are'},
  {'entity': 'I-MISC', 'score': 0.5392, 'index': 2, 'word': 'you?'}]]