In [None]:
# Mount Google Drive so the notebook can read and write files under /content/gdrive.
from google.colab import files, drive
# NOTE(review): this creates ./drive in the working directory, but the mount
# point below is /content/gdrive — the directory looks unused; confirm.
!mkdir -p drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
### step 1: import all required libraries

import csv
import sys
import re
import logging
import os,re
import pickle
import sys,os,time
from typing import Union

import numpy as np

import math
from itertools import chain
from collections import Counter


import json
import random

  
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
## step 1.1: print the installed PyTorch version (a "+cuXXX" suffix names the CUDA build)
print(torch.__version__) 

1.8.1+cu101


In [None]:
### step 2：define relevant folders

## step 2.1: define master folder
# Root folder on the mounted Google Drive; the data and model folders are joined onto it.
master_holder='/content/gdrive/My Drive/ETNET/ipo_pdf/ner'#



In [None]:
## step 2.2: define data folder and train/test data file
# Corpus files live in <master_holder>/data.
data_folder = os.path.join(master_holder, 'data')
train_data, test_data = (
    os.path.join(data_folder, corpus_name)
    for corpus_name in ('revised_full.txt', 'test.txt')
)


In [None]:
## step 2.3: define model folder, model file and other model related files

# Everything the training run writes goes under <master_holder>/model3.
model_folder = os.path.join(master_holder, 'model3')

# Vocabulary JSON files, model checkpoint and optimizer checkpoint.
sent_vocab_file, tag_vocab_file, model_file, optimizer_file = (
    os.path.join(model_folder, artifact_name)
    for artifact_name in ('sent_vocab.json', 'tag_vocab.json', 'model.pth', 'optimizer.pth')
)

In [None]:
### step 3 : define loading data function

### step 3.1: define function to read train/test data files
def read_corpus(filepath):
    """Read a tab-separated token/tag corpus into sentences and tag sequences.

    A line of the form ``token<TAB>tag`` adds one token to the current
    sentence. Any other line (for example a blank line) ends the current
    sentence. Each returned sequence is wrapped in ``'<START>'`` and
    ``'<END>'`` markers.

    Args:
        filepath: path of a UTF-8 encoded text file.

    Returns:
        ``(sentences, tags)``: two parallel lists of lists of strings.
    """
    sentences, tags = [], []
    sent, tag = ['<START>'], ['<START>']
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) == 2:
                sent.append(fields[0].strip())
                tag.append(fields[1].strip())
            elif len(sent) > 1:
                # Separator line: close the sentence collected so far.
                sentences.append(sent + ['<END>'])
                tags.append(tag + ['<END>'])
                sent, tag = ['<START>'], ['<START>']
    # The file may end without a trailing separator line; without this flush
    # the last sentence of such a file would be silently dropped.
    if len(sent) > 1:
        sentences.append(sent + ['<END>'])
        tags.append(tag + ['<END>'])
    return sentences, tags

In [None]:
### step 3.2: define dictionary to contain word and word indice
def words2indices(origin, vocab):
    """Map words to ids through ``vocab[word]``.

    Args:
        origin: a list of words, or a list of such lists (a batch).
        vocab: any object supporting ``vocab[word]`` lookups.

    Returns:
        A list with the same nesting as ``origin`` holding the looked-up ids.
        An empty ``origin`` yields an empty list.
    """
    if not origin:
        # Nothing to inspect with origin[0]; avoid the IndexError.
        return []
    if isinstance(origin[0], list):
        return [[vocab[w] for w in sent] for sent in origin]
    return [vocab[w] for w in origin]

def indices2words(origin, vocab):
    """Map ids back to words through ``vocab.id2word(idx)``.

    Args:
        origin: a list of ids, or a list of such lists (a batch).
        vocab: any object exposing an ``id2word(idx)`` method.

    Returns:
        A list with the same nesting as ``origin`` holding the words.
        An empty ``origin`` yields an empty list.
    """
    if not origin:
        # Nothing to inspect with origin[0]; avoid the IndexError.
        return []
    if isinstance(origin[0], list):
        return [[vocab.id2word(w) for w in sent] for sent in origin]
    return [vocab.id2word(w) for w in origin]

In [None]:
### step 3.3: convert the training corpus into index-encoded train/dev datasets

def generate_train_dev_dataset(filepath, sent_vocab, tag_vocab, train_proportion=0.8):
    """Load a corpus, index-encode it and split it into train and dev parts.

    Args:
        filepath: corpus file handed to ``read_corpus``.
        sent_vocab: vocabulary used to encode the tokens.
        tag_vocab: vocabulary used to encode the tags.
        train_proportion: fraction of the shuffled examples kept for training.

    Returns:
        ``(train_data, dev_data)``: lists of ``(sentence_ids, tag_ids)`` pairs.
    """
    raw_sentences, raw_tags = read_corpus(filepath)
    encoded_sentences = words2indices(raw_sentences, sent_vocab)
    encoded_tags = words2indices(raw_tags, tag_vocab)
    examples = list(zip(encoded_sentences, encoded_tags))
    # Shuffle in place with the global `random` state before splitting.
    random.shuffle(examples)
    split_at = int(len(examples) * train_proportion)
    return examples[:split_at], examples[split_at:]

In [None]:
### step 4: define model training related function

### step 4.1: step define supplementary function to pad data and batch training

def pad(data, padded_token, device):
    """Right-pad every sequence to the longest one and build a tensor.

    Args:
        data: list of lists of ids.
        padded_token: id appended to the shorter sequences.
        device: device on which the resulting tensor is created.

    Returns:
        ``(tensor, lengths)``: the padded batch as a tensor and the original
        length of every sequence, in input order.
    """
    lengths = [len(sent) for sent in data]
    # Use the true maximum instead of lengths[0], so the batch does not have
    # to be pre-sorted by length and an empty batch does not raise IndexError.
    max_len = max(lengths, default=0)
    padded_data = [sent + [padded_token] * (max_len - len(sent)) for sent in data]
    return torch.tensor(padded_data, device=device), lengths


def batch_iter(data, batch_size=32, shuffle=True):
    """Yield mini-batches of ``(sentences, tags)`` from ``data``.

    Args:
        data: sequence of ``(sentence, tags)`` pairs.
        batch_size: maximum number of pairs per batch.
        shuffle: when true, visit the pairs in a random order.

    Yields:
        ``(sentences, tags)``: two parallel lists, ordered inside each batch
        from the longest sentence to the shortest.
    """
    order = list(range(len(data)))
    if shuffle:
        random.shuffle(order)
    n_batches = (len(data) + batch_size - 1) // batch_size
    for batch_no in range(n_batches):
        picked = order[batch_no * batch_size: (batch_no + 1) * batch_size]
        chunk = [data[pos] for pos in picked]
        # Longest sentence first (stable for ties).
        chunk.sort(key=lambda pair: len(pair[0]), reverse=True)
        yield [pair[0] for pair in chunk], [pair[1] for pair in chunk]

def print_var(**kwargs):
    """Print every keyword argument as ``name value`` on its own line."""
    for name in kwargs:
        print(name, kwargs[name])

In [None]:
### step 3.4: define the Vocab class that maps words to indices and back

class Vocab:
    """Two-way mapping between tokens (words or tags) and integer ids."""

    def __init__(self, word2id, id2word):
        """Wrap an existing pair of mappings.

        Args:
            word2id: dict from token to id.
            id2word: list where position ``i`` holds the token with id ``i``.
        """
        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.START = '<START>'
        self.END = '<END>'
        self.__word2id = word2id
        self.__id2word = id2word

    def get_word2id(self):
        """Return the token-to-id dict."""
        return self.__word2id

    def get_id2word(self):
        """Return the id-to-token list."""
        return self.__id2word

    def __getitem__(self, item):
        """Return the id of ``item``.

        Unknown tokens map to the ``<UNK>`` id when the vocabulary has one;
        otherwise a ``KeyError`` is raised.
        """
        if self.UNK in self.__word2id:
            return self.__word2id.get(item, self.__word2id[self.UNK])
        return self.__word2id[item]

    def __len__(self):
        return len(self.__word2id)

    def id2word(self, idx):
        """Return the token stored at position ``idx``."""
        return self.__id2word[idx]

    @staticmethod
    def build(data, max_dict_size, freq_cutoff, is_tags):
        """Build a vocabulary from tokenised sequences.

        Args:
            data: iterable of token sequences.
            max_dict_size: keep at most this many of the most frequent tokens.
            freq_cutoff: drop tokens that occur fewer times than this.
            is_tags: when true no ``<UNK>`` entry is added.

        Returns:
            A ``Vocab`` whose ids follow descending frequency, followed by
            ``<PAD>`` and, unless ``is_tags`` is true, ``<UNK>``.
        """
        # Special tokens are appended below. If the corpus itself contained
        # them they would be listed twice, and '<UNK>' could end up sharing
        # an id with '<PAD>', so keep them out of the counted words.
        reserved = {'<PAD>'} if is_tags else {'<PAD>', '<UNK>'}
        word_counts = Counter(chain(*data))
        valid_words = [w for w, d in word_counts.items()
                       if d >= freq_cutoff and w not in reserved]
        valid_words = sorted(valid_words, key=lambda x: word_counts[x], reverse=True)
        valid_words = valid_words[: max_dict_size]
        valid_words += ['<PAD>']
        if not is_tags:
            valid_words += ['<UNK>']
        word2id = {w: idx for idx, w in enumerate(valid_words)}
        return Vocab(word2id=word2id, id2word=valid_words)

    def save(self, file_path):
        """Write both mappings to ``file_path`` as UTF-8 JSON."""
        with open(file_path, 'w', encoding='utf8') as f:
            json.dump({'word2id': self.__word2id, 'id2word': self.__id2word}, f, ensure_ascii=False)

    @staticmethod
    def load(file_path):
        """Rebuild a ``Vocab`` from a JSON file written by ``save``."""
        with open(file_path, 'r', encoding='utf8') as f:
            entry = json.load(f)
        return Vocab(word2id=entry['word2id'], id2word=entry['id2word'])

In [None]:
### step 5:load dataset

### step 5.1: use read_corpus to load the training sentences and their tags

# NOTE(review): `train_data` must still be the corpus path here; step 5.3
# rebinds the same name to the dataset, so this cell fails if run after it.
sentences, tags = read_corpus(train_data)


In [None]:
### step 5.2 : use vocab class to convert training data to word and word indice and then save to model files

# Word vocabulary gets an <UNK> entry; the tag vocabulary does not.
sent_vocab = Vocab.build(sentences, max_dict_size=13000, freq_cutoff=1, is_tags=False)
tag_vocab = Vocab.build(tags, max_dict_size=13000, freq_cutoff=1, is_tags=True)

# Persist both vocabularies next to the model so they can be reloaded for testing.
sent_vocab.save(sent_vocab_file)
tag_vocab.save(tag_vocab_file)

In [None]:
### step 5.3:  use generate_train_dev_dataset to convert training data, vocab, tag files into train/dev dataset

# `train_data` starts out as the corpus path and is rebound to the dataset
# below. Remember the path under its own name so that re-running this cell
# does not hand the dataset back to read_corpus as if it were a file path.
if isinstance(train_data, str):
  train_file = train_data
train_data, dev_data = generate_train_dev_dataset(train_file, sent_vocab, tag_vocab)
print('num of training examples: %d' % (len(train_data)))
print('num of development examples: %d' % (len(dev_data)))

num of training examples: 10443
num of development examples: 2611


In [None]:
### step 6 :define model
### step 6.1: define Bi-LSTM-CRF model structure

class BiLSTMCRF(nn.Module):
    """Bidirectional LSTM encoder with a learned tag-transition matrix (CRF layer).

    ``forward`` returns one loss value per sentence and ``predict`` returns
    the highest-scoring tag-id path per sentence. Both slice the batch with
    ``[: n_unfinished]``, so they appear to assume the sentences of a batch
    are ordered from longest to shortest, which ``pack_padded_sequence`` also
    requires by default.
    """
    def __init__(self, sent_vocab, tag_vocab, dropout_rate=0.5, embed_size=256, hidden_size=256):
        """Create the embedding, encoder, emission and transition parameters.

        Args:
            sent_vocab: vocabulary of input tokens; its length sizes the embedding.
            tag_vocab: vocabulary of tags; its length K sizes the emission
                layer and the (K, K) transition matrix.
            dropout_rate: probability used by the dropout layer.
            embed_size: dimension of the token embeddings.
            hidden_size: hidden size of each LSTM direction.
        """
        super(BiLSTMCRF, self).__init__()
        self.dropout_rate = dropout_rate
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.sent_vocab = sent_vocab
        self.tag_vocab = tag_vocab
        self.embedding = nn.Embedding(len(sent_vocab), embed_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bidirectional=True)
        self.hidden2emit_score = nn.Linear(hidden_size * 2, len(self.tag_vocab))
        self.transition = nn.Parameter(torch.randn(len(self.tag_vocab), len(self.tag_vocab)))  # shape: (K, K)

    def forward(self, sentences, tags, sen_lengths):
        """Compute the per-sentence loss of the given tag sequences.

        Args:
            sentences: padded token ids, shape (b, len).
            tags: padded tag ids, shape (b, len).
            sen_lengths: unpadded length of every sentence.

        Returns:
            Tensor of shape (b,) with the negative log-likelihood per sentence.
        """
        # True where the token is not the padding id.
        mask = (sentences != self.sent_vocab[self.sent_vocab.PAD]).to(self.device)  # shape: (b, len)
        sentences = sentences.transpose(0, 1)  # shape: (len, b)
        sentences = self.embedding(sentences)  # shape: (len, b, e)
        emit_score = self.encode(sentences, sen_lengths)  # shape: (b, len, K)
        loss = self.cal_loss(tags, mask, emit_score)  # shape: (b,)
        return loss

    def encode(self, sentences, sent_lengths):
        """Run the BiLSTM over embedded sentences and project to tag scores.

        Args:
            sentences: embedded batch, shape (len, b, e).
            sent_lengths: unpadded length of every sentence.

        Returns:
            Emission scores of shape (b, len, K). Dropout is applied to the
            scores themselves, after the linear projection.
        """
        padded_sentences = pack_padded_sequence(sentences, sent_lengths)
        hidden_states, _ = self.encoder(padded_sentences)
        hidden_states, _ = pad_packed_sequence(hidden_states, batch_first=True)  # shape: (b, len, 2h)
        emit_score = self.hidden2emit_score(hidden_states)  # shape: (b, len, K)
        emit_score = self.dropout(emit_score)  # shape: (b, len, K)
        return emit_score

    def cal_loss(self, tags, mask, emit_score):
        """Return the negative log-likelihood of ``tags`` for every sentence.

        Args:
            tags: gold tag ids, shape (b, len).
            mask: True for real (non-padding) positions, shape (b, len).
            emit_score: emission scores, shape (b, len, K).

        Returns:
            Tensor of shape (b,): log-partition value minus gold path score.
        """
        batch_size, sent_len = tags.shape
        # calculate score for the tags
        score = torch.gather(emit_score, dim=2, index=tags.unsqueeze(dim=2)).squeeze(dim=2)  # shape: (b, len)
        # add the transition score from the previous gold tag to the current one
        score[:, 1:] += self.transition[tags[:, :-1], tags[:, 1:]]
        total_score = (score * mask.type(torch.float)).sum(dim=1)  # shape: (b,)
        # calculate the scaling factor
        # d[b, 0, k]: log-sum of the scores of all tag paths that end in tag k
        d = torch.unsqueeze(emit_score[:, 0], dim=1)  # shape: (b, 1, K)
        for i in range(1, sent_len):
            # number of sentences that still have a real token at position i
            n_unfinished = mask[:, i].sum()
            d_uf = d[: n_unfinished]  # shape: (uf, 1, K)
            emit_and_transition = emit_score[: n_unfinished, i].unsqueeze(dim=1) + self.transition  # shape: (uf, K, K)
            log_sum = d_uf.transpose(1, 2) + emit_and_transition  # shape: (uf, K, K)
            # subtract the maximum before logsumexp and add it back afterwards
            max_v = log_sum.max(dim=1)[0].unsqueeze(dim=1)  # shape: (uf, 1, K)
            log_sum = log_sum - max_v  # shape: (uf, K, K)
            d_uf = max_v + torch.logsumexp(log_sum, dim=1).unsqueeze(dim=1)  # shape: (uf, 1, K)
            # finished sentences keep the value they had at their last token
            d = torch.cat((d_uf, d[n_unfinished:]), dim=0)
        d = d.squeeze(dim=1)  # shape: (b, K)
        max_d = d.max(dim=-1)[0]  # shape: (b,)
        d = max_d + torch.logsumexp(d - max_d.unsqueeze(dim=1), dim=1)  # shape: (b,)
        llk = total_score - d  # shape: (b,)
        loss = -llk  # shape: (b,)
        return loss

    def predict(self, sentences, sen_lengths):
        """Return the highest-scoring tag-id path for every sentence.

        Args:
            sentences: padded token ids, shape (b, len).
            sen_lengths: unpadded lengths; ``sen_lengths[0]`` bounds the loop,
                so it is presumably the longest sentence — confirm with callers.

        Returns:
            List of ``b`` lists of tag ids, one id per decoded position.
        """
        batch_size = sentences.shape[0]
        mask = (sentences != self.sent_vocab[self.sent_vocab.PAD])  # shape: (b, len)
        sentences = sentences.transpose(0, 1)  # shape: (len, b)
        sentences = self.embedding(sentences)  # shape: (len, b, e)
        emit_score = self.encode(sentences, sen_lengths)  # shape: (b, len, K)
        # tags[b][k] is the best path found so far that ends in tag k
        tags = [[[i] for i in range(len(self.tag_vocab))]] * batch_size  # list, shape: (b, K, 1)
        d = torch.unsqueeze(emit_score[:, 0], dim=1)  # shape: (b, 1, K)
        for i in range(1, sen_lengths[0]):
            n_unfinished = mask[:, i].sum()
            d_uf = d[: n_unfinished]  # shape: (uf, 1, K)
            emit_and_transition = self.transition + emit_score[: n_unfinished, i].unsqueeze(dim=1)  # shape: (uf, K, K)
            new_d_uf = d_uf.transpose(1, 2) + emit_and_transition  # shape: (uf, K, K)
            # Viterbi-style step: keep the best previous tag for every current tag
            d_uf, max_idx = torch.max(new_d_uf, dim=1)
            max_idx = max_idx.tolist()  # list, shape: (nf, K)
            # extend the best path of the chosen previous tag k with the current tag j
            tags[: n_unfinished] = [[tags[b][k] + [j] for j, k in enumerate(max_idx[b])] for b in range(n_unfinished)]
            d = torch.cat((torch.unsqueeze(d_uf, dim=1), d[n_unfinished:]), dim=0)  # shape: (b, 1, K)
        d = d.squeeze(dim=1)  # shape: (b, K)
        _, max_idx = torch.max(d, dim=1)  # shape: (b,)
        max_idx = max_idx.tolist()
        tags = [tags[b][k] for b, k in enumerate(max_idx)]
        return tags

    def save(self, filepath):
        """Save vocabularies, constructor arguments and weights with ``torch.save``."""
        params = { 'sent_vocab': self.sent_vocab,  'tag_vocab': self.tag_vocab, 'args': dict(dropout_rate=self.dropout_rate, embed_size=self.embed_size, hidden_size=self.hidden_size),
            'state_dict': self.state_dict() }
        torch.save(params, filepath)

    @staticmethod
    def load(filepath, device_to_load):
        """Rebuild a model from a file written by ``save`` and move it to ``device_to_load``."""
        # map_location returns every storage unchanged; the move to the target
        # device happens afterwards through model.to().
        params = torch.load(filepath, map_location=lambda storage, loc: storage)
        model = BiLSTMCRF(params['sent_vocab'], params['tag_vocab'], **params['args'])
        model.load_state_dict(params['state_dict'])
        model.to(device_to_load)
        return model

    @property
    def device(self):
        """Device on which the embedding weights currently live."""
        return self.embedding.weight.device

In [None]:
### step 6.2: define the function that computes the average loss on the dev set

def cal_dev_loss(model, dev_data, batch_size, sent_vocab, tag_vocab, device):
    """Return the average per-sentence loss of ``model`` on ``dev_data``.

    The model is switched to eval mode for the computation and restored to
    its previous mode afterwards; no gradients are recorded.

    Args:
        model: model whose call returns a per-sentence loss tensor.
        dev_data: list of ``(sentence_ids, tag_ids)`` pairs.
        batch_size: number of sentences per evaluation batch.
        sent_vocab: token vocabulary (supplies the sentence padding id).
        tag_vocab: tag vocabulary (supplies the tag padding id).
        device: device on which the padded tensors are created.

    Returns:
        Sum of the per-sentence losses divided by the number of sentences.
    """
    is_training = model.training
    model.eval()
    loss, n_sentences = 0, 0
    with torch.no_grad():
        for sentences, tags in batch_iter(dev_data, batch_size, shuffle=False):
            sentences, sent_lengths = pad(sentences, sent_vocab[sent_vocab.PAD], device)
            # Look the tag padding id up through the tag vocabulary's own PAD
            # symbol, as the training loop does.
            tags, _ = pad(tags, tag_vocab[tag_vocab.PAD], device)
            batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,)
            loss += batch_loss.sum().item()
            n_sentences += len(sentences)
    model.train(is_training)
    return loss / n_sentences

In [None]:
### step 6.3: define all hyper-parameters of Bi-LSTM-CRF model

# Checkpoint locations
model_save_path = model_file
optimizer_save_path = optimizer_file

# Early-stopping state
min_dev_loss = float('inf')
patience, decay_num = 0, 0

# Use the GPU when one is attached and fall back to the CPU otherwise, so the
# notebook still runs on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training schedule (log_every / validation_every are counted in iterations)
max_epoch = 3000
log_every = 500
validation_every = 500

# Model size and batch size
dropout = 0.5
embed_size = 128
hidden_size = 128
batch_size = 1024 * 2

# Optimisation and learning-rate decay
clip_max_norm = 5.0
lr = 0.001
patience_threshold = 0.98  # dev loss must drop below this fraction of the best loss to count as an improvement
max_patience = 4           # validations without improvement before the learning rate is decayed
max_decay = 3              # number of decays after which training stops
lr_decay = 0.5             # factor applied to the learning rate at each decay


In [None]:
### step 7. instantiate model and hyper-parameters

### step 7.1: instantiate Bi-LSTM-CRF model with word,tag and hyper-paramters

model = BiLSTMCRF(sent_vocab, tag_vocab, float(dropout), int(embed_size), int(hidden_size)).to(device)

# Parameters whose name contains 'weight' get small Gaussian values;
# every other parameter is set to zero.
for name, param in model.named_parameters():
    if 'weight' not in name:
        nn.init.constant_(param.data, 0)
        continue
    nn.init.normal_(param.data, 0, 0.01)


In [None]:
### step 7.2: define optimizer

optimizer = torch.optim.Adam(model.parameters(), lr=float(lr))

# Number of training iterations (batches) processed so far.
train_iter = 0

# Running sums since the last training log line.
record_loss_sum = record_tgt_word_sum = record_batch_size = 0

# Running sums since the last validation.
cum_loss_sum = cum_tgt_word_sum = cum_batch_size = 0

# Timers matching the two groups of running sums above.
record_start = time.time()
cum_start = time.time()

In [None]:
### step 8: training process

print('start training...')
early_stop = False  # becomes True once the learning rate has been decayed max_decay times
for epoch in range(max_epoch):
  for sentences, tags in batch_iter(train_data, batch_size=int(batch_size)):
    train_iter += 1
    current_batch_size = len(sentences)
    sentences, sent_lengths = pad(sentences, sent_vocab[sent_vocab.PAD], device)
    tags, _ = pad(tags, tag_vocab[tag_vocab.PAD], device)

    # back propagation
    optimizer.zero_grad()
    batch_loss = model(sentences, tags, sent_lengths)  # shape: (b,)
    loss = batch_loss.mean()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_max_norm))
    optimizer.step()

    batch_loss_sum = batch_loss.sum().item()
    batch_word_sum = sum(sent_lengths)

    record_loss_sum += batch_loss_sum
    record_batch_size += current_batch_size
    record_tgt_word_sum += batch_word_sum

    cum_loss_sum += batch_loss_sum
    cum_batch_size += current_batch_size
    cum_tgt_word_sum += batch_word_sum

    if train_iter % log_every == 0:
      print('log: epoch %d, iter %d, %.1f words/sec, avg_loss %f, time %.1f sec' %(epoch + 1, train_iter, record_tgt_word_sum / (time.time() - record_start), record_loss_sum / record_batch_size, time.time() - record_start))
      record_loss_sum, record_batch_size, record_tgt_word_sum = 0, 0, 0
      record_start = time.time()

    if train_iter % validation_every == 0:
      print('dev: epoch %d, iter %d, %.1f words/sec, avg_loss %f, time %.1f sec' % (epoch + 1, train_iter, cum_tgt_word_sum / (time.time() - cum_start),cum_loss_sum / cum_batch_size, time.time() - cum_start))
      cum_loss_sum, cum_batch_size, cum_tgt_word_sum = 0, 0, 0

      # The checkpoint is written only when the dev loss improves, so the
      # files on disk always hold the best model seen so far. That is what
      # the learning-rate decay step below rolls back to.
      dev_loss = cal_dev_loss(model, dev_data, 216, sent_vocab, tag_vocab, device)
      if dev_loss < min_dev_loss * float(patience_threshold):
        min_dev_loss = dev_loss
        model.save(model_save_path)
        torch.save(optimizer.state_dict(), optimizer_save_path)
        patience = 0
      else:
        patience += 1
        if patience == int(max_patience):
          decay_num += 1
          if decay_num == int(max_decay):
            print('Early stop. Save result model to %s' % model_save_path)
            early_stop = True
            break

          # Decay the learning rate and restart from the best checkpoint.
          # The weights are loaded into the existing model object so the
          # optimizer keeps pointing at the parameters that are being trained.
          lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
          checkpoint = torch.load(model_save_path, map_location=device)
          model.load_state_dict(checkpoint['state_dict'])
          optimizer.load_state_dict(torch.load(optimizer_save_path))
          for param_group in optimizer.param_groups:
            param_group['lr'] = lr
          patience = 0
      print('dev: epoch %d, iter %d, dev_loss %f, patience %d, decay_num %d' %(epoch + 1, train_iter, dev_loss, patience, decay_num))
      cum_start = time.time()
      if train_iter % log_every == 0:
        record_start = time.time()
  if early_stop:
    break
if not early_stop:
  print('Reached %d epochs, Save result model to %s' % (max_epoch, model_save_path))

start training...
log: epoch 84, iter 500, 73246.1 words/sec, avg_loss 27.093003, time 250.1 sec
dev: epoch 84, iter 500, 73246.0 words/sec, avg_loss 27.093003, time 250.1 sec
dev: epoch 84, iter 500, dev_loss 1.523411, patience 0, decay_num 0
log: epoch 167, iter 1000, 73055.4 words/sec, avg_loss 11.250969, time 250.8 sec
dev: epoch 167, iter 1000, 73055.3 words/sec, avg_loss 11.250969, time 250.8 sec
dev: epoch 167, iter 1000, dev_loss 1.383882, patience 0, decay_num 0
log: epoch 250, iter 1500, 72972.2 words/sec, avg_loss 4.873937, time 250.5 sec
dev: epoch 250, iter 1500, 72972.2 words/sec, avg_loss 4.873937, time 250.5 sec
dev: epoch 250, iter 1500, dev_loss 1.497915, patience 1, decay_num 0
log: epoch 334, iter 2000, 72869.7 words/sec, avg_loss 2.726739, time 251.4 sec
dev: epoch 334, iter 2000, 72869.7 words/sec, avg_loss 2.726739, time 251.4 sec
dev: epoch 334, iter 2000, dev_loss 1.554167, patience 2, decay_num 0
log: epoch 417, iter 2500, 72745.4 words/sec, avg_loss 1.886146,

In [None]:
#### step 9: Test model with test dataset

In [None]:
#### step 9.1: reload the word and tag vocabularies from the JSON files saved in step 5.2
sent_vocab = Vocab.load(sent_vocab_file)
tag_vocab = Vocab.load(tag_vocab_file)

In [None]:
### step 9.2: load the test dataset (this rebinds `sentences` and `tags` to the test corpus)
sentences, tags = read_corpus(test_data)

In [None]:
### step 9.3: convert the first four test sentences from text into indices
sentences_indice = words2indices(sentences[:4], sent_vocab)


In [None]:
### step 9.4: keep only the first four test sentences (the same ones encoded in step 9.3)
sentences=sentences[:4]

In [None]:
# Display the index-encoded preview sentences.
sentences_indice

[[0,
  5,
  1746,
  4471,
  12577,
  12577,
  10318,
  1459,
  379,
  4,
  12577,
  88,
  12577,
  177,
  1],
 [0, 1871, 426, 63, 228, 205, 1],
 [0, 468, 1],
 [0, 149, 537, 330, 223, 13, 2337, 315, 330, 223, 1]]

In [None]:
### step 9.5: convert the tags of the first four test sentences into indices
tags_indice = words2indices(tags[:4], tag_vocab)

In [None]:
### step 9.6: pair each encoded test sentence with its encoded tags
test_objects = list(zip(sentences_indice, tags_indice))

In [None]:
### step 9.7: reload model

# Use the GPU when one is attached and fall back to the CPU otherwise, so the
# saved model can also be tested on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BiLSTMCRF.load(model_file, device)
print('start testing...')
print('using device', device)

start testing...
using device cuda


In [None]:
# Switch to evaluation mode so that dropout is disabled during prediction.
model.eval()


BiLSTMCRF(
  (embedding): Embedding(13002, 128)
  (dropout): Dropout(p=0.5, inplace=False)
  (encoder): LSTM(128, 128, bidirectional=True)
  (hidden2emit_score): Linear(in_features=256, out_features=27, bias=True)
)

In [None]:
### step 9.8: define function to convert tokens and their predicted tags into character-offset entity spans

def tokens_to_spans( tokens, tags):
    """Convert tokens and their B/I/E/S/O tags into character-offset spans.

    Offsets refer to the sentence obtained by joining the tokens with single
    spaces. ``B-X`` opens an entity, ``I-X`` and ``E-X`` extend the entity
    that is currently open (``E-X`` also closes it), ``S-X`` is a one-token
    entity, and any other tag closes the open entity. ``I-``/``E-`` tags with
    no open entity are ignored.

    Args:
        tokens: list of token strings.
        tags: list of tag strings, parallel to ``tokens``.

    Returns:
        ``(sentence, spans)`` where ``spans`` is a list of
        ``(start, end, entity_class)`` tuples with ``sentence[start:end]``
        being the entity text. Each entity is reported exactly once.
    """
    sentence = " ".join(tokens)
    spans = []
    curr = 0  # character offset of the current token inside `sentence`
    start, end, ent_cls = None, None, None

    for token, tag in zip(tokens, tags):
        prefix, _, label = tag.partition("-")
        token_end = curr + len(token)

        if prefix in ("B", "S") and label:
            # A new entity starts here; close one that is still open.
            if ent_cls is not None:
                spans.append((start, end, ent_cls))
            start, end, ent_cls = curr, token_end, label
            if prefix == "S":
                spans.append((start, end, ent_cls))
                start, end, ent_cls = None, None, None
        elif prefix in ("I", "E") and label:
            if ent_cls is not None:
                end = token_end
                if prefix == "E":
                    spans.append((start, end, ent_cls))
                    start, end, ent_cls = None, None, None
        elif ent_cls is not None:
            # "O" or any other tag ends the open entity.
            spans.append((start, end, ent_cls))
            start, end, ent_cls = None, None, None

        # advance past this token and the joining space
        curr = token_end + 1

    # handle an entity that runs to the end of the sentence
    if ent_cls is not None:
        spans.append((start, end, ent_cls))
    return sentence, spans

In [None]:
### step 10: try the model on sentences without tags
### step 10.1: read the test file and keep only the tokens of each sentence
test_sentences=[]
with open(test_data,'r',encoding='utf-8') as f:
  # sentences are separated by an empty line
  for line in f.read().split('\n\n'):
        # split every row of the sentence into its tab-separated fields
        line=[l.split('\t') for l in line.split('\n')]
        # keep the first field (the token) of rows that have exactly two fields
        words=[i[0] for i in line if len(i)==2]
        # NOTE(review): a chunk with no two-field rows appends an empty string here
        test_sentences.append(' '.join(words))

In [None]:
!pip install intervaltree



In [None]:
from intervaltree import Interval, IntervalTree

In [None]:
### step 10.2: define hyper-parameters
# Length to which each token-id list is padded before prediction in step 10.3.
# NOTE(review): a later cell assigns max_len = 88 at module level, which
# overwrites this value once that cell has run.
max_len = 223

In [None]:
### step 10.3 : run model with test dataset 
# Inference only: disable gradient tracking so no autograd graph is built
# for each predicted sentence.
with torch.no_grad():
  for test_sent in test_sentences[:500]:
    ori_tokens=test_sent.split(' ')
    tokens=['<START>']+ori_tokens+['<END>']

    tokens_idx = words2indices([tokens], sent_vocab)[0]

    lengths = len(tokens_idx)

    # pad the single-sentence batch up to max_len with the padding id
    padded_data=tokens_idx + [sent_vocab[sent_vocab.PAD]] * (max_len - len(tokens_idx))
    padded_tokens_idx, tokens_idx_len=torch.tensor([padded_data], device=device),[lengths]
    # [0] picks the only sentence of the batch; [1:-1] drops the <START>/<END> positions
    pred_tag_idx = model.predict(padded_tokens_idx, tokens_idx_len)[0][1:-1]
    pred_tags=[tag_vocab.id2word(p) for p in pred_tag_idx]
    print(ori_tokens)
    print(pred_tags,'\n')

['Mr.', 'Koh', 'How', 'Thim', '(高厚琛先生)', '125', 'Kim', 'Tian', 'Road', '#05-96', 'Singapore', '160125', 'Singaporean']
['B-PERSON', 'I-PERSON', 'I-PERSON', 'E-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 

['ICBCI', '(Joint', 'Lead', 'Manager', 'only)']
['B-J', 'O', 'O', 'O', 'O'] 

['DBS']
['S-J'] 

['United', 'Kingdom', 'J.P.', 'Morgan', 'Securities', 'LLC,', 'or', 'J.P.', 'Morgan']
['O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'E-ORG', 'O', 'B-J', 'E-J'] 

['Industry', 'consultant', 'Ipsos', 'Sdn.', 'Bhd.', '23rd', 'Floor,', 'Centerpoint', 'North', 'Mid', 'Valley', 'City,', 'Lingkaran', 'Syed', 'Putra', '59200', 'Kuala', 'Lumpur', 'Malaysia']
['B-TITLE', 'E-TITLE', 'B-ORG', 'I-ORG', 'E-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 

['Auditor', 'and', 'Reporting', 'Accountants', 'Deloitte', 'Touche', 'Tohmatsu']
['B-TITLE', 'I-TITLE', 'I-TITLE', 'E-TITLE', 'B-ORG', 'I-ORG', 'E-ORG'] 

['Legal', 'advisers', 'to', 'our', 'Company', 'As', 'Hong', 'Kong', 

In [None]:

# Predict tags for every sample sentence and print the entities found in it.
# NOTE(review): `test_sample` and `test_token_idx` are not defined in any cell
# shown above. They presumably hold the <START>/<END>-wrapped test sentences
# and their index-encoded form — confirm and add the defining cell.
with torch.no_grad(): 
 for sent_sample,sent_sample_token_idx in zip(test_sample,test_token_idx):
  
    lengths = len(sent_sample_token_idx)
    # NOTE(review): this assignment is at module level, so it also overwrites
    # the max_len defined in step 10.2.
    max_len = 88

    
    padded_data=sent_sample_token_idx + [sent_vocab[sent_vocab.PAD]] * (max_len - len(sent_sample_token_idx))
 
    padded_sent_idx, sent_idx_len=torch.tensor([padded_data], device=device),[lengths]

    # [0] picks the only sentence of the batch; [1:-1] drops the first and last positions
    pred_tag_idx = model.predict(padded_sent_idx, sent_idx_len)[0][1:-1]

    pred_tags=[tag_vocab.id2word(p) for p in pred_tag_idx]
    
    
    # from here on sent_sample is the space-joined sentence string
    sent_sample,sent_span=tokens_to_spans( sent_sample[1:-1],pred_tags)
    # drop duplicate spans, then merge overlapping ones in a copy of the tree
    sent_span=list(set(sent_span))
    t = IntervalTree()
    for s in sent_span:
      t[s[0]:s[1]] = s[2]
    t_=t.copy()
    t_.merge_overlaps()
    print(r'sample sentence:',sent_sample)
    for tt in list(t_):
        t_start,t_end=tt.begin,tt.end
  
        # print the span whose boundaries match the merged interval exactly
        for s in sent_span:
          if s[0]==t_start and s[1]==t_end:
            print(r'entity:         ',sent_sample[t_start:t_end],':',s[2])
    print('\n')
    #print(sent_sample,sent_span)
   #   sent, pred_tags = sent[1: -1],  pred_tags[1: -1]
   #   print(sent,pred_tags)


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m


sample sentence: Authorised representatives ZHANG Lianghong (張梁洪) Shanxianghu Xiaozu Pufa Village Committee Changning Town Boluo County Guangdong Province, the PRC*


sample sentence: Compliance adviser Alliance Capital Partners Limited a licensed corporation under the SFC to carry on Type 1 (dealing in securities) and 6 (advising corporate finance) regulated activities SFO Room 1502–1503A, Wing On House 71 Des Voeux Road Central Central, Hong Kong
entity:          Alliance Capital Partners Limited : ORG
entity:          Compliance adviser : TITLE


sample sentence: Company secretary . Mr. Lau Wai Piu Patrick (劉偉彪) (Fellow member of the Hong Kong Institute Certified Public Accountants) Unit B, 13th Floor Winsan Tower 98 Thomson Road Wanchai,
entity:          Company secretary : TITLE
entity:          Mr. Lau Wai Piu Patrick : PERSON


sample sentence: United Kingdom J.P. Morgan Securities LLC, or J.P. Morgan (Joint Bookrunner in relation to the Inte