### Base BERT with second-to-last-layer pooling for word representations; a CRF then computes the sentence score, and the model is trained by minimizing the negative log-likelihood

In [1]:
from pathlib import Path

import torch

from utils.bert_model import BERT_CRF
from utils.utils import prepare, predict, train, evaluate

# Seed torch's random number generators so repeated "Restart & Run All"
# executions start from the same random state.
# NOTE(review): this seeds torch only; if utils.prepare/train draw from
# `random` or numpy, those generators need seeding as well — confirm.
SEED = 42
torch.manual_seed(SEED)

# Experiment configuration. The dict is passed unchanged to prepare(),
# BERT_CRF() and train() in the cells below.
hparams = {
    # 'path':'data/data_test_en.txt',  # presumably an alternative dataset kept for quick switching
    'path': 'data/data.txt',
    'epochs': 100,
    'batch_size': 32,
    'embedding_dim': 768,
    # Fall back to CPU so the notebook does not crash on hosts without a GPU.
    'device': 'cuda:0' if torch.cuda.is_available() else 'cpu',
    'bert': 'bert-base-chinese',
    'seq_length': 256,
    'learning_rate': 3e-5,
    # Target file of the torch.save cell further down.
    'save_path': 'model_params/bert_base_model',
}

In [10]:
# Build everything the later cells need from the config: attr_dict (read below
# for its 'tag2idx' and 'tokenizer' entries) and loaders (passed to train() and
# indexed as loaders[1] for evaluation).
attr_dict, loaders = prepare(hparams)

In [3]:
# Instantiate the BERT_CRF model from the config and the tag-to-index mapping
# produced by prepare(), then move it to the configured device.
tag2idx = attr_dict['tag2idx']
bert_model = BERT_CRF(hparams, tag2idx)
bert_model = bert_model.to(hparams['device'])

In [4]:
# Train the model on the prepared loaders with the configured learning rate;
# whatever train() returns replaces bert_model and is what the later cells
# evaluate, save and predict with.
# NOTE(review): schedule=True presumably enables a learning-rate schedule inside train() — confirm in utils.utils.
# NOTE(review): re-running this cell feeds the already-trained bert_model back into train(),
# so it presumably continues training rather than starting fresh — re-run the
# model-construction cell first for a clean run.
bert_model = train(hparams, bert_model, loaders, lr=hparams['learning_rate'], schedule=True)

epoch 1 , step 27 , loss: 316.9347: : 28it [00:20,  1.34it/s]
{'macro_f1': 0.1628, 'weighted_f1': 0.8052}
epoch 2 , step 27 , loss: 152.0206: : 28it [00:20,  1.33it/s]
{'macro_f1': 0.3235, 'weighted_f1': 0.8574}
epoch 3 , step 27 , loss: 109.7785: : 28it [00:20,  1.36it/s]
{'macro_f1': 0.4243, 'weighted_f1': 0.8813}
epoch 4 , step 27 , loss: 85.2446: : 28it [00:20,  1.37it/s]
{'macro_f1': 0.447, 'weighted_f1': 0.8882}
epoch 5 , step 27 , loss: 72.6541: : 28it [00:20,  1.35it/s]
{'macro_f1': 0.4732, 'weighted_f1': 0.8947}
epoch 6 , step 27 , loss: 59.6001: : 28it [00:20,  1.38it/s]
{'macro_f1': 0.4989, 'weighted_f1': 0.8986}
epoch 7 , step 27 , loss: 50.1959: : 28it [00:20,  1.37it/s]
{'macro_f1': 0.522, 'weighted_f1': 0.9015}
epoch 8 , step 27 , loss: 43.1029: : 28it [00:20,  1.36it/s]
{'macro_f1': 0.5324, 'weighted_f1': 0.9028}
epoch 9 , step 27 , loss: 38.2612: : 28it [00:20,  1.36it/s]
{'macro_f1': 0.5398, 'weighted_f1': 0.9053}
epoch 10 , step 27 , loss: 34.9644: : 28it [00:20,  1.

In [11]:
# Report per-tag precision/recall/F1 plus macro and weighted F1 on loaders[1].
# NOTE(review): loaders[1] is presumably the held-out split returned by prepare() — confirm.
# NOTE(review): the recorded output comes from In [11], after prepare() was re-run as In [10];
# if prepare() re-splits the data randomly, this score may include examples the
# model was trained on — confirm before quoting the numbers.
evaluate(bert_model, loaders[1])

              precision    recall  f1-score   support

           2       1.00      1.00      1.00    167874
           3       0.93      0.91      0.92       143
           4       0.98      0.97      0.97      1505
           5       0.93      0.91      0.92       377
           6       0.98      0.95      0.96      6027
           7       0.96      0.95      0.95      1573
           8       0.97      0.98      0.98      6749
           9       0.95      0.94      0.95       294
          10       0.98      0.98      0.98      1448
          11       0.93      0.94      0.94       449
          12       0.97      0.96      0.96      2432
          13       0.95      0.96      0.96      1520
          14       0.97      0.95      0.96      2850
          15       0.93      0.93      0.93       771
          16       0.96      0.95      0.95      5565
          17       0.96      0.96      0.96      1574
          18       0.97      0.96      0.96      3162
          19       0.86    

{'macro_f1': 0.9544, 'weighted_f1': 0.99}

In [9]:
# Persist only the model's state_dict (its parameters and buffers).
# torch.save does not create missing parent directories, so make sure the
# folder that holds hparams['save_path'] exists before writing.
save_path = Path(hparams['save_path'])
save_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(bert_model.state_dict(), save_path)

In [4]:
# Restore the weights written by the save cell so the prediction cell can run
# on a fresh kernel without retraining (bert_model must already be constructed).
# map_location places the loaded tensors on the configured device, which also
# lets a checkpoint saved from a GPU load on a CPU-only host.
# The call is the last expression so its "<All keys matched successfully>"
# result is displayed, matching the recorded output.
bert_model.load_state_dict(torch.load(hparams['save_path'], map_location=hparams['device']))

<All keys matched successfully>

In [9]:
# Tag two hand-written sentences with the current model as a quick qualitative check.
# The trailing 50 is forwarded positionally to predict(); in the recorded output
# the first sentence is printed as 50 token/tag rows, so it is presumably the
# padded sequence length — confirm in utils.utils.
demo_sentences = [
    '中国人民大学第三十五届一二九合唱音乐节如期举行，信息学院分团委文化部将组织信院全体同学参加',
    '张配天获得最佳论文奖',
]
predict(demo_sentences, bert_model, attr_dict['tokenizer'], 50)

************************
[CLS] O
中 B-department
国 I-department
人 I-department
民 I-department
大 I-department
学 I-department
第 B-department
三 I-scholarship
十 I-scholarship
五 I-scholarship
届 I-scholarship
一 I-scholarship
二 I-scholarship
九 I-scholarship
合 I-scholarship
唱 I-scholarship
音 I-scholarship
乐 I-scholarship
节 I-scholarship
如 O
期 I-scholarship
举 O
行 O
， O
信 B-department
息 I-department
学 I-department
院 I-department
分 I-department
团 I-organization
委 I-organization
文 B-organization
化 I-organization
部 I-organization
将 O
组 O
织 O
信 O
院 O
全 O
体 O
同 O
学 O
参 O
加 O
[SEP] O
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
************************
[CLS] O
张 B-name
配 I-name
天 I-name
获 O
得 O
最 O
佳 I-award
论 O
文 O
奖 O
[SEP] O
[PAD] O
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD] [PAD]
[PAD

### Built-in BertForTokenClassification with labels passed as input: the output logits are normalized for classification, and the loss returned by the model is used for training

In [None]:
import torch
from utils.bert_model import BERT_BASE
from utils.utils import prepare, predict, train, evaluate

# Seed torch's random number generators so repeated "Restart & Run All"
# executions start from the same random state.
# NOTE(review): this seeds torch only; if utils.prepare/train draw from
# `random` or numpy, those generators need seeding as well — confirm.
SEED = 42
torch.manual_seed(SEED)

# Experiment configuration. The dict is passed unchanged to prepare(),
# BERT_BASE() and train() in the cells below.
hparams = {
    # 'path':'data/data_test_en.txt',  # presumably an alternative dataset kept for quick switching
    'path': 'data/data.txt',
    'epochs': 100,
    'batch_size': 40,
    'embedding_dim': 768,
    # Fall back to CPU so the notebook does not crash on hosts without a GPU.
    'device': 'cuda:0' if torch.cuda.is_available() else 'cpu',
    'bert': 'bert-base-chinese',
    'seq_length': 256,
    'learning_rate': 3e-5,
}

# Build the inputs for this section from the config: attr_dict (its 'tag2idx'
# entry is read when the model is constructed) and loaders (passed to train()
# and indexed as loaders[1] for evaluation).
attr_dict, loaders = prepare(hparams)

In [None]:
# Instantiate the BERT_BASE model from the config and the tag-to-index mapping
# produced by prepare(), then move it to the configured device.
tag2idx = attr_dict['tag2idx']
bert_model = BERT_BASE(hparams, tag2idx)
bert_model = bert_model.to(hparams['device'])

In [None]:
# Train the model on the prepared loaders with the configured learning rate;
# whatever train() returns replaces bert_model and is what the evaluation cell uses.
# NOTE(review): schedule=True presumably enables a learning-rate schedule inside train() — confirm in utils.utils.
# NOTE(review): re-running this cell feeds the already-trained bert_model back into train(),
# so it presumably continues training rather than starting fresh.
bert_model = train(hparams, bert_model, loaders, lr=hparams['learning_rate'], schedule=True)

In [None]:
# Evaluate the trained model on loaders[1].
# NOTE(review): loaders[1] is presumably the held-out split returned by prepare() — confirm.
evaluate(bert_model, loaders[1])

### NER-pretrained BERT with last-hidden-state pooling: the 768-dimensional hidden states are mapped directly to the tag-set space, and training minimizes the negative log-likelihood for classification

In [1]:
import torch
from utils.bert_model import BERT_NER
from utils.utils import prepare, predict, train, evaluate

# Seed torch's random number generators so repeated "Restart & Run All"
# executions start from the same random state.
# NOTE(review): this seeds torch only; if utils.prepare/train draw from
# `random` or numpy, those generators need seeding as well — confirm.
SEED = 42
torch.manual_seed(SEED)

# Experiment configuration. The dict is passed unchanged to prepare(),
# BERT_NER() and train() in the cells below.
hparams = {
    # 'path':'data/data_test_en.txt',  # presumably an alternative dataset kept for quick switching
    'path': 'data/data.txt',
    'epochs': 100,
    'batch_size': 32,
    'embedding_dim': 768,
    # Fall back to CPU so the notebook does not crash on hosts without a GPU.
    'device': 'cuda:0' if torch.cuda.is_available() else 'cpu',
    # 'bert':'dslim/bert-base-NER',  # alternative checkpoint, currently disabled
    'bert': 'ckiplab/bert-base-chinese-ner',
    'seq_length': 256,
    'learning_rate': 3e-5,
}
# Build the inputs for this section from the config: attr_dict (its 'tag2idx'
# entry is read when the model is constructed) and loaders (passed to train()
# and indexed as loaders[0] in the evaluation cell).
attr_dict, loaders = prepare(hparams)

In [2]:
# Instantiate the BERT_NER model from the config and the tag-to-index mapping
# produced by prepare(), then move it to the configured device.
tag2idx = attr_dict['tag2idx']
bert_model = BERT_NER(hparams, tag2idx)
bert_model = bert_model.to(hparams['device'])

In [3]:
# Train the model on the prepared loaders with the configured learning rate;
# whatever train() returns replaces bert_model and is what the evaluation cell uses.
# NOTE(review): schedule=True presumably enables a learning-rate schedule inside train() — confirm in utils.utils.
# NOTE(review): in the recorded run below, macro_f1 (0.0295) and weighted_f1 (0.2466) are
# identical for every epoch shown and the loss stalls around 63, so the model does
# not appear to be learning — investigate before relying on this configuration.
bert_model = train(hparams, bert_model, loaders, lr=hparams['learning_rate'], schedule=True)

epoch 1 , step 29 , loss: 82.0513: : 30it [00:03,  8.45it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 2 , step 29 , loss: 64.9663: : 30it [00:03,  9.09it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 3 , step 29 , loss: 63.0864: : 30it [00:03,  9.21it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 4 , step 29 , loss: 63.4545: : 30it [00:03,  9.11it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 5 , step 29 , loss: 64.0466: : 30it [00:03,  9.94it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 6 , step 29 , loss: 63.5238: : 30it [00:03,  9.86it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 7 , step 29 , loss: 63.3384: : 30it [00:03,  9.62it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 8 , step 29 , loss: 63.6640: : 30it [00:03,  8.89it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 9 , step 29 , loss: 63.2327: : 30it [00:03,  9.55it/s]
{'macro_f1': 0.0295, 'weighted_f1': 0.2466}
epoch 10 , step 29 , loss: 63.1874: : 30it [00:03,  9.0

In [None]:
# Evaluate the trained model on loaders[0].
# NOTE(review): the other two sections evaluate on loaders[1]; confirm whether
# loaders[0] is intentional here (e.g. checking the fit on a different split)
# or should be loaders[1] for a comparable score.
evaluate(bert_model, loaders[0])