### Train a biomedical NER model from scratch

In [1]:
# 1. get the corpus
import os

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column index -> annotation layer name for the column-formatted files.
# Only columns 0, 1 and 3 are mapped; column 2 is not read.
columns = {0: 'text', 1: 'pos', 3: 'ner'}

# Folder in which the train, test and dev files reside.
# Set the NCBI_DISEASE_DATA_DIR environment variable to point at another copy
# of the data; when it is unset (or empty) the original location is used, so
# the notebook behaves exactly as before on the original machine.
data_folder = (
    os.environ.get('NCBI_DISEASE_DATA_DIR')
    or 'D:/FloridaBlue/Flair/Optim/data/ner/NCBI-disease'
)

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')

# Print the corpus summary (sentence counts per split) as a quick sanity check.
print(corpus)

2020-09-18 00:02:15,222 Reading data from D:\FloridaBlue\Flair\Optim\data\ner\NCBI-disease
2020-09-18 00:02:15,222 Train: D:\FloridaBlue\Flair\Optim\data\ner\NCBI-disease\train.txt
2020-09-18 00:02:15,222 Dev: D:\FloridaBlue\Flair\Optim\data\ner\NCBI-disease\dev.txt
2020-09-18 00:02:15,222 Test: D:\FloridaBlue\Flair\Optim\data\ner\NCBI-disease\test.txt
Corpus: 5424 train + 923 dev + 940 test sentences


In [2]:
# 2. make the tag dictionary from the corpus
# (built from the 'ner' layer of the `corpus` loaded in the previous cell)
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

# 3. initialize embeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# The three embeddings below are combined, in this order, by StackedEmbeddings.
embedding_types = [
    WordEmbeddings('pubmed'),            # word embeddings trained on PubMed and PMC
    FlairEmbeddings('pubmed-forward'),   # flair embeddings trained on PubMed and PMC
    FlairEmbeddings('pubmed-backward'),  # same family, 'backward' variant
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [3]:
# 4. initialize sequence tagger
from flair.models import SequenceTagger

# The tagger consumes the stacked embeddings and predicts labels from the
# 'ner' tag dictionary; both come from the previous cell.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    hidden_size=256,      # size of the tagger's LSTM hidden state
    use_crf=True,
    locked_dropout=0.5,
)

In [4]:
# 5. initialize trainer
from flair.trainers import ModelTrainer

# Bind the tagger to the corpus it will be trained and evaluated on.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Run training. This call is the last expression of the cell, so its return
# value (a dict holding the test score and the per-epoch dev score history)
# is displayed as the cell output.
trainer.train(
    base_path='taggers/ncbi-disease-flair-word-readdataset',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=200,
    train_with_dev=False,
)

2020-09-18 00:02:41,359 ----------------------------------------------------------------------------------------------------
2020-09-18 00:02:41,359 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('pubmed')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4296, out_features=4296, bias=True)
  (rnn): LSTM(4296, 256, batch_first=True, bi

2020-09-18 00:06:53,556 BAD EPOCHS (no improvement): 1
2020-09-18 00:06:53,557 ----------------------------------------------------------------------------------------------------
2020-09-18 00:06:57,624 epoch 5 - iter 17/170 - loss 0.89187520 - samples/sec: 133.79 - lr: 0.100000
2020-09-18 00:07:01,956 epoch 5 - iter 34/170 - loss 0.93408523 - samples/sec: 125.64 - lr: 0.100000
2020-09-18 00:07:05,895 epoch 5 - iter 51/170 - loss 0.94454618 - samples/sec: 138.14 - lr: 0.100000
2020-09-18 00:07:09,856 epoch 5 - iter 68/170 - loss 0.93506569 - samples/sec: 137.59 - lr: 0.100000
2020-09-18 00:07:13,580 epoch 5 - iter 85/170 - loss 0.91479233 - samples/sec: 146.14 - lr: 0.100000
2020-09-18 00:07:17,490 epoch 5 - iter 102/170 - loss 0.91909872 - samples/sec: 139.52 - lr: 0.100000
2020-09-18 00:07:21,661 epoch 5 - iter 119/170 - loss 0.90916990 - samples/sec: 130.43 - lr: 0.100000
2020-09-18 00:07:25,623 epoch 5 - iter 136/170 - loss 0.89463209 - samples/sec: 137.34 - lr: 0.100000
2020-09-1

2020-09-18 00:11:31,629 epoch 10 - iter 136/170 - loss 0.60109235 - samples/sec: 139.25 - lr: 0.100000
2020-09-18 00:11:35,549 epoch 10 - iter 153/170 - loss 0.59377645 - samples/sec: 138.76 - lr: 0.100000
2020-09-18 00:11:39,669 epoch 10 - iter 170/170 - loss 0.61314742 - samples/sec: 132.05 - lr: 0.100000
2020-09-18 00:11:39,669 ----------------------------------------------------------------------------------------------------
2020-09-18 00:11:39,669 EPOCH 10 done: loss 0.6131 - lr 0.1000000
2020-09-18 00:11:44,718 DEV : loss 0.5931841135025024 - score 0.865
2020-09-18 00:11:44,780 BAD EPOCHS (no improvement): 3
2020-09-18 00:11:44,789 ----------------------------------------------------------------------------------------------------
2020-09-18 00:11:48,872 epoch 11 - iter 17/170 - loss 0.53825267 - samples/sec: 133.60 - lr: 0.100000
2020-09-18 00:11:52,845 epoch 11 - iter 34/170 - loss 0.55978375 - samples/sec: 136.96 - lr: 0.100000
2020-09-18 00:11:56,818 epoch 11 - iter 51/170 -

2020-09-18 00:15:59,327 epoch 16 - iter 34/170 - loss 0.48419668 - samples/sec: 147.19 - lr: 0.100000
2020-09-18 00:16:03,337 epoch 16 - iter 51/170 - loss 0.51148497 - samples/sec: 135.68 - lr: 0.100000
2020-09-18 00:16:07,391 epoch 16 - iter 68/170 - loss 0.50779985 - samples/sec: 134.18 - lr: 0.100000
2020-09-18 00:16:11,646 epoch 16 - iter 85/170 - loss 0.51349543 - samples/sec: 127.84 - lr: 0.100000
2020-09-18 00:16:15,516 epoch 16 - iter 102/170 - loss 0.50470270 - samples/sec: 140.56 - lr: 0.100000
2020-09-18 00:16:19,630 epoch 16 - iter 119/170 - loss 0.50637068 - samples/sec: 132.23 - lr: 0.100000
2020-09-18 00:16:23,550 epoch 16 - iter 136/170 - loss 0.50876104 - samples/sec: 138.78 - lr: 0.100000
2020-09-18 00:16:27,381 epoch 16 - iter 153/170 - loss 0.51450126 - samples/sec: 142.02 - lr: 0.100000
2020-09-18 00:16:31,318 epoch 16 - iter 170/170 - loss 0.51589805 - samples/sec: 138.21 - lr: 0.100000
2020-09-18 00:16:31,318 -----------------------------------------------------

2020-09-18 00:20:15,692 epoch 21 - iter 170/170 - loss 0.39766587 - samples/sec: 139.28 - lr: 0.050000
2020-09-18 00:20:15,692 ----------------------------------------------------------------------------------------------------
2020-09-18 00:20:15,692 EPOCH 21 done: loss 0.3977 - lr 0.0500000
2020-09-18 00:20:20,716 DEV : loss 0.5868346691131592 - score 0.8621
Epoch    21: reducing learning rate of group 0 to 2.5000e-02.
2020-09-18 00:20:20,787 BAD EPOCHS (no improvement): 4
2020-09-18 00:20:20,789 ----------------------------------------------------------------------------------------------------
2020-09-18 00:20:24,821 epoch 22 - iter 17/170 - loss 0.39045621 - samples/sec: 134.94 - lr: 0.025000
2020-09-18 00:20:28,863 epoch 22 - iter 34/170 - loss 0.39491051 - samples/sec: 134.61 - lr: 0.025000
2020-09-18 00:20:32,957 epoch 22 - iter 51/170 - loss 0.39480124 - samples/sec: 132.85 - lr: 0.025000
2020-09-18 00:20:36,775 epoch 22 - iter 68/170 - loss 0.38915492 - samples/sec: 142.51 - 

2020-09-18 00:24:17,337 epoch 27 - iter 51/170 - loss 0.33723315 - samples/sec: 137.51 - lr: 0.012500
2020-09-18 00:24:21,304 epoch 27 - iter 68/170 - loss 0.35854419 - samples/sec: 137.10 - lr: 0.012500
2020-09-18 00:24:25,133 epoch 27 - iter 85/170 - loss 0.35326548 - samples/sec: 142.12 - lr: 0.012500
2020-09-18 00:24:29,219 epoch 27 - iter 102/170 - loss 0.34716650 - samples/sec: 133.21 - lr: 0.012500
2020-09-18 00:24:33,305 epoch 27 - iter 119/170 - loss 0.34130911 - samples/sec: 133.40 - lr: 0.012500
2020-09-18 00:24:37,224 epoch 27 - iter 136/170 - loss 0.33419773 - samples/sec: 138.83 - lr: 0.012500
2020-09-18 00:24:41,156 epoch 27 - iter 153/170 - loss 0.33337409 - samples/sec: 138.41 - lr: 0.012500
2020-09-18 00:24:45,258 epoch 27 - iter 170/170 - loss 0.33337686 - samples/sec: 132.60 - lr: 0.012500
2020-09-18 00:24:45,258 ----------------------------------------------------------------------------------------------------
2020-09-18 00:24:45,258 EPOCH 27 done: loss 0.3334 - l

2020-09-18 00:28:29,653 ----------------------------------------------------------------------------------------------------
2020-09-18 00:28:29,653 EPOCH 32 done: loss 0.3305 - lr 0.0062500
2020-09-18 00:28:34,645 DEV : loss 0.5325443148612976 - score 0.8764
2020-09-18 00:28:34,716 BAD EPOCHS (no improvement): 3
2020-09-18 00:28:34,716 ----------------------------------------------------------------------------------------------------
2020-09-18 00:28:38,626 epoch 33 - iter 17/170 - loss 0.29750031 - samples/sec: 139.26 - lr: 0.006250
2020-09-18 00:28:42,403 epoch 33 - iter 34/170 - loss 0.31811356 - samples/sec: 144.04 - lr: 0.006250
2020-09-18 00:28:46,360 epoch 33 - iter 51/170 - loss 0.31450960 - samples/sec: 137.48 - lr: 0.006250
2020-09-18 00:28:50,281 epoch 33 - iter 68/170 - loss 0.31428859 - samples/sec: 139.08 - lr: 0.006250
2020-09-18 00:28:54,183 epoch 33 - iter 85/170 - loss 0.30714149 - samples/sec: 139.51 - lr: 0.006250
2020-09-18 00:28:58,188 epoch 33 - iter 102/170 - 

2020-09-18 00:32:35,012 epoch 38 - iter 68/170 - loss 0.32549229 - samples/sec: 136.63 - lr: 0.001563
2020-09-18 00:32:38,948 epoch 38 - iter 85/170 - loss 0.32145918 - samples/sec: 138.23 - lr: 0.001563
2020-09-18 00:32:43,010 epoch 38 - iter 102/170 - loss 0.31129124 - samples/sec: 133.94 - lr: 0.001563
2020-09-18 00:32:46,906 epoch 38 - iter 119/170 - loss 0.31501892 - samples/sec: 139.64 - lr: 0.001563
2020-09-18 00:32:50,743 epoch 38 - iter 136/170 - loss 0.31675252 - samples/sec: 141.78 - lr: 0.001563
2020-09-18 00:32:54,670 epoch 38 - iter 153/170 - loss 0.32111842 - samples/sec: 138.53 - lr: 0.001563
2020-09-18 00:32:58,648 epoch 38 - iter 170/170 - loss 0.32335200 - samples/sec: 136.74 - lr: 0.001563
2020-09-18 00:32:58,648 ----------------------------------------------------------------------------------------------------
2020-09-18 00:32:58,649 EPOCH 38 done: loss 0.3234 - lr 0.0015625
2020-09-18 00:33:03,647 DEV : loss 0.5304630398750305 - score 0.8785
2020-09-18 00:33:03,7

2020-09-18 00:36:42,426 EPOCH 43 done: loss 0.3257 - lr 0.0007813
2020-09-18 00:36:47,857 DEV : loss 0.5393401980400085 - score 0.876
2020-09-18 00:36:47,928 BAD EPOCHS (no improvement): 2
2020-09-18 00:36:47,928 ----------------------------------------------------------------------------------------------------
2020-09-18 00:36:52,106 epoch 44 - iter 17/170 - loss 0.35665796 - samples/sec: 130.38 - lr: 0.000781
2020-09-18 00:36:56,056 epoch 44 - iter 34/170 - loss 0.37376548 - samples/sec: 137.72 - lr: 0.000781
2020-09-18 00:36:59,951 epoch 44 - iter 51/170 - loss 0.35279468 - samples/sec: 139.67 - lr: 0.000781
2020-09-18 00:37:03,990 epoch 44 - iter 68/170 - loss 0.34280395 - samples/sec: 134.70 - lr: 0.000781
2020-09-18 00:37:08,111 epoch 44 - iter 85/170 - loss 0.34460978 - samples/sec: 131.99 - lr: 0.000781
2020-09-18 00:37:11,941 epoch 44 - iter 102/170 - loss 0.33491708 - samples/sec: 142.12 - lr: 0.000781
2020-09-18 00:37:15,845 epoch 44 - iter 119/170 - loss 0.33394136 - sampl

2020-09-18 00:40:55,323 epoch 49 - iter 102/170 - loss 0.31615778 - samples/sec: 136.75 - lr: 0.000391
2020-09-18 00:40:59,438 epoch 49 - iter 119/170 - loss 0.31353262 - samples/sec: 132.51 - lr: 0.000391
2020-09-18 00:41:03,227 epoch 49 - iter 136/170 - loss 0.31687287 - samples/sec: 143.58 - lr: 0.000391
2020-09-18 00:41:07,179 epoch 49 - iter 153/170 - loss 0.32412925 - samples/sec: 138.00 - lr: 0.000391
2020-09-18 00:41:11,084 epoch 49 - iter 170/170 - loss 0.32499816 - samples/sec: 139.33 - lr: 0.000391
2020-09-18 00:41:11,084 ----------------------------------------------------------------------------------------------------
2020-09-18 00:41:11,084 EPOCH 49 done: loss 0.3250 - lr 0.0003906
2020-09-18 00:41:16,083 DEV : loss 0.5407122373580933 - score 0.8719
Epoch    49: reducing learning rate of group 0 to 1.9531e-04.
2020-09-18 00:41:16,157 BAD EPOCHS (no improvement): 4
2020-09-18 00:41:16,157 ------------------------------------------------------------------------------------

{'test_score': 0.8647294589178357,
 'dev_score_history': [0.6949384404924761,
  0.760989010989011,
  0.8270967741935485,
  0.8021327014218009,
  0.7961564859299931,
  0.8395546823837591,
  0.86625,
  0.8564231738035265,
  0.8484848484848484,
  0.864965774735532,
  0.8673926969891095,
  0.8612321095208463,
  0.8849009900990098,
  0.8757097791798107,
  0.8748451053283767,
  0.8607120549656464,
  0.862600123228589,
  0.8628181253879578,
  0.8621334996880848,
  0.8692933083176986,
  0.8621334996880848,
  0.8657844990548205,
  0.8675914249684741,
  0.8700564971751413,
  0.8735053492762744,
  0.8757841907151819,
  0.8670012547051442,
  0.8757097791798107,
  0.8692356285533797,
  0.8754716981132076,
  0.8751576292559899,
  0.8764186633039093,
  0.8775894538606404,
  0.8763339610797238,
  0.8767295597484277,
  0.8758664146187777,
  0.8753148614609572,
  0.8785399622404028,
  0.8758664146187777,
  0.8758664146187777,
  0.8733459357277883,
  0.8754716981132076,
  0.8760226557583387,
  0.87295597