## Drug NER with Flair NLP

In [1]:
# Read the column-formatted corpus (train / dev / test splits) into Flair.
import os

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column 0 of each data file is read as the token text, column 1 as the "ner" tag.
columns = {0: "text", 1: "ner"}

# The corpus location can be overridden with the DRUG_NER_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original location is used unchanged.
data_folder = os.environ.get("DRUG_NER_DATA_DIR", "/home/pjwa227/Corpus2/")

corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file="train.txt",
    test_file="test.txt",
    dev_file="val.txt",
)

2020-12-11 09:17:50,035 Reading data from /home/pjwa227/Corpus2
2020-12-11 09:17:50,036 Train: /home/pjwa227/Corpus2/train.txt
2020-12-11 09:17:50,036 Dev: /home/pjwa227/Corpus2/val.txt
2020-12-11 09:17:50,036 Test: /home/pjwa227/Corpus2/test.txt


In [2]:
# Sanity check: size of the training split, then one training item
# rendered with its "ner" tags inline.
train_size = len(corpus.train)
print(train_size)

sample = corpus.train[1]
print(sample.to_tagged_string("ner"))

6108
INTRAVENTRICULAR HEMORRHAGE COUMADIN <S-DRUG> TOXICITY


In [3]:
# Name of the tag layer the model should predict; it matches the "ner"
# column name used when the corpus was read.
tag_type = "ner"

# Build the dictionary of tag values for that layer from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type)

In [4]:
# Build the embedding stack: one pooled Flair embedding per language-model
# direction ("news-forward" first, then "news-backward"), combined into a
# single StackedEmbeddings object.
from flair.embeddings import PooledFlairEmbeddings, StackedEmbeddings

pooled_embeddings = [
    PooledFlairEmbeddings(model_name)
    for model_name in ("news-forward", "news-backward")
]
embeddings: StackedEmbeddings = StackedEmbeddings(pooled_embeddings)

In [5]:
# Create the sequence tagger on top of the stacked embeddings and print
# its architecture for inspection.
from flair.models import SequenceTagger

tagger_options = dict(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
tagger: SequenceTagger = SequenceTagger(**tagger_options)
print(tagger)

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (list_embedding_1): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=8192, out_features=8192, bias=True)
  (rnn): LSTM(8192, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_feat

In [7]:
# Train the tagger on the corpus; artefacts are written below
# resources/taggers/pooled_flair_ner.
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

training_options = {
    "learning_rate": 0.1,
    "mini_batch_size": 32,
    "max_epochs": 150,
}

# Kept as the cell's last expression so the value returned by train()
# is shown as the cell output.
trainer.train("resources/taggers/pooled_flair_ner", **training_options)

2020-12-11 09:20:40,735 ----------------------------------------------------------------------------------------------------
2020-12-11 09:20:40,736 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (list_embedding_1): PooledFlairEmbeddings(
      (context_embeddings): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linea

2020-12-11 09:22:10,612 epoch 4 - iter 190/191 - loss 0.23417755 - samples/sec: 285.11 - lr: 0.100000
2020-12-11 09:22:10,719 ----------------------------------------------------------------------------------------------------
2020-12-11 09:22:10,720 EPOCH 4 done: loss 0.2343 - lr 0.1000000
2020-12-11 09:22:12,392 DEV : loss 0.14673033356666565 - score 0.9852
2020-12-11 09:22:12,437 BAD EPOCHS (no improvement): 0
saving best model
2020-12-11 09:22:13,819 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2020-12-11 09:22:15,816 epoch 5 - iter 19/191 - loss 0.20310709 - samples/sec: 304.80 - lr: 0.100000
2020-12-11 09:22:17,973 epoch 5 - iter 38/191 - loss 0.21860178 - samples/sec: 282.12 - lr: 0.100000
2020-12-11 09:22:20,011 epoch 5 - iter 57/191 - loss 0.20697103 - samples/sec: 298.53 - lr: 0.100000
2020-12-11 09:22:22,018 epoch 5 - iter 76/191 - loss 0.21105400 - samples

train mode resetting embeddings
train mode resetting embeddings
2020-12-11 09:24:11,625 epoch 10 - iter 19/191 - loss 0.21384910 - samples/sec: 318.92 - lr: 0.100000
2020-12-11 09:24:13,553 epoch 10 - iter 38/191 - loss 0.17983645 - samples/sec: 315.61 - lr: 0.100000
2020-12-11 09:24:15,492 epoch 10 - iter 57/191 - loss 0.16848433 - samples/sec: 313.76 - lr: 0.100000
2020-12-11 09:24:17,523 epoch 10 - iter 76/191 - loss 0.17219882 - samples/sec: 299.65 - lr: 0.100000
2020-12-11 09:24:19,552 epoch 10 - iter 95/191 - loss 0.17270463 - samples/sec: 299.86 - lr: 0.100000
2020-12-11 09:24:21,704 epoch 10 - iter 114/191 - loss 0.17320633 - samples/sec: 282.60 - lr: 0.100000
2020-12-11 09:24:23,642 epoch 10 - iter 133/191 - loss 0.17200518 - samples/sec: 313.96 - lr: 0.100000
2020-12-11 09:24:25,648 epoch 10 - iter 152/191 - loss 0.16814369 - samples/sec: 303.30 - lr: 0.100000
2020-12-11 09:24:27,745 epoch 10 - iter 171/191 - loss 0.16601013 - samples/sec: 290.15 - lr: 0.100000
2020-12-11 09:

2020-12-11 09:26:14,445 epoch 15 - iter 114/191 - loss 0.15135247 - samples/sec: 302.82 - lr: 0.100000
2020-12-11 09:26:16,334 epoch 15 - iter 133/191 - loss 0.14614353 - samples/sec: 322.07 - lr: 0.100000
2020-12-11 09:26:18,306 epoch 15 - iter 152/191 - loss 0.13793228 - samples/sec: 308.55 - lr: 0.100000
2020-12-11 09:26:20,347 epoch 15 - iter 171/191 - loss 0.14112843 - samples/sec: 298.08 - lr: 0.100000
2020-12-11 09:26:22,352 epoch 15 - iter 190/191 - loss 0.14042224 - samples/sec: 303.57 - lr: 0.100000
2020-12-11 09:26:22,445 ----------------------------------------------------------------------------------------------------
2020-12-11 09:26:22,446 EPOCH 15 done: loss 0.1407 - lr 0.1000000
2020-12-11 09:26:24,152 DEV : loss 0.08359065651893616 - score 0.9924
2020-12-11 09:26:24,197 BAD EPOCHS (no improvement): 3
2020-12-11 09:26:24,198 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode r

2020-12-11 09:28:13,900 ----------------------------------------------------------------------------------------------------
2020-12-11 09:28:13,901 EPOCH 20 done: loss 0.1132 - lr 0.0500000
2020-12-11 09:28:15,667 DEV : loss 0.06442078948020935 - score 0.9964
2020-12-11 09:28:15,714 BAD EPOCHS (no improvement): 0
saving best model
2020-12-11 09:28:17,015 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2020-12-11 09:28:19,152 epoch 21 - iter 19/191 - loss 0.14249304 - samples/sec: 284.81 - lr: 0.050000
2020-12-11 09:28:21,135 epoch 21 - iter 38/191 - loss 0.11844561 - samples/sec: 306.80 - lr: 0.050000
2020-12-11 09:28:23,145 epoch 21 - iter 57/191 - loss 0.12169889 - samples/sec: 302.73 - lr: 0.050000
2020-12-11 09:28:25,167 epoch 21 - iter 76/191 - loss 0.11362124 - samples/sec: 300.95 - lr: 0.050000
2020-12-11 09:28:27,345 epoch 21 - iter 95/191 - loss 0.11081206 - sa

2020-12-11 09:30:09,391 epoch 26 - iter 19/191 - loss 0.09527874 - samples/sec: 310.70 - lr: 0.025000
2020-12-11 09:30:11,364 epoch 26 - iter 38/191 - loss 0.11263308 - samples/sec: 308.39 - lr: 0.025000
2020-12-11 09:30:13,458 epoch 26 - iter 57/191 - loss 0.11129355 - samples/sec: 290.58 - lr: 0.025000
2020-12-11 09:30:15,463 epoch 26 - iter 76/191 - loss 0.10447875 - samples/sec: 303.40 - lr: 0.025000
2020-12-11 09:30:17,536 epoch 26 - iter 95/191 - loss 0.10445523 - samples/sec: 293.56 - lr: 0.025000
2020-12-11 09:30:19,510 epoch 26 - iter 114/191 - loss 0.10658359 - samples/sec: 308.19 - lr: 0.025000
2020-12-11 09:30:21,500 epoch 26 - iter 133/191 - loss 0.10535090 - samples/sec: 305.63 - lr: 0.025000
2020-12-11 09:30:23,487 epoch 26 - iter 152/191 - loss 0.11005809 - samples/sec: 306.31 - lr: 0.025000
2020-12-11 09:30:25,512 epoch 26 - iter 171/191 - loss 0.10641899 - samples/sec: 300.47 - lr: 0.025000
2020-12-11 09:30:27,465 epoch 26 - iter 190/191 - loss 0.10613644 - samples/se

2020-12-11 09:32:10,455 epoch 31 - iter 114/191 - loss 0.07085037 - samples/sec: 303.63 - lr: 0.012500
2020-12-11 09:32:12,474 epoch 31 - iter 133/191 - loss 0.07102088 - samples/sec: 301.36 - lr: 0.012500
2020-12-11 09:32:14,560 epoch 31 - iter 152/191 - loss 0.07624621 - samples/sec: 291.62 - lr: 0.012500
2020-12-11 09:32:16,712 epoch 31 - iter 171/191 - loss 0.07438820 - samples/sec: 282.77 - lr: 0.012500
2020-12-11 09:32:18,678 epoch 31 - iter 190/191 - loss 0.07623625 - samples/sec: 309.35 - lr: 0.012500
2020-12-11 09:32:18,787 ----------------------------------------------------------------------------------------------------
2020-12-11 09:32:18,788 EPOCH 31 done: loss 0.0761 - lr 0.0125000
2020-12-11 09:32:20,499 DEV : loss 0.06360176205635071 - score 0.9954
2020-12-11 09:32:20,547 BAD EPOCHS (no improvement): 1
2020-12-11 09:32:20,548 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode r

2020-12-11 09:34:08,677 ----------------------------------------------------------------------------------------------------
2020-12-11 09:34:08,678 EPOCH 36 done: loss 0.0872 - lr 0.0062500
2020-12-11 09:34:10,389 DEV : loss 0.06305371224880219 - score 0.9952
2020-12-11 09:34:10,434 BAD EPOCHS (no improvement): 2
2020-12-11 09:34:10,435 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2020-12-11 09:34:12,504 epoch 37 - iter 19/191 - loss 0.10088748 - samples/sec: 294.14 - lr: 0.006250
2020-12-11 09:34:14,437 epoch 37 - iter 38/191 - loss 0.08659380 - samples/sec: 314.82 - lr: 0.006250
2020-12-11 09:34:16,557 epoch 37 - iter 57/191 - loss 0.08442040 - samples/sec: 286.98 - lr: 0.006250
2020-12-11 09:34:18,559 epoch 37 - iter 76/191 - loss 0.07927500 - samples/sec: 303.96 - lr: 0.006250
2020-12-11 09:34:20,604 epoch 37 - iter 95/191 - loss 0.08709667 - samples/sec: 297.45 

2020-12-11 09:36:02,183 epoch 42 - iter 19/191 - loss 0.09793593 - samples/sec: 311.88 - lr: 0.003125
2020-12-11 09:36:04,114 epoch 42 - iter 38/191 - loss 0.09449224 - samples/sec: 315.05 - lr: 0.003125
2020-12-11 09:36:06,105 epoch 42 - iter 57/191 - loss 0.09023496 - samples/sec: 305.51 - lr: 0.003125
2020-12-11 09:36:08,079 epoch 42 - iter 76/191 - loss 0.09882283 - samples/sec: 308.27 - lr: 0.003125
2020-12-11 09:36:10,064 epoch 42 - iter 95/191 - loss 0.09534125 - samples/sec: 306.55 - lr: 0.003125
2020-12-11 09:36:12,107 epoch 42 - iter 114/191 - loss 0.09373724 - samples/sec: 297.81 - lr: 0.003125
2020-12-11 09:36:14,147 epoch 42 - iter 133/191 - loss 0.09134983 - samples/sec: 298.27 - lr: 0.003125
2020-12-11 09:36:16,184 epoch 42 - iter 152/191 - loss 0.09504252 - samples/sec: 298.59 - lr: 0.003125
2020-12-11 09:36:18,319 epoch 42 - iter 171/191 - loss 0.09197753 - samples/sec: 284.98 - lr: 0.003125
2020-12-11 09:36:20,277 epoch 42 - iter 190/191 - loss 0.08977990 - samples/se

2020-12-11 09:38:01,939 epoch 47 - iter 114/191 - loss 0.09216911 - samples/sec: 308.28 - lr: 0.000781
2020-12-11 09:38:03,946 epoch 47 - iter 133/191 - loss 0.08880435 - samples/sec: 303.12 - lr: 0.000781
2020-12-11 09:38:05,985 epoch 47 - iter 152/191 - loss 0.08588117 - samples/sec: 298.40 - lr: 0.000781
2020-12-11 09:38:07,983 epoch 47 - iter 171/191 - loss 0.08798767 - samples/sec: 304.51 - lr: 0.000781
2020-12-11 09:38:09,971 epoch 47 - iter 190/191 - loss 0.09102496 - samples/sec: 306.11 - lr: 0.000781
2020-12-11 09:38:10,071 ----------------------------------------------------------------------------------------------------
2020-12-11 09:38:10,071 EPOCH 47 done: loss 0.0907 - lr 0.0007813
2020-12-11 09:38:11,752 DEV : loss 0.06521383672952652 - score 0.9952
2020-12-11 09:38:11,797 BAD EPOCHS (no improvement): 1
2020-12-11 09:38:11,798 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode r

2020-12-11 09:39:59,346 ----------------------------------------------------------------------------------------------------
2020-12-11 09:39:59,347 EPOCH 52 done: loss 0.0841 - lr 0.0003906
2020-12-11 09:40:01,051 DEV : loss 0.06649672985076904 - score 0.9952
2020-12-11 09:40:01,097 BAD EPOCHS (no improvement): 2
2020-12-11 09:40:01,098 ----------------------------------------------------------------------------------------------------
train mode resetting embeddings
train mode resetting embeddings
2020-12-11 09:40:03,057 epoch 53 - iter 19/191 - loss 0.09526768 - samples/sec: 310.61 - lr: 0.000391
2020-12-11 09:40:05,022 epoch 53 - iter 38/191 - loss 0.08726608 - samples/sec: 309.65 - lr: 0.000391
2020-12-11 09:40:07,143 epoch 53 - iter 57/191 - loss 0.08492163 - samples/sec: 287.05 - lr: 0.000391
2020-12-11 09:40:09,160 epoch 53 - iter 76/191 - loss 0.08552666 - samples/sec: 301.76 - lr: 0.000391
2020-12-11 09:40:11,148 epoch 53 - iter 95/191 - loss 0.08620328 - samples/sec: 306.14 

2020-12-11 09:41:52,739 epoch 58 - iter 19/191 - loss 0.07818306 - samples/sec: 278.62 - lr: 0.000195
2020-12-11 09:41:54,787 epoch 58 - iter 38/191 - loss 0.08109949 - samples/sec: 297.06 - lr: 0.000195
2020-12-11 09:41:56,720 epoch 58 - iter 57/191 - loss 0.07677263 - samples/sec: 314.75 - lr: 0.000195
2020-12-11 09:41:58,721 epoch 58 - iter 76/191 - loss 0.07952894 - samples/sec: 304.18 - lr: 0.000195
2020-12-11 09:42:00,655 epoch 58 - iter 95/191 - loss 0.07812685 - samples/sec: 314.54 - lr: 0.000195
2020-12-11 09:42:02,610 epoch 58 - iter 114/191 - loss 0.07813404 - samples/sec: 311.34 - lr: 0.000195
2020-12-11 09:42:04,621 epoch 58 - iter 133/191 - loss 0.07928216 - samples/sec: 302.55 - lr: 0.000195
2020-12-11 09:42:06,593 epoch 58 - iter 152/191 - loss 0.07991739 - samples/sec: 308.49 - lr: 0.000195
2020-12-11 09:42:08,513 epoch 58 - iter 171/191 - loss 0.07931291 - samples/sec: 316.81 - lr: 0.000195
2020-12-11 09:42:10,537 epoch 58 - iter 190/191 - loss 0.08207521 - samples/se

{'test_score': 0.9913285847011458,
 'dev_score_history': [0.9826883910386965,
  0.9818088649756597,
  0.9852116267210608,
  0.9852191641182467,
  0.985485103132162,
  0.9880071446797652,
  0.9829733163913597,
  0.9923508414074452,
  0.9928716904276985,
  0.99288256227758,
  0.9870459740919482,
  0.9943991853360489,
  0.9911009407576913,
  0.994140127388535,
  0.9923508414074452,
  0.9941341494516706,
  0.9949005609382968,
  0.9898063200815495,
  0.9946496815286624,
  0.9964322120285423,
  0.9903307888040713,
  0.9954081632653061,
  0.994908350305499,
  0.9946496815286624,
  0.9949057564951604,
  0.9964322120285423,
  0.9944048830111902,
  0.9949057564951604,
  0.9943934760448522,
  0.9946524064171123,
  0.9954105048444671,
  0.9951567677797604,
  0.9949057564951604,
  0.9951592356687898,
  0.9949057564951604,
  0.9951592356687898,
  0.9949057564951604,
  0.995161701044054,
  0.9946496815286624,
  0.9949057564951604,
  0.9951592356687898,
  0.9951592356687898,
  0.9951592356687898,
  0.