# Training Flair on BioScope Dataset

In [1]:
import re

In [2]:
# remove_special is called by the inference cells further down (the cleaned
# long-example cell and the smr.txt clean-up cell), so it has to be imported
# for the notebook to survive Restart Kernel -> Run All.
from Text_Preprocessing import remove_special
# The remaining helpers are not referenced in the visible cells; left disabled.
# from Text_Preprocessing import lower_case,html_parser,replace_contractions
# from Text_Preprocessing import remove_stopwords, word_stem

In [3]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Directory that holds the train / dev / test split files.
data_folder = 'data_negonly'

# Column layout of the data files: column 0 is read as 'text', column 1 as 'ner'.
columns = {0: 'text', 1: 'ner'}

# Build the corpus from the three column-formatted split files in data_folder.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train_data.txt',
    dev_file='dev_data.txt',
    test_file='test_data.txt',
)

2020-10-17 17:42:12,377 Reading data from data_negonly
2020-10-17 17:42:12,378 Train: data_negonly\train_data.txt
2020-10-17 17:42:12,379 Dev: data_negonly\dev_data.txt
2020-10-17 17:42:12,379 Test: data_negonly\test_data.txt


In [4]:
from typing import List

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, ELMoEmbeddings
from flair.embeddings import TransformerWordEmbeddings

# Label layer to learn, and the tag dictionary the corpus builds for it.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Optional transformer embedding, currently disabled:
#biodisbert_embeddings = TransformerWordEmbeddings('embeddings/pretrained_bert_tf/biobert-base-discharge-cased')

# Instantiate each embedding in the same order the stack lists them.
flair_forward = FlairEmbeddings('pubmed-forward')
flair_backward = FlairEmbeddings('pubmed-backward')
pubmed_words = WordEmbeddings('pubmed')

embedding_types: List[TokenEmbeddings] = [
    flair_forward,
    flair_backward,
    pubmed_words,
    #biodisbert_embeddings,
]

# Combine the individual embeddings into the single stack the tagger consumes.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [5]:
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Tagger settings: stacked embeddings as input, hidden size 256, CRF enabled.
tagger_config = dict(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)
tagger: SequenceTagger = SequenceTagger(**tagger_config)

# Trainer that pairs the tagger with the corpus loaded earlier.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [6]:
# Start training. Output is written under 'models/negonly_bioscope_flair-word'
# (the recorded log loads best-model.pt from that directory for the final test).
# The call is the last expression of the cell, so the dict it returns
# ('test_score', 'dev_score_history', ...) is shown as the cell output.
# max_epochs=150 is only an upper bound: in the recorded run the learning rate
# was reduced repeatedly and training quit at epoch 60 ("learning rate too small").
# embeddings_storage_mode='none': per the Flair docs, embeddings are not kept
# in memory between epochs — TODO confirm against the installed Flair version.
trainer.train('models/negonly_bioscope_flair-word',
              learning_rate=0.1,
              mini_batch_size=32,
              embeddings_storage_mode='none',
              max_epochs=150)

2020-10-17 17:43:19,183 ----------------------------------------------------------------------------------------------------
2020-10-17 17:43:19,184 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): WordEmbeddings('pubmed')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4296, out_features=4296, bias=True)
  (rnn): LSTM(4296, 256, batch_first=True, bi

2020-10-17 17:54:50,332 BAD EPOCHS (no improvement): 0
saving best model
2020-10-17 17:55:04,034 ----------------------------------------------------------------------------------------------------
2020-10-17 17:55:17,896 epoch 5 - iter 31/319 - loss 0.65837147 - samples/sec: 71.57 - lr: 0.100000
2020-10-17 17:55:31,414 epoch 5 - iter 62/319 - loss 0.62416473 - samples/sec: 73.39 - lr: 0.100000
2020-10-17 17:55:44,752 epoch 5 - iter 93/319 - loss 0.58718492 - samples/sec: 74.37 - lr: 0.100000
2020-10-17 17:55:58,082 epoch 5 - iter 124/319 - loss 0.56744854 - samples/sec: 74.43 - lr: 0.100000
2020-10-17 17:56:11,499 epoch 5 - iter 155/319 - loss 0.53899016 - samples/sec: 73.95 - lr: 0.100000
2020-10-17 17:56:25,807 epoch 5 - iter 186/319 - loss 0.53589419 - samples/sec: 69.35 - lr: 0.100000
2020-10-17 17:56:39,377 epoch 5 - iter 217/319 - loss 0.50701937 - samples/sec: 73.11 - lr: 0.100000
2020-10-17 17:56:53,063 epoch 5 - iter 248/319 - loss 0.50970897 - samples/sec: 72.49 - lr: 0.1000

2020-10-17 18:11:14,846 epoch 10 - iter 248/319 - loss 0.35717283 - samples/sec: 67.75 - lr: 0.100000
2020-10-17 18:11:28,234 epoch 10 - iter 279/319 - loss 0.34547621 - samples/sec: 74.11 - lr: 0.100000
2020-10-17 18:11:41,741 epoch 10 - iter 310/319 - loss 0.34413352 - samples/sec: 73.45 - lr: 0.100000
2020-10-17 18:11:45,560 ----------------------------------------------------------------------------------------------------
2020-10-17 18:11:45,561 EPOCH 10 done: loss 0.3497 - lr 0.1000000
2020-10-17 18:12:08,003 DEV : loss 0.2713014483451843 - score 0.6342
2020-10-17 18:12:08,020 BAD EPOCHS (no improvement): 1
2020-10-17 18:12:08,021 ----------------------------------------------------------------------------------------------------
2020-10-17 18:12:21,452 epoch 11 - iter 31/319 - loss 0.37891458 - samples/sec: 73.88 - lr: 0.100000
2020-10-17 18:12:35,259 epoch 11 - iter 62/319 - loss 0.34448022 - samples/sec: 71.85 - lr: 0.100000
2020-10-17 18:12:48,900 epoch 11 - iter 93/319 - los

2020-10-17 18:26:46,351 epoch 16 - iter 62/319 - loss 0.22849687 - samples/sec: 71.70 - lr: 0.050000
2020-10-17 18:27:00,338 epoch 16 - iter 93/319 - loss 0.22822303 - samples/sec: 70.92 - lr: 0.050000
2020-10-17 18:27:13,279 epoch 16 - iter 124/319 - loss 0.22845960 - samples/sec: 76.67 - lr: 0.050000
2020-10-17 18:27:26,868 epoch 16 - iter 155/319 - loss 0.23476195 - samples/sec: 73.00 - lr: 0.050000
2020-10-17 18:27:40,955 epoch 16 - iter 186/319 - loss 0.22311295 - samples/sec: 70.42 - lr: 0.050000
2020-10-17 18:27:54,935 epoch 16 - iter 217/319 - loss 0.22189479 - samples/sec: 70.97 - lr: 0.050000
2020-10-17 18:28:08,924 epoch 16 - iter 248/319 - loss 0.22296146 - samples/sec: 70.93 - lr: 0.050000
2020-10-17 18:28:23,313 epoch 16 - iter 279/319 - loss 0.22332264 - samples/sec: 68.94 - lr: 0.050000
2020-10-17 18:28:37,515 epoch 16 - iter 310/319 - loss 0.22118819 - samples/sec: 69.86 - lr: 0.050000
2020-10-17 18:28:41,294 ------------------------------------------------------------

2020-10-17 18:42:35,954 ----------------------------------------------------------------------------------------------------
2020-10-17 18:42:35,954 EPOCH 21 done: loss 0.1927 - lr 0.0500000
2020-10-17 18:42:58,519 DEV : loss 0.15418784320354462 - score 0.7478
2020-10-17 18:42:58,539 BAD EPOCHS (no improvement): 2
2020-10-17 18:42:58,541 ----------------------------------------------------------------------------------------------------
2020-10-17 18:43:12,770 epoch 22 - iter 31/319 - loss 0.24066495 - samples/sec: 69.72 - lr: 0.050000
2020-10-17 18:43:26,775 epoch 22 - iter 62/319 - loss 0.23364209 - samples/sec: 70.84 - lr: 0.050000
2020-10-17 18:43:40,518 epoch 22 - iter 93/319 - loss 0.22930987 - samples/sec: 72.19 - lr: 0.050000
2020-10-17 18:43:54,371 epoch 22 - iter 124/319 - loss 0.22036933 - samples/sec: 71.61 - lr: 0.050000
2020-10-17 18:44:07,259 epoch 22 - iter 155/319 - loss 0.21916094 - samples/sec: 76.99 - lr: 0.050000
2020-10-17 18:44:20,858 epoch 22 - iter 186/319 - lo

2020-10-17 18:58:03,112 epoch 27 - iter 155/319 - loss 0.17563197 - samples/sec: 70.97 - lr: 0.025000
2020-10-17 18:58:17,213 epoch 27 - iter 186/319 - loss 0.17039951 - samples/sec: 70.36 - lr: 0.025000
2020-10-17 18:58:31,216 epoch 27 - iter 217/319 - loss 0.17146511 - samples/sec: 70.84 - lr: 0.025000
2020-10-17 18:58:45,601 epoch 27 - iter 248/319 - loss 0.17197406 - samples/sec: 68.97 - lr: 0.025000
2020-10-17 18:58:59,637 epoch 27 - iter 279/319 - loss 0.17079013 - samples/sec: 70.67 - lr: 0.025000
2020-10-17 18:59:12,509 epoch 27 - iter 310/319 - loss 0.17457302 - samples/sec: 77.08 - lr: 0.025000
2020-10-17 18:59:16,039 ----------------------------------------------------------------------------------------------------
2020-10-17 18:59:16,040 EPOCH 27 done: loss 0.1725 - lr 0.0250000
2020-10-17 18:59:38,438 DEV : loss 0.13730180263519287 - score 0.7817
2020-10-17 18:59:38,457 BAD EPOCHS (no improvement): 3
2020-10-17 18:59:38,458 ------------------------------------------------

Epoch    32: reducing learning rate of group 0 to 6.2500e-03.
2020-10-17 19:13:19,366 BAD EPOCHS (no improvement): 4
2020-10-17 19:13:19,367 ----------------------------------------------------------------------------------------------------
2020-10-17 19:13:33,231 epoch 33 - iter 31/319 - loss 0.16490116 - samples/sec: 71.56 - lr: 0.006250
2020-10-17 19:13:45,750 epoch 33 - iter 62/319 - loss 0.15421176 - samples/sec: 79.25 - lr: 0.006250
2020-10-17 19:13:59,995 epoch 33 - iter 93/319 - loss 0.15442723 - samples/sec: 69.65 - lr: 0.006250
2020-10-17 19:14:13,329 epoch 33 - iter 124/319 - loss 0.15884794 - samples/sec: 74.41 - lr: 0.006250
2020-10-17 19:14:27,696 epoch 33 - iter 155/319 - loss 0.15178386 - samples/sec: 69.06 - lr: 0.006250
2020-10-17 19:14:41,102 epoch 33 - iter 186/319 - loss 0.15387190 - samples/sec: 74.01 - lr: 0.006250
2020-10-17 19:14:53,641 epoch 33 - iter 217/319 - loss 0.15525473 - samples/sec: 79.12 - lr: 0.006250
2020-10-17 19:15:08,642 epoch 33 - iter 248/319

2020-10-17 19:29:02,535 epoch 38 - iter 248/319 - loss 0.14809872 - samples/sec: 75.10 - lr: 0.006250
2020-10-17 19:29:16,462 epoch 38 - iter 279/319 - loss 0.14663571 - samples/sec: 71.23 - lr: 0.006250
2020-10-17 19:29:30,683 epoch 38 - iter 310/319 - loss 0.14509568 - samples/sec: 69.77 - lr: 0.006250
2020-10-17 19:29:34,109 ----------------------------------------------------------------------------------------------------
2020-10-17 19:29:34,110 EPOCH 38 done: loss 0.1450 - lr 0.0062500
2020-10-17 19:29:56,547 DEV : loss 0.13180363178253174 - score 0.7881
2020-10-17 19:29:56,566 BAD EPOCHS (no improvement): 2
2020-10-17 19:29:56,567 ----------------------------------------------------------------------------------------------------
2020-10-17 19:30:09,877 epoch 39 - iter 31/319 - loss 0.18794963 - samples/sec: 74.54 - lr: 0.006250
2020-10-17 19:30:23,720 epoch 39 - iter 62/319 - loss 0.17133832 - samples/sec: 71.68 - lr: 0.006250
2020-10-17 19:30:37,227 epoch 39 - iter 93/319 - lo

2020-10-17 19:44:06,019 epoch 44 - iter 62/319 - loss 0.14001649 - samples/sec: 67.04 - lr: 0.003125
2020-10-17 19:44:19,745 epoch 44 - iter 93/319 - loss 0.13434070 - samples/sec: 72.28 - lr: 0.003125
2020-10-17 19:44:33,976 epoch 44 - iter 124/319 - loss 0.13875487 - samples/sec: 69.71 - lr: 0.003125
2020-10-17 19:44:47,812 epoch 44 - iter 155/319 - loss 0.14107986 - samples/sec: 71.70 - lr: 0.003125
2020-10-17 19:45:01,675 epoch 44 - iter 186/319 - loss 0.14574790 - samples/sec: 71.57 - lr: 0.003125
2020-10-17 19:45:15,832 epoch 44 - iter 217/319 - loss 0.14855470 - samples/sec: 70.08 - lr: 0.003125
2020-10-17 19:45:29,562 epoch 44 - iter 248/319 - loss 0.15027277 - samples/sec: 72.26 - lr: 0.003125
2020-10-17 19:45:42,761 epoch 44 - iter 279/319 - loss 0.14982547 - samples/sec: 75.16 - lr: 0.003125
2020-10-17 19:45:56,689 epoch 44 - iter 310/319 - loss 0.14824388 - samples/sec: 71.24 - lr: 0.003125
2020-10-17 19:46:00,407 ------------------------------------------------------------

2020-10-17 19:59:34,776 epoch 49 - iter 310/319 - loss 0.14546601 - samples/sec: 76.78 - lr: 0.000781
2020-10-17 19:59:38,466 ----------------------------------------------------------------------------------------------------
2020-10-17 19:59:38,467 EPOCH 49 done: loss 0.1442 - lr 0.0007813
2020-10-17 20:00:01,319 DEV : loss 0.130280002951622 - score 0.7797
2020-10-17 20:00:01,338 BAD EPOCHS (no improvement): 1
2020-10-17 20:00:01,339 ----------------------------------------------------------------------------------------------------
2020-10-17 20:00:15,202 epoch 50 - iter 31/319 - loss 0.16295339 - samples/sec: 71.57 - lr: 0.000781
2020-10-17 20:00:29,192 epoch 50 - iter 62/319 - loss 0.15509692 - samples/sec: 70.91 - lr: 0.000781
2020-10-17 20:00:42,375 epoch 50 - iter 93/319 - loss 0.16035853 - samples/sec: 75.27 - lr: 0.000781
2020-10-17 20:00:56,597 epoch 50 - iter 124/319 - loss 0.15796684 - samples/sec: 69.76 - lr: 0.000781
2020-10-17 20:01:10,031 epoch 50 - iter 155/319 - loss

2020-10-17 20:14:36,653 epoch 55 - iter 124/319 - loss 0.15538465 - samples/sec: 69.76 - lr: 0.000391
2020-10-17 20:14:50,714 epoch 55 - iter 155/319 - loss 0.15054356 - samples/sec: 70.56 - lr: 0.000391
2020-10-17 20:15:05,014 epoch 55 - iter 186/319 - loss 0.15232913 - samples/sec: 69.38 - lr: 0.000391
2020-10-17 20:15:18,791 epoch 55 - iter 217/319 - loss 0.15234856 - samples/sec: 72.01 - lr: 0.000391
2020-10-17 20:15:31,860 epoch 55 - iter 248/319 - loss 0.15454081 - samples/sec: 75.92 - lr: 0.000391
2020-10-17 20:15:45,178 epoch 55 - iter 279/319 - loss 0.15408177 - samples/sec: 74.50 - lr: 0.000391
2020-10-17 20:15:58,961 epoch 55 - iter 310/319 - loss 0.15121921 - samples/sec: 71.98 - lr: 0.000391
2020-10-17 20:16:02,493 ----------------------------------------------------------------------------------------------------
2020-10-17 20:16:02,494 EPOCH 55 done: loss 0.1514 - lr 0.0003906
2020-10-17 20:16:24,924 DEV : loss 0.12957905232906342 - score 0.782
2020-10-17 20:16:24,942 BA

2020-10-17 20:30:04,706 DEV : loss 0.12968739867210388 - score 0.7826
Epoch    60: reducing learning rate of group 0 to 9.7656e-05.
2020-10-17 20:30:04,725 BAD EPOCHS (no improvement): 4
2020-10-17 20:30:04,726 ----------------------------------------------------------------------------------------------------
2020-10-17 20:30:04,726 ----------------------------------------------------------------------------------------------------
2020-10-17 20:30:04,727 learning rate too small - quitting training!
2020-10-17 20:30:04,728 ----------------------------------------------------------------------------------------------------
2020-10-17 20:30:18,897 ----------------------------------------------------------------------------------------------------
2020-10-17 20:30:18,898 Testing using best model ...
2020-10-17 20:30:18,899 loading file models\negonly_bioscope_flair-word\best-model.pt
2020-10-17 20:30:49,749 0.8007	0.7812	0.7909
2020-10-17 20:30:49,750 
Results:
- F1-score (micro) 0.7909


{'test_score': 0.7908611599297012,
 'dev_score_history': [0.00546448087431694,
  0.5428051001821494,
  0.46522781774580335,
  0.6311926605504586,
  0.3157894736842105,
  0.6455026455026455,
  0.6749116607773852,
  0.6286919831223629,
  0.7458745874587458,
  0.6342412451361867,
  0.7,
  0.529126213592233,
  0.7396351575456054,
  0.75,
  0.7504424778761063,
  0.7305785123966942,
  0.7015503875968992,
  0.7379679144385026,
  0.7727272727272727,
  0.7540394973070017,
  0.747795414462081,
  0.7379679144385026,
  0.7455830388692579,
  0.7890222984562607,
  0.7714285714285715,
  0.7716814159292036,
  0.7816901408450704,
  0.7602739726027397,
  0.7835420393559929,
  0.7670250896057348,
  0.7797833935018051,
  0.7624309392265194,
  0.7824561403508772,
  0.7793103448275863,
  0.774869109947644,
  0.7908611599297012,
  0.7783595113438044,
  0.788091068301226,
  0.7731958762886597,
  0.7789473684210526,
  0.7800687285223368,
  0.7867132867132868,
  0.7860869565217392,
  0.7731958762886597,
  0.775

In [None]:
# Load a saved SequenceTagger from disk for the inference cells below.
# NOTE(review): this path points at 'models/bc5cdr-chem/final-model.pt', not at the
# 'models/negonly_bioscope_flair-word' directory written by the training cell above —
# confirm which model is intended here.
from flair.models import SequenceTagger
model_bc5cdr = SequenceTagger.load('models/bc5cdr-chem/final-model.pt')

In [None]:
from flair.data import Sentence

# Example input: a single drug name wrapped in punctuation.
ss = Sentence('(Acetaminophen:)')

# Run the tagger on the sentence; the results are read back from `ss` below.
model_bc5cdr.predict(ss)

# Show the sentence text with its predicted tags.
print(ss.to_tagged_string())

In [None]:
# Display the 'ner' spans attached to the most recently tagged sentence.
ss.get_spans('ner')

In [None]:
# Tag a date-of-birth style string and display any 'ner' spans found on it.
ss = Sentence('dob 04 03 1949')
model_bc5cdr.predict(ss)
ss.get_spans('ner')

In [None]:
# Display the 'ner' spans of the current `ss` again.
# NOTE(review): the previous cell already ends with this same expression.
ss.get_spans('ner')

In [None]:
# Prescription-style line, passed to the tagger exactly as written
# (upper case, parentheses kept).
ss = Sentence('PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd')

# Run the tagger, then print the sentence with its predicted tags.
model_bc5cdr.predict(ss)

print(ss.to_tagged_string())

In [None]:
# Preprocess the text: replace every character that is not a letter or digit
# with a space, then lower-case the result.
cleaned_text = re.sub('[^a-zA-Z0-9]', ' ', 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd').lower()

# Wrap the cleaned string in a flair Sentence.
preprocess = Sentence(cleaned_text)

# Run the tagger and print the sentence with its predicted tags.
model_bc5cdr.predict(preprocess)
print(preprocess.to_tagged_string())

In [None]:
# Observation: removing punctuation before tagging appears to have helped the model recognize the drug in the example above.

In [None]:
from flair.data import Sentence

# Longer example string, lower-cased before it is wrapped in a Sentence.
medication_text = 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd Last Refill: #30 x 2 : Carl Savem MD (08/27/2010) HUMULIN INJ 70/30 (INSULIN REG & ISOPHANE (HUMAN)) 20 units ac breakfast Last Refill: #600 u x 0 : Carl Savem MD (08/27/2010)'.lower()
ss = Sentence(medication_text)

# Run the tagger, then print the sentence with its predicted tags.
model_bc5cdr.predict(ss)
print(ss.to_tagged_string())

In [None]:
# Same long example as the previous cell, but additionally passed through
# remove_special before tagging. `ss` is a str for the first two statements
# and a Sentence from the third one on.
# NOTE(review): remove_special is expected to come from the project's
# Text_Preprocessing module — confirm it is imported before this cell runs.
ss = 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd Last Refill: #30 x 2 : Carl Savem MD (08/27/2010) HUMULIN INJ 70/30 (INSULIN REG & ISOPHANE (HUMAN)) 20 units ac breakfast Last Refill: #600 u x 0 : Carl Savem MD (08/27/2010)'.lower()
ss = remove_special(ss)
ss = Sentence(ss)
# Run the tagger, then print the sentence with its predicted tags.
model_bc5cdr.predict(ss)

print(ss.to_tagged_string())

**Passing the entire text as a sentence**

In [None]:
# Read the whole of smr.txt into a single string. The context manager closes
# the file handle as soon as the read finishes instead of leaving it open for
# the lifetime of the kernel.
with open("smr.txt", "r") as f:
    smr = f.read()
print(smr)

In [None]:
# Lower-case the document text and pass it through remove_special, then print
# the result. This overwrites `smr`, so the raw text is only recoverable by
# re-running the cell that reads smr.txt.
# NOTE(review): remove_special is expected to come from the project's
# Text_Preprocessing module — confirm it is imported before this cell runs.
smr= remove_special(smr.lower())
print (smr)

In [None]:
# Treat the entire document text as one Sentence.
ss = Sentence(smr)
# Run the tagger, then print the sentence with its predicted tags.
model_bc5cdr.predict(ss)

print(ss.to_tagged_string())

In [None]:
# Print each 'ner' span found on the current sentence, one per line.
for entity in ss.get_spans('ner'):
    print(entity)

In [None]:
# Probe the tagger with the same word repeated twice.
ss2 = Sentence('cholesterol cholesterol')
# Run the tagger, then print the sentence with its predicted tags.
model_bc5cdr.predict(ss2)

print(ss2.to_tagged_string())

In [None]:
# Display the 'ner' spans found on the repeated-word sentence.
ss2.get_spans('ner')

In [None]:
# Read smr.txt as a list of lines (each keeps its trailing newline). The
# context manager closes the file handle once the read is done instead of
# leaving it open for the lifetime of the kernel.
with open("smr.txt", "r") as f:
    lines = f.readlines()
# Last expression of the cell: display the list.
lines

In [None]:
# Tag the document line by line and collect, for every line, a tuple of
# (line index, line text, list of 'ner' spans) in `chemicals`.
# NOTE(review): `sentences` is initialised here but never filled or read in
# the visible cells.
sentences = []
chemicals = []
for num,line in enumerate(lines):
    # Cleaning is currently disabled; each line is tagged exactly as read.
    #pline = (remove_special(line)).lower()
    pline = line
    ss = Sentence(pline)
    model_bc5cdr.predict(ss)
    chemicals.append((num,pline,ss.get_spans('ner')))

In [None]:
# Display every (line index, line text, spans) tuple, including lines with no spans.
chemicals

In [None]:
# Keep only the tuples whose span list is non-empty.
[chem for chem in chemicals if chem[2] != []]

In [None]:
# Same filter as above, but display just the span lists without index or text.
[chem[2] for chem in chemicals if chem[2] != []]