# Training Flair on BioScope Dataset

In [1]:
import re

In [2]:
# from Text_Preprocessing import lower_case,html_parser,replace_contractions
# from Text_Preprocessing import remove_special, remove_stopwords, word_stem

In [3]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

import os

# Maps each column index of the data files to the annotation it holds:
# column 0 is the token text, column 1 is the 'ner' tag.
columns = {0: 'text', 1: 'ner'}

# Folder in which the train, test and dev files reside.
# The original machine-specific location remains the default; set the
# BIOSCOPE_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
data_folder = os.environ.get(
    'BIOSCOPE_DATA_DIR',
    'D:/FloridaBlue/Flair/Negation/Flairv0.6/data/data_speconly',
)

# Build the corpus from the column-formatted train / dev / test files.
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train_data_spec.txt',
                              test_file='test_data_spec.txt',
                              dev_file='dev_data_spec.txt')

2020-10-20 15:28:42,648 Reading data from D:\FloridaBlue\Flair\Negation\Flairv0.6\data\data_speconly
2020-10-20 15:28:42,649 Train: D:\FloridaBlue\Flair\Negation\Flairv0.6\data\data_speconly\train_data_spec.txt
2020-10-20 15:28:42,650 Dev: D:\FloridaBlue\Flair\Negation\Flairv0.6\data\data_speconly\dev_data_spec.txt
2020-10-20 15:28:42,650 Test: D:\FloridaBlue\Flair\Negation\Flairv0.6\data\data_speconly\test_data_spec.txt


In [4]:
from typing import List

from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, ELMoEmbeddings
from flair.embeddings import TransformerWordEmbeddings

# Annotation layer to predict, and the tag dictionary the corpus builds for it.
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Optional transformer embedding, currently disabled (also commented out in the list below):
#biodisbert_embeddings = TransformerWordEmbeddings('embeddings/pretrained_bert_tf/biobert-base-discharge-cased')

# Individual embeddings to combine: forward and backward 'pubmed' Flair
# embeddings plus 'pubmed' word embeddings.
embedding_types: List[TokenEmbeddings] = [
    FlairEmbeddings('pubmed-forward'),
    FlairEmbeddings('pubmed-backward'),
    WordEmbeddings('pubmed'),
    #biodisbert_embeddings,
]

# Single stacked embedding handed to the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [5]:
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Sequence tagger over the stacked embeddings, with hidden size 256 and
# the CRF option switched on.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=256,
    use_crf=True,
)

# Trainer that pairs the tagger with the corpus loaded earlier.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [None]:
# Start training. The first argument is the base path given to the trainer
# for this run; the remaining settings are passed by keyword.
trainer.train(
    'models/speconly_bioscope_flair-word',
    max_epochs=150,
    mini_batch_size=32,
    learning_rate=0.1,
    embeddings_storage_mode='none',
)

2020-10-20 15:29:40,892 ----------------------------------------------------------------------------------------------------
2020-10-20 15:29:40,893 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): WordEmbeddings('pubmed')
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4296, out_features=4296, bias=True)
  (rnn): LSTM(4296, 256, batch_first=True, bi

In [None]:
# Load a saved SequenceTagger from disk for the prediction cells below.
# NOTE(review): this path ('models/bc5cdr-chem/final-model.pt') is not the
# directory passed to trainer.train above ('models/speconly_bioscope_flair-word').
# Confirm which model is intended here.
from flair.models import SequenceTagger
model_bc5cdr = SequenceTagger.load('models/bc5cdr-chem/final-model.pt')

In [None]:
from flair.data import Sentence

# Example input: a single word wrapped in punctuation.
ss = Sentence('(Acetaminophen:)')

# Run the tagger on the sentence, then print the text with its predicted tags.
model_bc5cdr.predict(ss)
tagged = ss.to_tagged_string()
print(tagged)

In [None]:
# Display the 'ner' spans currently attached to `ss`.
ss.get_spans('ner')

In [None]:
# Tag the string 'dob 04 03 1949' and display the resulting 'ner' spans.
ss = Sentence('dob 04 03 1949')
model_bc5cdr.predict(ss)
ss.get_spans('ner')

In [None]:
# Display the 'ner' spans of the current `ss` again
# (same expression as the last line of the previous cell).
ss.get_spans('ner')

In [None]:
# Medication line in its raw form: upper case, punctuation left in place.
ss = Sentence('PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd')

# Run the tagger on the sentence, then print the text with its predicted tags.
model_bc5cdr.predict(ss)
tagged = ss.to_tagged_string()
print(tagged)

In [None]:
# Preprocess the text: replace every non-alphanumeric character with a space,
# then lowercase the result.
raw_text = 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd'
cleaned_text = re.sub('[^a-zA-Z0-9]', ' ', raw_text).lower()

# Wrap the cleaned text in a flair Sentence.
preprocess = Sentence(cleaned_text)

# Run the tagger and print the text with its predicted tags.
model_bc5cdr.predict(preprocess)
print(preprocess.to_tagged_string())

In [None]:
# Observation: removing punctuation in the cell above appears to have helped the model recognize the drug.

In [None]:
from flair.data import Sentence

# Longer medication text, lowercased but with punctuation left in place.
medication_text = 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd Last Refill: #30 x 2 : Carl Savem MD (08/27/2010) HUMULIN INJ 70/30 (INSULIN REG & ISOPHANE (HUMAN)) 20 units ac breakfast Last Refill: #600 u x 0 : Carl Savem MD (08/27/2010)'
ss = Sentence(medication_text.lower())

# Run the tagger on the sentence, then print the text with its predicted tags.
model_bc5cdr.predict(ss)
tagged = ss.to_tagged_string()
print(tagged)

In [None]:
# remove_special lives in the project's Text_Preprocessing module. The import
# near the top of the notebook is commented out, so it is brought into scope
# here; without it this cell raises NameError on a fresh kernel.
from Text_Preprocessing import remove_special

# Same medication text as the previous cell: lowercase it, then run it through
# remove_special (presumably strips special characters; confirm against
# Text_Preprocessing) before tagging.
ss = 'PRINIVIL TABS 20 MG (LISINOPRIL) 1 po qd Last Refill: #30 x 2 : Carl Savem MD (08/27/2010) HUMULIN INJ 70/30 (INSULIN REG & ISOPHANE (HUMAN)) 20 units ac breakfast Last Refill: #600 u x 0 : Carl Savem MD (08/27/2010)'.lower()
ss = remove_special(ss)
ss = Sentence(ss)

# Run the tagger and print the text with its predicted tags.
model_bc5cdr.predict(ss)

print(ss.to_tagged_string())

**Passing the entire text as a sentence**

In [None]:
# Read the whole of smr.txt into a single string. The context manager closes
# the file handle as soon as reading finishes.
with open("smr.txt", "r") as f:
    smr = f.read()
print(smr)

In [None]:
# remove_special lives in the project's Text_Preprocessing module. The import
# near the top of the notebook is commented out, so it is brought into scope
# here; without it this cell raises NameError on a fresh kernel.
from Text_Preprocessing import remove_special

# Lowercase the document text, pass it through remove_special, and show the result.
smr = remove_special(smr.lower())
print(smr)

In [None]:
# Treat the entire `smr` text as one flair Sentence.
ss = Sentence(smr)

# Run the tagger on it, then print the text with its predicted tags.
model_bc5cdr.predict(ss)
tagged = ss.to_tagged_string()
print(tagged)

In [None]:
# Print each 'ner' span of `ss` on its own line.
for span in ss.get_spans('ner'):
    print(span)

In [None]:
# Input with the same word repeated twice.
ss2 = Sentence('cholesterol cholesterol')

# Run the tagger on the sentence, then print the text with its predicted tags.
model_bc5cdr.predict(ss2)
tagged = ss2.to_tagged_string()
print(tagged)

In [None]:
# Display the 'ner' spans currently attached to `ss2`.
ss2.get_spans('ner')

In [None]:
# Read smr.txt as a list of lines (each keeps its trailing newline). The
# context manager closes the file handle as soon as reading finishes.
with open("smr.txt", "r") as f:
    lines = f.readlines()
lines

In [None]:
# Tag the document one line at a time and collect, per line, a tuple of
# (line number, line text, predicted 'ner' spans).
sentences = []
chemicals = []
for num, line in enumerate(lines):
    # Lines are tagged as read; the preprocessing variant is kept disabled:
    #pline = (remove_special(line)).lower()
    pline = line
    ss = Sentence(pline)
    model_bc5cdr.predict(ss)
    record = (num, pline, ss.get_spans('ner'))
    chemicals.append(record)

In [None]:
# Display every (line number, line text, spans) tuple collected above.
chemicals

In [None]:
# Keep only the records whose span list (third element) is not empty.
[record for record in chemicals if record[2] != []]

In [None]:
# Show just the non-empty span lists, dropping the line number and line text.
[spans for _, _, spans in chemicals if spans != []]