# Preprocessing

In [None]:
# installing packages you need
!pip3 install nltk 
!pip3 install wget
!pip3 install tqdm
!pip3 install matplotlib
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

In [7]:
# importing all packages we need
import nltk
import numpy as np
import re
#import wget # downloads files
import tarfile # unzip tar files
import xml.etree.ElementTree as ET # used to xml files
import os # to access your directories 
import pandas as pd # useful table stuff to do manipulations on (and stuff)
import matplotlib.pyplot as plt # plotting stuff when necessary 
import zipfile # to unzip zips
from tqdm import tqdm # To show progress bar for for-loops

path = os.getcwd()
directory_unannotated = path + '/unannotated_xml'
directory_annotated = path + '/train_xml'
path_to_zip = path + '/osfstorage-archive.zip'
path_to_extract = path + '/test_labels'

In [None]:
wget.download('https://bionlp.nlm.nih.gov/tac2017adversereactions/train_xml.tar.gz')
wget.download('https://bionlp.nlm.nih.gov/tac2017adversereactions/unannotated_xml.tar.gz')
wget.download("https://files.osf.io/v1/resources/n84w3/providers/osfstorage/?zip=") # This download takes some time for some reason
tarfile.open('train_xml.tar.gz').extractall()
tarfile.open('unannotated_xml.tar.gz').extractall()
with zipfile.ZipFile(path_to_zip , 'r') as zip_ref:
    zip_ref.extractall(path_to_extract)

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
def special_case(word):
    if word == 'abnormalitiesincludes':
        return ['abnormalities','includes']
    return []

def extra_tokenization(tokens):
    updated_tokens = []
    regex1 = re.compile("[a-zA-Z]{3,}/[a-zA-Z]{3,}")
    regex2 = re.compile("^[-]{1}[a-zA-Z]{3,}")
    regex3 = re.compile("^[.]{1}[a-zA-Z]{3,}")
    regex4 = re.compile("^[/]{1}[a-zA-Z]{3,}")
    regex5 = re.compile("^[']{1}[a-zA-Z]{3,}")
    regex6 = re.compile("^[a-zA-Z]{4,}[.]{1}[a-zA-Z]{4,}")
    regex7 = re.compile("^[a-zA-Z]{3,}[-]{1}$")
    regex8 = re.compile('^([A-Za-z][a-z]+)([A-Za-z][a-z]+){0,3}([A-Z][a-z]+)+$')
    regex9 = re.compile('[A-Za-z]+[/]$')
    for token in tokens:
        candidate1 = regex1.findall(token)
        candidate2 = regex2.findall(token)
        candidate3 = regex3.findall(token)
        candidate4 = regex4.findall(token)
        candidate5 = regex5.findall(token)
        candidate6 = regex6.findall(token)
        candidate7 = regex7.findall(token)
        candidate8 = regex8.findall(token)
        candidate9 = regex9.findall(token)
        if candidate1:
            new_tokens = candidate1[0].split('/')
            new_tokens.insert(1,'/')
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate2:
            new_tokens = candidate2[0].split('-')
            new_tokens[0] = '-'
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate3:
            new_tokens = candidate3[0].split('.')
            new_tokens[0] = '.'
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate4:
            new_tokens = candidate4[0].split('/')
            new_tokens[0] = '/'
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate5:
            new_tokens = candidate5[0].split('\'')
            new_tokens[0] = '\''
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate6:
            new_tokens = candidate6[0].split('.')
            new_tokens.insert(1,'.')
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate7:
            new_tokens = candidate7[0].split('-')
            new_tokens[1] = '-'
            for item in new_tokens:
                updated_tokens.append(item)
        elif candidate8:
            new_tokens = candidate8[0]
            for item in new_tokens:
                if item:
                    updated_tokens.append(item)
        elif candidate9:
            new_tokens = candidate9[0].split('/')
            new_tokens[1] = '/'
            for item in new_tokens:
                updated_tokens.append(item)
        elif special_case(token):
            for item in special_case(token):
                updated_tokens.append(token)
        else:
            updated_tokens.append(token)
    return updated_tokens

def get_sentences(text):
    sentences = []
    for sentence in sent_tokenize(text):
        for split_sentence in re.split('\n+',sentence):
            if split_sentence == '':
                continue
            sentences.append(split_sentence)
    return sentences
    

def span(text):  
  sentence_id = 0
  offset = 0
  for sentence in get_sentences(text):
    tokens = extra_tokenization(nltk.word_tokenize(sentence))
    pos_tags = nltk.pos_tag(tokens)
    for token,pos_tag in zip(tokens,pos_tags):
        offset = text.find(token,offset)
        yield [token, offset, offset + len(token),sentence_id,pos_tag[1]]
        offset += len(token)
    sentence_id += 1
    

def get_section_name(mention_name):
    for section in root.findall('./Text/Section'):
        if section.get('id') == mention_name:
            return section.get('name')
        
        
def reduceTokens(tokens):
  for token in tokens:
    try:
      token.pop(1)
      token.pop(1)
    except:
      break

In [9]:
from nltk import sent_tokenize, word_tokenize
from tqdm import tqdm
import os 
import re
counter_total = 0
counter = 0;

# create pandas framework for the training set
df_cols_annotated = ["file_name","adverse_reactions","warnings_and_precautions","boxed_warnings"]
df_rows_annotated = []
for filename in tqdm(sorted(os.listdir(directory_annotated))):
    if filename.endswith(".xml"):
        fullname = os.path.join(directory_annotated, filename)
        tree = ET.parse(fullname)
        root = tree.getroot()
        text_adverse_reactions, text_warnings_and_precautions, text_boxed_warnings = "","",""
        for section in root.findall('./Text/Section'):
            if section.get('name') == "adverse reactions":
                text_adverse_reactions = section.text
            if section.get('name') == "warnings and precautions":
                text_warnings_and_precautions = section.text
            if section.get('name') == "boxed warnings":
                text_boxed_warnings = section.text  
        df_rows_annotated.append({"file_name": filename,"adverse_reactions": text_adverse_reactions, "warnings_and_precautions": text_warnings_and_precautions, "boxed_warnings": text_boxed_warnings})
df = pd.DataFrame(df_rows_annotated,columns = df_cols_annotated)

span_text = lambda x : span(x)
df['adverse_reactions'] = df['adverse_reactions'].apply(span_text).apply(list) # each xml gets tokenized
df['warnings_and_precautions'] = df['warnings_and_precautions'].apply(span_text).apply(list) # each xml gets tokenized
df['boxed_warnings'] = df['boxed_warnings'].apply(span_text).apply(list) # each xml gets tokenized​
for filename in tqdm(df['file_name']):
      index = df[df['file_name']==filename].index.values.astype(int)[0]
      fullname = os.path.join(directory_annotated, filename)
      tree = ET.parse(fullname)
      root = tree.getroot()
      tokens_adverse_reactions = df['adverse_reactions'][index]
      for token in tokens_adverse_reactions:
          token.append("O")
      tokens_warnings_and_precautions = df['warnings_and_precautions'][index]
      for token in tokens_warnings_and_precautions:
          token.append("O")
      tokens_boxed_warnings = df['boxed_warnings'][index]
      for token in tokens_boxed_warnings:
          token.append("O")
      for mention in root.findall('./Mentions/Mention'):
          interval = [(int(x),int(x) + int(y)) for x, y in zip(mention.get('start').split(','), mention.get('len').split(','))]
          counter_total+=1
          if len(interval)>1:
            counter+=1
            continue
          tag = mention.get('type')
          first_token = True
          for i in interval:
            section_name = get_section_name(mention.get('section'))
            if section_name == 'adverse reactions':
                for token in tokens_adverse_reactions:
                    if token[1] >= i[0]: # start times are equal or greater
                        if first_token:
                            token[5] = "B-" + tag
                            first_token = False
                        else:
                            token[5] = 'I-' + tag
                        if token[2] >= i[1]: # if interval ends, break the loop
                            break
            elif section_name == 'boxed warnings':
                for token in tokens_boxed_warnings:
                      if token[1] >= i[0]: # start times are equal or greater
                        if first_token:
                            token[5] = "B-" + tag
                            first_token = False
                        else:
                            token[5] = 'I-' + tag
                        if token[2] >= i[1]: # if interval ends, break the loop
                            break
            elif section_name == 'warnings and precautions':
                for token in tokens_warnings_and_precautions:
                      if token[1] >= i[0]: # start times are equal or greater
                        if first_token:
                            token[5] = "B-" + tag
                            first_token = False
                        else:
                            token[5] = 'I-' + tag
                        if token[2] >= i[1]: # if interval ends, break the loop
                            break
                            

df['adverse_reactions'].apply(reduceTokens)
df['warnings_and_precautions'].apply(reduceTokens)
df['boxed_warnings'].apply(reduceTokens)
df = df.set_index('file_name',drop = True)
df.to_pickle('./train_labels.pkl')
print(counter,counter_total)

100%|██████████| 101/101 [00:02<00:00, 37.98it/s]
100%|██████████| 101/101 [00:03<00:00, 33.07it/s]


1078 15722


In [10]:
# get test set
import re
import os
import nltk
from nltk import word_tokenize, sent_tokenize
import numpy as np
counter = 0
counter_total=0

path = os.getcwd() + '/test_labels'

#introduce pandas frame work:
df_cols = ["adverse_reactions","adverse_reactions_tags","warnings_and_precautions","warnings_and_precautions_tags","boxed_warnings","boxed_warnings_tags"]
index = []

for filename in tqdm(sorted(os.listdir(path))):
    if filename.endswith("adverse_reactions.ann"):
        index.append(filename[:-22])  
        
df = pd.DataFrame(np.empty((len(index),len(df_cols),0)).tolist(),columns = df_cols, index=index)
        
for filename in tqdm(sorted(os.listdir(path))):
    for name, y in zip(["adverse_reactions","warnings_and_precautions","boxed_warnings"], [-22,-29,-19]):
        if filename.endswith(name + ".ann"): 
            fullname = os.path.join(path, filename)
            f = open(fullname,'r')
            for x in f:
              if x[0] == "T":
                #split string
                split_text = re.split('\t|\n|\s',x)
                tag = split_text[1]
                start = True
                intervals = []
                start_index = 0
                end_index = 0
                for n in split_text[2:]:
                    if start:
                        start = False
                        start_index = int(n)
                    else:
                        if n == re.split(';',n)[0]:
                            end_index = n
                            intervals.append([start_index,end_index])
                            break
                        else:
                            end_index = re.split(';',n)[0]
                            intervals.append([start_index,end_index])
                            start_index = re.split(';',n)[1]
                df.loc[filename[:int(y)],name + "_tags"].append([tag,intervals])
       
            
for filename in tqdm(sorted(os.listdir(path))):
    if filename.endswith("adverse_reactions.txt"): 
        fullname = os.path.join(path, filename)
        f = open(fullname,'r')
        df.loc[filename[:-22],"adverse_reactions"] = list(span(f.read()))
    if filename.endswith("warnings_and_precautions.txt"): 
        fullname = os.path.join(path, filename)
        f = open(fullname,'r')
        df.loc[filename[:-29],"warnings_and_precautions"] = list(span(f.read()))
    if filename.endswith("boxed_warnings.txt"): 
        fullname = os.path.join(path, filename)
        f = open(fullname,'r')
        df.loc[filename[:-19],"boxed_warnings"] = list(span(f.read()))

def add_Os(A):
    for i in A:
        i.append("O")
    return A

df['adverse_reactions'] = df['adverse_reactions'].apply(add_Os)
df['warnings_and_precautions'] = df['warnings_and_precautions'].apply(add_Os)
df['boxed_warnings'] = df['boxed_warnings'].apply(add_Os)

for name in ["adverse_reactions","warnings_and_precautions","boxed_warnings"]:
    for i in tqdm(range(len(df[name]))):
        for tag in df[name + '_tags'][i]:
            first_token = True
            counter_total+=1
            if len(tag[1])>1:
                counter+=1
                continue
            for interval in tag[1]:
                start = int(interval[0])
                end = int(interval[1])
                for token in df[name][i]:
                    if int(token[1]) >= start:
                        if first_token:
                            token[5] = "B-" + tag[0]
                            first_token = False
                        else:
                            token[5] = "I-" + tag[0]
                        if int(token[2]) >= end:
                            break
                        
df = df.drop(['adverse_reactions_tags','warnings_and_precautions_tags', 'boxed_warnings_tags'], axis=1)

df['adverse_reactions'].apply(reduceTokens)
df['warnings_and_precautions'].apply(reduceTokens)
df['boxed_warnings'].apply(reduceTokens)


df.to_pickle('./test_labels.pkl')
print(counter,counter_total)

100%|██████████| 480/480 [00:00<00:00, 697355.70it/s]
100%|██████████| 480/480 [00:03<00:00, 123.52it/s]
100%|██████████| 480/480 [00:20<00:00, 23.56it/s]
100%|██████████| 99/99 [00:02<00:00, 48.99it/s]
100%|██████████| 99/99 [00:01<00:00, 83.72it/s] 
100%|██████████| 99/99 [00:00<00:00, 1979.12it/s]


1160 14582


In [11]:
test_labels_df = pd.read_pickle('./test_labels.pkl')
train_labels_df = pd.read_pickle('./train_labels.pkl')

In [12]:
# removing columns, bunching together sentences in one list
#training_set
import itertools
import pandas as pd
key_f = lambda x: x[1]
texts_train = []
texts_test = []
for name in ["train","test"]:
    labels = pd.read_pickle(name +'_labels.pkl') # './drive/My Drive/Pickles/' +
    for index,row in labels.iterrows():
        for column_name in ["adverse_reactions","warnings_and_precautions","boxed_warnings"]:
          for key, group in itertools.groupby(row[column_name],key_f): # here we regroup again by sentence
            sentence = []
            for i in list(group):
              token = []
              token.append(i[0]) # string
              token.append(i[2]) # POS tag
              token.append(i[3]) # NER tag
              sentence.append(token)
            globals()["texts_" + name].append(sentence)
        
text = texts_test + texts_train

In [21]:
adverse_reactions_list = []
for sentence in text:
    for word in sentence:
        if word[2][2:] == 'Negation': # replace AdverseReaction with Severity if you want to check the severity distribution
            adverse_reactions_list.append(word[0].lower())

            
def CountFrequency(my_list):  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
    # sort dictionary
    return freq
dictionary = sorted(CountFrequency(adverse_reactions_list).items(), key = lambda kv:(kv[1], kv[0]),reverse= True)
print(len(dictionary))
dictionary[:20]

46


[('no', 130),
 ('not', 52),
 ('excluding', 24),
 ('without', 15),
 ('none', 10),
 ('nor', 5),
 ('other', 4),
 ('than', 3),
 ('placebo', 3),
 ('absence', 3),
 ('0.0', 3),
 ('the', 2),
 ('rather', 2),
 ('patients', 2),
 ('neither', 2),
 ('first', 2),
 ('evident', 2),
 ('dose', 2),
 ('did', 2),
 ('chemotherapy-treated', 2)]

In [None]:
from nltk.corpus import wordnet


def get_synonyms(sense):
    """Get the synonyms of word from WordNet"""
    try:
        lemmas = sense.lemma_names() # return lemmas of synonyms, if there are no lemmas, this will return an error
    except:
        lemmas = [] # if there is an error, then it basically means that there are no lemmas available for you sense
    return lemmas


def augment_sentence(sentence):
    """augment words in sentences"""
    sentences = [sentence]
    string_sentence = ' '.join(word[0] for word in sentence)
    for index,token in enumerate(sentence):
        if token[2][2:] == "Severity":
            sense = rules(token[0])
            synonyms = get_synonyms(sense)
            if len(synonyms) > 0:
                for synonym in synonyms:
                    if '_' in synonym:
                        continue
                    if synonym == token[0]:
                        continue # we don't want duplicates
                    new_sentences = sentence.copy()
                    new_sentences[index] = [synonym,token[1],token[2]] # [This drug causes severe skin rash, This drug causes serious skin rash, ..... ]
                    sentences.append(new_sentences)
    return sentences


def rules(word):
    lower = word.lower()
    if lower == "serious":
        return wordnet.synset('dangerous.s.02') 
    if lower == "severe":
        return wordnet.synset('dangerous.s.02')
    if lower == 'mild':
        return wordnet.synset('moderate.s.03')
    if lower == 'limit':
        return wordnet.synset('limit.n.01')
    if lower == 'significant':
        return wordnet.synset('significant.s.02')
    return None
        
    
    
augmented_text = []
for sentence in text:
    augmented_text += augment_sentence(sentence)

In [None]:
from nltk.corpus import wordnet

for i,j in enumerate(wordnet.synsets('rash')): # get wordnet senses for different words. It has four tags: wordnet.NOUN ADJ ADV VERB
  print("Meaning",i, "NLTK ID:", j.name())
  print("Definition:",j.definition())
  print("Synonyms:", ", ".join(j.lemma_names()))
  print()

In [18]:
#Creating word list
word_list = []
for sentence in text:
  for token in sentence:
    word_list.append(token[0])
word_list.append("ENDPAD")
words = list(set(word_list))

#Defining tags
def get_tags(text):
  tag_list = []
  for i in text:
    for j in i:
      tag_list.append(j[2])
  return list(set(tag_list))

tags = get_tags(text)
print(tags)


['I-Factor', 'B-Animal', 'O', 'B-DrugClass', 'B-Negation', 'B-Severity', 'I-Animal', 'B-AdverseReaction', 'I-DrugClass', 'I-Severity', 'I-AdverseReaction', 'B-Factor', 'I-Negation']


In [25]:
#Visualising snippet of text
text[3]

[['(', '(', 'O'],
 ['6.1', 'CD', 'O'],
 [')', ')', 'O'],
 ['To', 'TO', 'O'],
 ['report', 'VB', 'O'],
 ['SUSPECTED', 'NNP', 'O'],
 ['ADVERSE', 'NNP', 'O'],
 ['REACTIONS', 'NNP', 'O'],
 [',', ',', 'O'],
 ['contact', 'NN', 'O'],
 ['Genentech', 'NNP', 'O'],
 ['at', 'IN', 'O'],
 ['1-888-835-2555', 'CD', 'O'],
 ['or', 'CC', 'O'],
 ['FDA', 'NNP', 'O'],
 ['at', 'IN', 'O'],
 ['1-800-FDA-1088', 'CD', 'O'],
 ['or', 'CC', 'O'],
 ['gov', 'VB', 'O'],
 ['/', 'CD', 'O'],
 ['medwatch', 'NN', 'O']]

In [None]:
#Negation
import spacy
#Load installed model "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is no severe adverse reaction") 
  #'Not' gets marked as neg, 'no' gets missed
[token.text for token in doc]
[token.dep_ for token in doc]

['nsubj', 'ROOT', 'det', 'amod', 'amod', 'attr']

In [None]:
#Counting occurences
counter = 0
for sentence in text:
  for token in sentence:
    if token[2] == "I-Negation": 
      counter = counter + 1

print(counter)
#Not=1887 occurences, no=350
#B-Negation=267, I-Negation=32

32


In [None]:
!pip install negspacy

Collecting negspacy
  Downloading https://files.pythonhosted.org/packages/b7/69/0c8f46cef8d8b6ee8925270e2d48c7ebd93153dcfbd28db778eaf3588f3f/negspacy-0.1.7.tar.gz
Building wheels for collected packages: negspacy
  Building wheel for negspacy (setup.py) ... [?25l[?25hdone
  Created wheel for negspacy: filename=negspacy-0.1.7-cp36-none-any.whl size=8052 sha256=5340ae2b9594bec048974f5c74a404fdac4083b25dd6b7adb9962d757ffe62aa
  Stored in directory: /root/.cache/pip/wheels/21/b4/31/6dcfab7cfed000ebfc983d2242dd8a801a2c0af4eff58c68dc
Successfully built negspacy
Installing collected packages: negspacy
Successfully installed negspacy-0.1.7


In [20]:
import spacy
from negspacy.negation import Negex

nlp = spacy.load("en_core_web_sm")
negex = Negex(nlp, ent_types=["PERSON","ORG"])
nlp.add_pipe(negex, last=True)

doc = nlp("She does not like Steve Jobs but likes Apple products.")
doc2 = nlp("This is no severe adverse reaction") 

#doc3 = nlp(text)

for e in doc2.ents:
	print(e.text, e._.negex)

In [19]:
#Creating word list
word_list = []
for sentence in text:
  for token in sentence:
    word_list.append(token[0])
word_list.append("ENDPAD")
words = list(set(word_list))

print(len(word_list))

word_list2 = " ".join(word_list)

520176


In [42]:
import spacy
import en_core_sci_sm
from negspacy.negation import Negex

nlp = spacy.load("en_core_sci_sm")
negex = Negex(nlp, chunk_prefix = ["nor", "no","excluding","without","none"], termination=["but", "however", "nevertheless", "except"])
nlp.add_pipe(negex, last=True)

doc = nlp(word_list2[:1000])
doc2 = nlp("This is no severe adverse reaction") 

for e in doc.ents:
	print(e.text, e._.negex)

ADVERSE REACTIONS False
clinical studies False
conditions False
adverse reaction False
rates False
clinical studies False
drug False
rates True
clinical studies True
drug True
rates True
patient population True
clinical practice True
EXCERPT False
adverse reactions False
incidence False
upper respiratory tract infections False
nasopharyngitis False
headache False
hypertension False
increased False
ALT False
injection site reactions False
SUSPECTED False
contact Genentech False
FDA False
800-FDA-1088 False
gov False
medwatch False
Clinical Trials False
Experience False
Rheumatoid Arthritis False
Patients False
Treated with False
Intravenous ACTEMRA False
ACTEMRA-IV False
ACTEMRA-IV False
data False
rheumatoid arthritis False
RA False
double-blind False
controlled False
multicenter studies False
studies False
patients False
doses False
ACTEMRA-IV False
monotherapy False
patients False
ACTEMRA-IV False


In [38]:
def link_coreference(tagged_text):
    text = tagged_text
    relations = []
    for sentence in text:
        adv_reactions = []
        sev = []
        for index,word in enumerate(sentence):
            if word[2][2:] == 'AdverseReaction':
                adv_reactions.append((word[0],index))
            if word[0] == 'she':
                sev.append((word[0],index))
        for s in sev:
            min = 1000
            link = ''
            for adv in adv_reactions:
                if np.abs(adv[1]-s[1]<min):
                    min = adv[1]-s[1]
                    link = [s[0],adv[0]]
            if link:
                relations.append(link)
    return relations

print(link_coreference(text))
print(len(link_coreference(text)))

[]
0


In [43]:
# Load your usual SpaCy model (one of SpaCy English models)
import spacy
nlp = spacy.load('en')

# load NeuralCoref and add it to the pipe of SpaCy's model
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# You're done. You can now use NeuralCoref the same way you usually manipulate a SpaCy document and it's annotations.
doc = nlp(u'My sister has a dog. She loves him.')

doc._.has_coref
doc._.coref_clusters

ModuleNotFoundError: No module named 'neuralcoref'

In [45]:
import neuralcoref

ModuleNotFoundError: No module named 'neuralcoref'

# BERT implementation

In [None]:
!pip install transformers==2.6.0

In [None]:
# Bert experiment Using TensorFlow backend.
## Code heavily inspired from https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

import torch
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

In [None]:
print(torch.cuda.is_available())

In [None]:
MAX_LEN = 200
bs = 32

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

torch.cuda.get_device_name(0) 

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
texts = texts_train + texts_test # combined data set of 200 drug labels
labels = labels_train + labels_test
print(len(texts),len(labels))

In [None]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(texts, labels)
]

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
attention_masks = [[float(i != tag2idx["PAD"]) for i in ii] for ii in input_ids]

In [None]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, # changed test size from 0.1 to 0.2
                                                            random_state=2018, test_size=0.1, random_state=2018)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1, random_state=2018)

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW

transformers.__version__

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

In [None]:
model.cuda(); # might throw error: CUDA error: device-side assert triggered

In [None]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,        
    lr=3e-5,
    eps=1e-8
)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 5
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    
    num_training_steps=total_steps
)

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
## Store the average loss after each epoch so we can plot them.
from tqdm import tqdm, trange
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()    
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        eval_accuracy += flat_accuracy(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    #Added to have full table of F1 score values
    print("Classification report: {}".format(classification_report(pred_tags, valid_tags)))
    print()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_values, 'b-o', label="training loss")
plt.plot(validation_loss_values, 'r-o', label="validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

In [None]:
test_sentence = """

    <Section name="adverse reactions" id="S1">    VII. ADVERSE REACTIONS

  A. METHOXSALEN:

  The most commonly reported side effect of methoxsalen alone is nausea, which occurs with approximately 10% of all patients. This effect may be minimized or avoided by instructing the patient to take methoxsalen with milk or food, or to divide the dose into two portions, taken approximately one-half hour apart. Other effects include nervousness, insomnia, and psychological depression.



   B. COMBINED METHOXSALEN/UVA THERAPY:

  1. PRURITUS:

  This adverse reaction occurs with approximately 10% of all patients. In most cases, pruritus can be alleviated with frequent application of bland emollients or other topical agents; severe pruritus may require systemic treatment. If pruritus is unresponsive to these measures, shield pruritic areas from further UVA exposure until the condition resolves. If intractable pruritus is generalized, UVA treatment should be discontinued until the pruritus disappears.


   4. OTHER ADVERSE REACTIONS:

  Those reported include edema, dizziness, headache, malaise, depression, hypopigmentation, vesiculation and bullae formation, non-specific rash, herpes simplex, miliaria, urticaria, folliculitis, gastrointestinal disturbances, cutaneous tenderness, leg cramps, hypotension, and extension of psoriasis.
"""

In [None]:
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence]).cuda()

In [None]:
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [None]:
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [None]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(token, label))

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# drive.mount('/content/drive')
# torch.save(model.state_dict(), '/content/drive')

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
print(os.getcwd())
print(os.listdir())

/content
['.config', 'unannotated_xml', 'drug_labels_unannotated_tokens.pkl', 'unannotated_xml.tar.gz', 'adc.json', 'train_xml', 'drug_labels_annotated.pkl', 'osfstorage-archive.zip', 'drive', 'drug_labels_annotated_tokens.pkl', 'test_labels.pkl', 'train_xml.tar.gz', 'test_labels', 'drug_labels_unannotated.pkl', 'sample_data']


In [None]:
torch.save(model.state_dict(), '/content/drive/My\ Drive/Bert_model')

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!ls

adc.json			    sample_data
drive				    test_labels
drug_labels_annotated.pkl	    test_labels.pkl
drug_labels_annotated_tokens.pkl    train_xml
drug_labels_unannotated.pkl	    train_xml.tar.gz
drug_labels_unannotated_tokens.pkl  unannotated_xml
osfstorage-archive.zip		    unannotated_xml.tar.gz


In [None]:
model_save_name = 'Bert.pt'
path = F"/content/drive/My Drive/{model_save_name}" 
torch.save(model.state_dict(), path)

In [None]:
model_save_name = 'Bert.pt'
path = F"/content/drive/My Drive/{model_save_name}"
model.load_state_dict(torch.load(path))

# Conditional Random Field

Taken from: https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/

Load in the files again from the pickles.

In [None]:
import numpy as np
import pandas as pd

test_labels_df = pd.read_pickle('./test_labels.pkl')
train_labels_df = pd.read_pickle('./train_labels.pkl')

Convert the sentences into the form: [[string1,pos1,tag1],[string2,pos2,tag2],...]

In [None]:
# removing columns, bunching together sentences in one list
import itertools
import pandas as pd
key_f = lambda x: x[1]
texts_train = []
texts_test = []
for name in ["train","test"]: #training set and test set
    labels = pd.read_pickle('./' + name +'_labels.pkl') 
    for index,row in labels.iterrows():
        for column_name in ["adverse_reactions","warnings_and_precautions","boxed_warnings"]:
          for key, group in itertools.groupby(row[column_name],key_f): # here we regroup again by sentence
            sentence = []
            for i in list(group):
              token = []
              token.append(i[0]) # string
              token.append(i[2]) # POS tag
              token.append(i[3]) # NER tag
              sentence.append(token)
            globals()["texts_" + name].append(sentence)
text = texts_test + texts_train

Test out whether sentences look correct

In [None]:
sent = text[10]
print(sent)

[['The', 'DT', 'O'], ['study', 'NN', 'O'], ['population', 'NN', 'O'], ['had', 'VBD', 'O'], ['a', 'DT', 'O'], ['mean', 'JJ', 'O'], ['age', 'NN', 'O'], ['of', 'IN', 'O'], ['52', 'CD', 'O'], ['years', 'NNS', 'O'], [',', ',', 'O'], ['82', 'CD', 'O'], ['%', 'NN', 'O'], ['were', 'VBD', 'O'], ['female', 'JJ', 'O'], ['and', 'CC', 'O'], ['74', 'CD', 'O'], ['%', 'NN', 'O'], ['were', 'VBD', 'O'], ['Caucasian', 'JJ', 'O'], ['.', '.', 'O']]


Now we craft a set of features (make a feature factory) and prepare the dataset.

In [None]:
import string
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer

def get_wordnet_pos(pos_tag):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag[0]
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def is_in_table(sent):
    return sent[0][0] == "Table" and sent[1][0].isnumeric() and sent[2][0] == ':'

def is_in_excerpt(sent):
    return sent[0][0] == "*"

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word.isdigit': word.isnumeric(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.ispunct': word in string.punctuation,
        'word[-2:]': word[-2:].lower(),
        'word[-3:]': word[-3:].lower(),
        'word[-4:]': word[-4:].lower(),
        'word[-5:]': word[-5:].lower(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'word.lemmatized': lemmatizer.lemmatize(word.lower(),get_wordnet_pos(postag)), 
        'word.stemmed': stemmer.stem(word),
        'word.is_in_excerpt' : is_in_excerpt(sent),
        'word.length_sentence' : len(sent)
    }
    if i > 0: # Check 1 token before the current token
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isdigit': word1.isnumeric(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.ispunct': word1 in string.punctuation,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '-1:word.stemmed': stemmer.stem(word1)
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1: # Check 1 token after the current token
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isdigit': word1.isnumeric(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.ispunct': word1 in string.punctuation,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '+1:word.stemmed': stemmer.stem(word1)
        })
    else:
        features['EOS'] = True
        
    if i > 1: # Check the token positioned 2 places before the current token
        word1 = sent[i-2][0]
        postag1 = sent[i-2][1]
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:word.istitle()': word1.istitle(),
            '-2:word.isdigit': word1.isnumeric(),
            '-2:word.isupper()': word1.isupper(),
            '-2:word.ispunct': word1 in string.punctuation,
            '-2:postag': postag1,
            '-2:postag[:2]': postag1[:2],
            '-2:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '-2:word.stemmed': stemmer.stem(word1)
        })
    
    if i < len(sent)-2: # Check the token positioned 2 places after the current token
        word1 = sent[i+2][0]
        postag1 = sent[i+2][1]
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isdigit': word1.isnumeric(),
            '+2:word.isupper()': word1.isupper(),
            '+2:word.ispunct': word1 in string.punctuation,
            '+2:postag': postag1,
            '+2:postag[:2]': postag1[:2],
            '+2:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '+2:word.stemmed': stemmer.stem(word1)
        })
    
    if i > 2: # Check the token positioned 3 places before the current token
        word1 = sent[i-3][0]
        postag1 = sent[i-3][1]
        features.update({
            '-3:word.lower()': word1.lower(),
            '-3:word.istitle()': word1.istitle(),
            '-3:word.isdigit': word1.isnumeric(),
            '-3:word.isupper()': word1.isupper(),
            '-3:word.ispunct': word1 in string.punctuation,
            '-3:postag': postag1,
            '-3:postag[:2]': postag1[:2],
            '-3:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '-3:word.stemmed': stemmer.stem(word1)
            
        })
    
    if i < len(sent)-3: # Check the token positioned 3 places after the current token
        word1 = sent[i+3][0]
        postag1 = sent[i+3][1]
        features.update({
            '+3:word.lower()': word1.lower(),
            '+3:word.istitle()': word1.istitle(),
            '+3:word.isdigit': word1.isnumeric(),
            '+3:word.isupper()': word1.isupper(),
            '+3:word.ispunct': word1 in string.punctuation,
            '+3:postag': postag1,
            '+3:postag[:2]': postag1[:2],
            '+3:word.lemmatized': lemmatizer.lemmatize(word1.lower(),get_wordnet_pos(postag1)),
            '+3:word.stemmed': stemmer.stem(word1)
        })

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

Split the data in a test and training set (10% vs 90%)

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from sklearn.model_selection import train_test_split
X = [sent2features(s) for s in text]
y = [sent2labels(s) for s in text]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2018)

In [None]:
!pip install sklearn_crfsuite
!pip install sklearn

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 3.9MB/s eta 0:00:01
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


Run the CRFsuite algorithm with limited regularization (c1=0.1 and c2=0.1)

In [None]:
from sklearn_crfsuite import CRF
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

Get the label names from the CRF classifier

In [None]:
labels = list(crf.classes_)
labels.remove('O')

Inspect the quality of the CRF (ie check recall, precision and F1 scores)

In [None]:
from sklearn_crfsuite import metrics

y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

B-AdverseReaction      0.883     0.887     0.885      2364
I-AdverseReaction      0.818     0.836     0.827      1648
      B-DrugClass      0.625     0.500     0.556        30
      I-DrugClass      0.792     0.613     0.691        31
         B-Factor      0.710     0.528     0.606       125
       B-Severity      0.808     0.724     0.764       163
       B-Negation      0.722     0.448     0.553        29
       I-Severity      0.832     0.701     0.761       134
       I-Negation      0.000     0.000     0.000         1
         I-Factor      0.000     0.000     0.000         6
         B-Animal      0.889     0.727     0.800        11
         I-Animal      0.000     0.000     0.000         0

        micro avg      0.848     0.838     0.843      4542
        macro avg      0.590     0.497     0.537      4542
     weighted avg      0.846     0.838     0.841      4542



In [None]:
import sklearn
from  sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

Inspecting the model weights

In [None]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 2.7MB/s eta 0:00:01
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [None]:
import eli5
eli5.show_weights(crf, top=30)

Using TensorFlow backend.


From \ To,O,B-AdverseReaction,I-AdverseReaction,B-Animal,I-Animal,B-DrugClass,I-DrugClass,B-Factor,I-Factor,B-Negation,I-Negation,B-Severity,I-Severity
O,1.473,-0.003,-7.649,-0.006,0.0,-0.006,0.0,0.01,0.0,0.003,0.0,-0.001,0.0
B-AdverseReaction,-0.848,-1.321,2.975,0.0,0.0,0.0,0.0,-0.002,0.0,-0.009,0.0,-2.579,0.0
I-AdverseReaction,-0.094,-1.574,2.908,0.0,0.0,0.0,0.0,0.309,0.0,-0.813,0.0,-0.435,0.0
B-Animal,-0.1,0.0,0.0,0.0,2.316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-Animal,-1.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-DrugClass,-1.113,-0.015,0.0,0.0,0.0,0.0,5.804,0.0,0.0,0.0,0.0,0.0,0.0
I-DrugClass,-1.389,-1.558,0.0,0.0,0.0,0.0,5.877,0.0,0.0,0.0,0.0,0.0,0.0
B-Factor,-0.596,0.71,0.0,0.0,0.0,0.0,0.0,0.0,5.909,0.0,0.0,0.005,0.0
I-Factor,-2.023,-0.479,0.0,0.0,0.0,0.0,0.0,0.0,5.515,0.0,0.0,0.0,0.0
B-Negation,-0.723,1.271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.632,0.871,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+5.135,word.lemmatized:be,,,,,,,,,,,
+4.892,word.stemmed:b,,,,,,,,,,,
+3.577,EOS,,,,,,,,,,,
+3.381,word.stemmed:includ,,,,,,,,,,,
+3.302,word.lemmatized:have,,,,,,,,,,,
+3.298,word.lemmatized:a,,,,,,,,,,,
+2.766,-3:word.lower():discontinue,,,,,,,,,,,
+2.643,word.stemmed:occur,,,,,,,,,,,
+2.544,word[-2:]:ta,,,,,,,,,,,
+2.498,word[-2:]:.9,,,,,,,,,,,

Weight?,Feature
+5.135,word.lemmatized:be
+4.892,word.stemmed:b
+3.577,EOS
+3.381,word.stemmed:includ
+3.302,word.lemmatized:have
+3.298,word.lemmatized:a
+2.766,-3:word.lower():discontinue
+2.643,word.stemmed:occur
+2.544,word[-2:]:ta
+2.498,word[-2:]:.9

Weight?,Feature
+2.644,word[-5:]:died
+2.644,word.stemmed:die
+2.644,word.lemmatized:die
+2.644,word.lower():died
+2.612,word.lemmatized:fatal
+2.612,word.lower():fatal
+2.588,word.lemmatized:faint
+2.394,+1:word.stemmed:mucos
+2.359,-2:word.lower():extremity
+2.323,word[-2:]:sr

Weight?,Feature
+4.328,+2:word.lower():sound
+2.795,word.lemmatized:outcomes
+2.539,-1:word.lower():g.i
+2.539,-1:word.lemmatized:g.i
+2.539,-1:word.stemmed:g.i
+2.182,-3:word.lower():reverse
+2.128,word.lemmatized:generalize
+1.868,-3:word.lemmatized:reverse
+1.828,+3:word.lemmatized:pool
+1.803,+3:word.stemmed:M

Weight?,Feature
+2.174,word.stemmed:rat
+2.174,word.lemmatized:rat
+1.975,word.stemmed:monkey
+1.975,word.lemmatized:monkey
+1.512,word.lemmatized:rabbit
+1.512,word.stemmed:rabbit
+1.475,word.lemmatized:rodent
+1.475,word.stemmed:rodent
+1.385,-2:word.stemmed:rat
+1.385,-2:word.lemmatized:rat

Weight?,Feature
+0.714,word[-2:]:ys
+0.705,word.lemmatized:monkey
+0.705,word.stemmed:monkey
+0.704,word[-4:]:keys
+0.704,word[-5:]:nkeys
+0.704,word.lower():monkeys
+0.704,-1:word.lower():cynomolgus
+0.704,-1:word.stemmed:cynomolgu
+0.704,-1:word.lemmatized:cynomolgus
+0.704,word[-3:]:eys

Weight?,Feature
+1.871,-1:word.lower():using
+1.618,-1:word.lemmatized:related
+1.536,word.stemmed:coc
+1.499,-3:word.lower():43/938
+1.499,-3:word.lemmatized:43/938
+1.499,-3:word.stemmed:43/938
+1.472,word.stemmed:steroid
+1.472,word.lemmatized:steroid
+1.457,-2:word.lower():showed
+1.456,word.lower():nnrti-related

Weight?,Feature
+2.152,+1:word.lower():include
+1.772,+1:word.lower():use
+1.668,+2:word.lower():include
+1.620,-1:word.lower():treat
+1.394,word.lemmatized:preparation
+1.278,word.stemmed:prepar
+1.255,+3:word.lower():agonist
+1.169,-3:word.lower():exposure
+1.133,word.lemmatized:blocker
+1.133,word.stemmed:blocker

Weight?,Feature
+3.243,-1:word.lower():systems
+3.222,+1:word.lower():arms
+3.074,+1:word.lower():leads
+2.572,+3:word.lemmatized:observed
+2.203,+2:word.lower():resulted
+2.134,-3:word.lower():corticosteroids
+1.987,+3:word.lemmatized:botulinum
+1.987,+3:word.stemmed:botulinum
+1.987,+3:word.lower():botulinum
+1.798,-3:word.lemmatized:associated

Weight?,Feature
+1.158,+3:word.lemmatized:hyperplasia
+1.158,+3:word.stemmed:hyperplasia
+1.158,+3:word.lower():hyperplasia
+1.051,+3:word.lower():peginterferon
+1.051,+3:word.lemmatized:peginterferon
+1.051,+3:word.stemmed:peginterferon
+1.044,+2:word.lower():peginterferon
+1.044,+2:word.lemmatized:peginterferon
+1.044,+2:word.stemmed:peginterferon
+1.040,+3:word.stemmed:endometri

Weight?,Feature
+2.499,+2:word.lower():increase
+2.187,-1:word.lower():received
+1.616,word.lemmatized:chemotherapy-treated
+1.616,word.stemmed:chemotherapy-tr
+1.616,word.lower():chemotherapy-treated
+1.526,-3:word.lemmatized:bleed
+1.500,+2:word.lower():experienced
+1.479,+3:word.lower():representing
+1.472,+1:word.lower():increased
+1.415,+3:word.lemmatized:represent

Weight?,Feature
+1.914,-3:word.stemmed:normal
+1.742,-1:word.lower():chemotherapy-treated
+1.742,-1:word.lemmatized:chemotherapy-treated
+1.742,-1:word.stemmed:chemotherapy-tr
+1.276,+2:word.stemmed:apixaban
+1.276,+2:word.lemmatized:apixaban
+1.276,+2:word.lower():apixaban
+1.257,+3:word.lower():apixaban
+1.257,+3:word.lemmatized:apixaban
+1.257,+3:word.stemmed:apixaban

Weight?,Feature
+2.331,-3:word.stemmed:hct
+2.331,-3:word.lemmatized:hct
+2.331,-3:word.lower():hct
+2.114,+2:word.lower():increases
+1.916,-3:word.lower():transaminases
+1.897,-2:word.stemmed:subclin
+1.897,-2:word.lemmatized:subclinical
+1.897,-2:word.lower():subclinical
+1.817,word.lower():substantial
+1.817,word.lemmatized:substantial

Weight?,Feature
+2.213,+2:postag[:2]:PR
+1.886,-1:word.stemmed:life
+1.886,-1:word.lower():life
+1.886,-1:word.lemmatized:life
+1.753,-1:word.lower():required
+1.723,+2:word.lower():occurrence
+1.714,+2:word.lemmatized:occurrence
+1.714,+2:word.stemmed:occurr
+1.526,-3:word.lower():hb
+1.526,-3:word.lemmatized:hb


In [None]:
import pickle
import os
print(os.getcwd())
path = os.getcwd()+'/crf.pickle'
pickle.dump(crf, open(path, 'wb'))

/content


TODO: Using a suggested approach to find the best regularisation parameters from https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

In [None]:
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=False
)
params_space = {
    'c1': scipy.stats.expon(scale=1),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=100,
                        n_jobs=3,
                        n_iter=10,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Look at the best classifier found by the random search.

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

NameError: ignored

Look at which regularization parameters the random search considered

In [None]:
_x = rs.cv_results_['param_c1']
_y = rs.cv_results_['param_c2']
_c = rs.cv_results_['mean_test_score']

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

Show the quality of the best CRF found by the random search

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))
crf.to_pickle('crf_best_estimator_.pkl')

Check the weights of the new CRF classifier

In [None]:
import eli5
eli5.show_weights(crf, top=30)

If you want to test the model on any string


In [None]:
import nltk
from nltk import word_tokenize

input_text = 'Gastrointestinal: anorexia, nausea, vomiting, diarrhea, glossitis, dysphagia, enterocolitis, inflammatory lesions (with monilial overgrowth) in the anogenital region, and pancreatitis. Hepatotoxicity has been reported rarely. These reactions have been caused by both the oral and parenteral administration of tetracyclines. Superficial discoloration of the adult permanent dentition, reversible upon drug discontinuation and professional dental cleaning has been reported. Permanent tooth discoloration and enamel hypoplasia may occur with drugs of the tetracycline class when used during tooth development. (See  WARNINGS  .) Rare instances of esophagitis and esophageal ulcerations have been reported in patients receiving capsule and tablet forms of the drugs in the tetracycline class. Most of these patients took medications immediately before going to bed. (See  DOSAGE AND ADMINISTRATION  .)'
tokens = word_tokenize(input_text)
tagged_text = [nltk.pos_tag(tokens)]
test_set = [sent2features(s) for s in tagged_text]
crf.predict(test_set)

# LSTM-CRF

Bidirectional LSTM-CRF

Inspired from: https://www.depends-on-the-definition.com/sequence-tagging-lstm-crf/

Load data from pickles

In [None]:
test_labels_df = pd.read_pickle('./test_labels.pkl')
train_labels_df = pd.read_pickle('./train_labels.pkl')

In [None]:
# removing columns, bunching together sentences in one list
import itertools
import pandas as pd
key_f = lambda x: x[1]
texts_train = []
texts_test = []
for name in ["train","test"]: #training set and test set
    labels = pd.read_pickle('./' + name +'_labels.pkl') 
    for index,row in labels.iterrows():
        for column_name in ["adverse_reactions","warnings_and_precautions","boxed_warnings"]:
          for key, group in itertools.groupby(row[column_name],key_f): # here we regroup again by sentence
            sentence = []
            for i in list(group):
              token = []
              token.append(i[0]) # string
              token.append(i[2]) # POS tag
              token.append(i[3]) # NER tag
              sentence.append(token)
            globals()["texts_" + name].append(sentence)
text = texts_test + texts_train

Number of different words:

In [None]:
word_list = []
for sentence in text:
  for token in sentence:
    word_list.append(token[0])
word_list.append("ENDPAD")
words = list(set(word_list))

In [None]:
#Example of finding a word
""" 
i = 0
for sentence in text:
  for token in sentence:
    if token[0] == 'anaphylaxis':
      print(sentence)
      print(i)
  i += 1 """

In [None]:
text[2743]

In [None]:
#tags = ['AdverseReaction','Severity','Factor','Negation','Animal','DrugClass','O']

Check the length of the sentences.

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")

plt.hist([len(s) for s in text], bins=100,range=[0, 400])
plt.show()

Introduce dictionaries of works and tags.

In [None]:
max_len = 200
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [None]:
#tokenize & prepare sentences (map sentence to numbers & pad sequence)
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in text]

In [None]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
y = [[tag2idx[w[2]] for w in s] for s in text]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [None]:
#change labels y to categorical
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [None]:
#split train & test set
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=2018)

In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Train model -fit LSTM-CRF with embedding layer

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [None]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=len(words) + 1, output_dim=20,
                  input_length=max_len)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(len(tags))  # CRF layer
out = crf(model)  # output

In [None]:
model = Model(input, out)

In [None]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])


In [None]:
model.summary()


In [None]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)


In [None]:
hist = pd.DataFrame(history.history)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12, 12))
plt.plot(hist["loss"])
plt.plot(hist["val_loss"])
plt.show()

In [None]:
!pip install seqeval

Evaluate model systematically

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
test_pred = model.predict(X_te, verbose=1)

In [None]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [None]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

In [None]:
print(classification_report(test_labels, pred_labels))

In [None]:
i = 625
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_te[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))

In [None]:
test_sentence = ["The", "most", "commonly", "reported", "side", "effect", "of", "methoxsalen", "alone", "is", "nausea", "which", "occurs", "with", "approximately", "10%", "of", "all", "patients",".",
"Other", "effects", "include", "nervousness", "insomnia", "and", "psychological", "depression","."]

Transform every word to integer index. Map unknown words to zero.

In [None]:
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=0, maxlen=max_len)

In [None]:
tags

In [None]:
p = model.predict(np.array([x_test_sent[0]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred]))

# Extending LSTM-CRF with character embeddings

Postfix and prefix contain a lot of information about meaning of a word, helps in text with rare words and expect a lot of unknown - like medical texts.

Taken from: https://www.depends-on-the-definition.com/lstm-with-char-embeddings-for-ner/ 

In [None]:
# Loading data & formatting to one text list
import itertools
import pandas as pd
key_f = lambda x: x[1]
texts_train = []
texts_test = []
for name in ["train","test"]: #training set and test set
    labels = pd.read_pickle('./drive/My Drive/Pickles/' + name +'_labels.pkl') 
    for index,row in labels.iterrows():
        for column_name in ["adverse_reactions","warnings_and_precautions","boxed_warnings"]:
          for key, group in itertools.groupby(row[column_name],key_f): # here we regroup again by sentence
            sentence = []
            for i in list(group):
              token = []
              token.append(i[0]) # string
              token.append(i[2]) # POS tag
              token.append(i[3]) # NER tag
              sentence.append(token)
            globals()["texts_" + name].append(sentence)
text = texts_test + texts_train

#Creating word list
word_list = []
for sentence in text:
  for token in sentence:
    word_list.append(token[0])
word_list.append("ENDPAD")
words = list(set(word_list))

def get_tags(text):
  tag_list = []
  for i in text:
    for j in i:
      tag_list.append(j[2])
  return list(set(tag_list))

#Defining tags
tags = get_tags(text)
print(tags)

n_words = len(words); n_words

Prepare tokens

In [None]:
##Finding longest word in database
largestWord=""
largestLen=0
for word in words:
  if largestLen<len(word):
    largestLen=len(word)
    largestWord=word

largestWord
#len(largestWord)

'//www.cdc.gov/vaccines/schedules/index.html'

Introduce dictionaries of words and tags

In [None]:
max_len = 200 #Sentence length
max_len_char = 10 #Set rather arbitrarily, could use longer or shorter

n_words = len(words); n_words

In [None]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [None]:
print(word2idx["dizziness"])
print(tag2idx["O"])

Map sentences to a sequence of numbers and then pad the sequence

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
X_word = [[word2idx[w[0]] for w in s] for s in text]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

In [None]:
#Dictionary for characters & create sequence of characters for every token
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

In [None]:
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

In [None]:
X_char = []
for sentence in text:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

In [None]:
y = [[tag2idx[w[2]] for w in s] for s in text]
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')

In [None]:
#Split train and test set
from sklearn.model_selection import train_test_split

X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)

Train model

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

Wrap parts that should apply to the characters in a TimeDistributed layer to apply same layers to every character sequence

In [None]:
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)

# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char, mask_zero=True))(char_in)
# character LSTM to get word encodings by characters
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)

# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x) #Model - used recurrend_dropout 0.1 in LSTM
out = TimeDistributed(Dense(len(tags) + 1, activation="sigmoid"))(main_lstm) #Use relu activation function?

model = Model([word_in, char_in], out)

In [None]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
model.summary()

Model fit takes a bit of time (about 2 min per epoch)

In [None]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, validation_split=0.1, verbose=1)


In [None]:
hist = pd.DataFrame(history.history)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(hist["acc"])
plt.plot(hist["val_acc"])
plt.show()

In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(hist["loss"])
plt.plot(hist["val_loss"])
plt.show()

Look at predictions

In [None]:
y_pred = model.predict([X_word_te,
                        np.array(X_char_te).reshape((len(X_char_te),
                                                     max_len, max_len_char))])

In [None]:
i = 625 #725 looks weird (726 AUBAGIO), pred not all O #625 'true' is odd (626 SIGNIFOR)
#7 looks good #627 good
p = np.argmax(y_pred[i], axis=-1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_word_te[i], y_te[i], p):
    if w != 0:
        print("{:15}: {:5} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))


In [None]:
!pip install seqeval

Evaluate

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
test_pred = model.predict([X_word_te,
                        np.array(X_char_te).reshape((len(X_char_te),
                                                     max_len, max_len_char))], verbose=1) #include verbose=1?
                                                     

In [None]:
i = 625 #725 looks weird (726 AUBAGIO), pred not all O #625 'true' is odd (626 SIGNIFOR)
#7 looks good #627 good
p = np.argmax(test_pred[i], axis=-1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_word_te[i], y_te[i], p):
    if w != 0:
        print("{:15}: {:5} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))

In [None]:
def pred2label1(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p,axis=-1)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out

def pred2label2(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            # p_i = np.argmax(p,axis=-1)
            out_i.append(idx2tag[p].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label1(test_pred)
test_labels = pred2label2(y_te)


print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

In [None]:
print(classification_report(test_labels, pred_labels))

# Relations

First need to format properly. Afterwards, this is a good source: https://www.depends-on-the-definition.com/attention-lstm-relation-classification/

Assuming have downloaded files - maybe copy paste that code here too

# Negation (SpaCy)

'Tries to use a slightly modified version of Das and Chen (2001) (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.202.6418&rep=rep1&type=pdf). They detect words such as no, not, and never and then append a "neg"-suffix to every word appearing between a negation and a clause-level punctuation mark. Here want to create something similar with dependency parsing from spaCy.'

Taken from: https://stackoverflow.com/questions/54849111/negation-and-dependency-parsing-with-spacy

**Very** similar to: https://codereview.stackexchange.com/questions/214665/define-the-scope-of-negation-with-the-dependency-parser-of-spacy

In [None]:
!pip3 install spacy



In [None]:
import spacy
from spacy import displacy

nlp = spacy.load('en')

In [None]:
test_labels_df = pd.read_pickle('./test_labels.pkl')
train_labels_df = pd.read_pickle('./train_labels.pkl')

In [None]:
doc = nlp("Superficial discoloration of the adult permanent dentition is not reversible upon drug discontinuation but is reversible upon professional dental cleaning.")

In [None]:
doc2 = nlp("This doesn't result in severe skin_reactions")

In [None]:
#Slow, wants to open in new file; supposed to show graph of dependencies
options = {'compact': True, 'color': 'black', 'font': 'Arial'}
displacy.render(doc, style='dep', jupyter=True) #, options=options

In [None]:
negation_tokens = [tok for tok in doc if tok.dep_ == 'neg']
negation_head_tokens = [token.head for token in negation_tokens]

for token in negation_head_tokens:
    print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])

is ROOT is AUX [discoloration, not, reversible, but, is, .]


In [None]:
negation = [tok for tok in doc if tok.dep_ == 'neg']

In [None]:
print('DEPENDENCY RELATIONS')
print('Key: ')
print('TEXT, DEP, HEAD_TEXT, HEAD_POS, CHILDREN')

for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
      [child for child in token.children])

DEPENDENCY RELATIONS
Key: 
TEXT, DEP, HEAD_TEXT, HEAD_POS, CHILDREN
Superficial amod discoloration NOUN []
discoloration nsubj is AUX [Superficial, of]
of prep discoloration NOUN [dentition]
the det dentition NOUN []
adult compound dentition NOUN []
permanent amod dentition NOUN []
dentition pobj of ADP [the, adult, permanent]
is ROOT is AUX [discoloration, not, reversible, but, is, .]
not neg is AUX []
reversible acomp is AUX [upon]
upon prep reversible ADJ [discontinuation]
drug compound discontinuation NOUN []
discontinuation pobj upon SCONJ [drug]
but cc is AUX []
is conj is AUX [reversible]
reversible acomp is AUX [upon]
upon prep reversible ADJ [cleaning]
professional amod cleaning NOUN []
dental amod cleaning NOUN []
cleaning pobj upon SCONJ [professional, dental]
. punct is AUX []


In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is not a text")
[token.text for token in doc]
[token.dep_ for token in doc]

['nsubj', 'ROOT', 'neg', 'det', 'attr']

Setting up StanfordNLP in Python - Stanford parser recommended by teacher for dependecy grammar (good for negation handling)

Basics: https://www.analyticsvidhya.com/blog/2019/02/stanfordnlp-nlp-library-python/ 

In [None]:
!pip install stanfordnlp

In [None]:
import stanfordnlp

In [None]:
stanfordnlp.download('en')

In [None]:
pip freeze | grep torch

In [None]:
#nlp = stanfordnlp.Pipeline(processors = "tokenize,mwt,lemma,pos")

In [None]:
doc = nlp("""Superficial discoloration of the adult permanent dentition, 
not reversible upon drug discontinuation but reversible upon professional dental cleaning has been reported.""")

In [None]:
# WORKS
#tokenization 
doc.sentences[0].print_tokens()

In [None]:
#EMPTY OUTPUT
#dictionary that contains pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word','IN':  'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take','VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

#extract parts of speech
def extract_pos(doc):
  parsed_text = {'word':[], 'pos':[], 'exp':[]}
  for sent in doc.sentences:
    for wrd in sent.words:
      if wrd.pos in pos_dict.keys():
        pos_exp = pos_dict[wrd.pos]
      else:
        pos_exp = 'NA'
        parsed_text['word'].append(wrd.text)
        parsed_text['pos'].append(wrd.pos)
        parsed_text['exp'].append(pos_exp)
  #return a dataframe of pos and text
  return pd.DataFrame(parsed_text)

#extract pos
extract_pos(doc)

Unnamed: 0,word,pos,exp
0,",",",",
1,.,.,


In [None]:
#DOESN'T WORK
doc.sentences[0].print_dependencies()

Recommended to use Stanford CoreNLP Servr as a client - seems rather complicated
https://stackoverflow.com/questions/56527814/stanford-typed-dependencies-using-corenlp-in-python
Detailed
Goes through Java installation and the server link:
https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/

# Drug - Therapeutic indication (RxClass API)

RxClass API Python wrapper (to find therapeutic uses of drugs)
From: https://github.com/Ianphorsman/RxClassAPIWrapper

Careful: Many false negative results

RxNorm - given by national Library of Medicine - NIH sponsored.

RxClass API explanation (medium useful): https://rxnav.nlm.nih.gov/RxClassAPIs.html#
https://rxnav.nlm.nih.gov/RxClassIntro.html 

Web interface of RxClass - can research manually drugs: https://mor.nlm.nih.gov/RxClass/search?query=gilotrif&searchBy=drug&sourceIds=&drugSources=atc1-4%7Catc%2Cepc%7Cdailymed%2Cmeshpa%7Cmesh%2Cdisease%7Cmedrt%2Cchem%7Cdailymed%2Cmoa%7Cdailymed%2Cpe%7Cdailymed%2Cpk%7Cmedrt%2Ctc%7Cfmtsme%2Cva%7Cva%2Cdispos%7Csnomedct%2Cstruct%7Csnomedct%2Cschedule%7Crxnorm

In [None]:
import nltk
import numpy as np
import os # to access your directories 
import pandas as pd # useful table stuff to do manipulations on (and stuff)
import matplotlib.pyplot as plt # plotting stuff when necessary 
from tqdm import tqdm # most awesome package around

In [None]:
!pip install rxclass

In [None]:
# To access API helpers:

from rxclass_api import RxClassHelpers

# To access API wrapper functions directly:
from rxclass_api import RxAPIWrapper

# To start using available helper functions create a helper instance.

helper = RxClassHelpers()

Supplying a with statement will also automatically load and save gathered data given a filename.

In [None]:
helper = RxClassHelpers(filename='data')
with helper:
    ...

Obtaining Class's Id and Type

Unique identifiers are represented as classId(s). Every class in RxClass has a classId and classType.

In [None]:
helper.get_class_by_name('fluoxetine')

In [None]:
helper.get_class_by_name('farxiga')

In [None]:
helper.get_class_by_name('Drug hypersensitivity')

List all the class types with descriptions

In [None]:
helper.list_class_types()

Drug indications

In [None]:
helper.indications('imbruvica')

Drug's Mechanism of Action

In [None]:
helper.mechanism_of_action('gilotrif')

In [None]:
helper.similarly_acting_drugs('eliquis')

Class information of a given drug

In [None]:
helper.drug_info('eliquis')

Drugs that may prevent or treat a condition or acute reaction/ response

In [None]:
helper.drugs_that_may('prevent', 'seizure disorder')

helper.drugs_that_may('treat', 'b-cell lymphoma') #or seizures

# Word Cloud


Declare chunker function

In [None]:
import re 

def chunker(text,tag_list,debugging = False):
    # split text into sentences
    total_chunks = []
    sentences = get_sentences(text)
    for sentence in sentences:
        sentence_chunks = chunker_helper(sentence,tag_list,debugging = False)
        total_chunks += sentence_chunks
    return total_chunks
    

def chunker_helper(sentence,tag_list,debugging = False):
    path = os.getcwd()+'/crf09052020_2.pickle'
    crf = pickle.load(open(path, 'rb'))
    #split sentence into tokens
    tokens = extra_tokenization(word_tokenize(sentence))
    if debugging:
        print(tokens)
    tagged_text = [nltk.pos_tag(tokens)]
    feature_vector = [sent2features(s) for s in tagged_text]
    predictions = crf.predict(feature_vector)
    group_token = ['','']
    list_of_adverse_reactions = []
    previous_tag = ''
    for i,j in zip(tokens,predictions[0]):
        if j[2:] in tag_list:
            print(i,j)
            if j[0] == 'B':
                if group_token[0] != '':
                    final_group_token = [group_token[0][:-1],group_token[1]]
                    list_of_adverse_reactions.append(final_group_token[0].lower())
                group_token = ['','']
                group_token[1] = j[2:]
                group_token[0] += i + ' '
            else:
                if j[0] == 'I':
                    if group_token[1] == j[2:] and group_token[1]:
                        group_token[0] += i + ' '
    if group_token[0] != '':
                final_group_token = [group_token[0][:-1],group_token[1]]
                list_of_adverse_reactions.append(final_group_token[0].lower())
    return list_of_adverse_reactions

Get data from unannotated data set (2208 labels)

In [None]:
from tqdm import tqdm
import pickle
from nltk import word_tokenize

df_cols = ["adverse_reactions"]
index = []
df_rows = []

for filename in sorted(os.listdir(directory_unannotated)):
#     print(filename)
    if filename.endswith(".xml"):
        index.append(filename[:-4]) 
        text_file = ''
        fullname = os.path.join(directory_unannotated, filename)
        tree = ET.parse(fullname)
        root = tree.getroot()
        for section in root.findall('./Text/Section'):
            text_file += section.text + " "
        df_rows.append({'adverse_reactions': chunker(text_file,['AdverseReaction'])})
df = pd.DataFrame(df_rows,columns = df_cols, index=index)
df.to_pickle('./unanno.pkl')

Count frequency of found entities

In [None]:
df = pd.read_pickle('./unanno.pkl') # Experiment on unannotated data with pickle 4 ()
def count_frequency(my_list):  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
    return {k: v for k, v in sorted(freq.items(), key=lambda item: item[1],reverse = True)}
  
def distinct_elements(my_list):
    return(list(set(my_list)))

def length_list(my_list):
    return len(my_list)
df['adverse_reactions'] = df['adverse_reactions'].apply(distinct_elements)

total_list = []
for i in df['adverse_reactions']:
    total_list += i
    
df['adverse_reactions'] = df['adverse_reactions'].apply(count_frequency)
unique_list = distinct_elements(total_list)
dict = count_frequency(total_list)
print(len(unique_list))
unique_list
dict

ValueError: ignored

In [None]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

pill_mask = np.array(Image.open("759ff92884237a32a82e6f49b86c0aac.png"))

wordcloud_pill = WordCloud(background_color="white", mode="RGBA", max_font_size=500,min_font_size=5, max_words=2000, mask=pill_mask).generate_from_frequencies(frequencies=dict)

image_colors = ImageColorGenerator(pill_mask)
plt.figure(figsize=[75,75])
plt.imshow(wordcloud_pill.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
wordcloud_pill.to_file("cross2.png")

plt.show()

# Get aliases from UMLS 2017 database


In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

Collecting scispacy
  Downloading https://files.pythonhosted.org/packages/eb/50/95cd574c3ccf4a268b334ea3c4c3cf9f95d1f24d6c0be82024d51c3e460b/scispacy-0.2.4.tar.gz
Collecting awscli
[?25l  Downloading https://files.pythonhosted.org/packages/6f/11/5c77e1d568c5c9b34b8dfd5e00cadca8ce95435de1ef13052b26ffbc0be2/awscli-1.18.58-py2.py3-none-any.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 6.2MB/s 
[?25hCollecting conllu
  Downloading https://files.pythonhosted.org/packages/a8/03/4a952eb39cdc8da80a6a2416252e71784dda6bf9d726ab98065fff2aeb73/conllu-2.3.2-py2.py3-none-any.whl
Collecting nmslib>=1.7.3.6
[?25l  Downloading https://files.pythonhosted.org/packages/d5/fd/7d7428d29f12be5d1cc6d586d425b795cc9c596ae669593fd4f388602010/nmslib-2.0.6-cp36-cp36m-manylinux2010_x86_64.whl (12.9MB)
[K     |████████████████████████████████| 13.0MB 239kB/s 
Collecting pysbd
  Downloading https://files.pythonhosted.org/packages/3b/49/4799b3cdf80aee5fa4562a3929eda738845900bbeef4ee60481196ad4d1a/py

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
[?25l  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz (500.6MB)
[K     |████████████████████████████████| 500.6MB 16kB/s 
Building wheels for collected packages: en-core-sci-lg
  Building wheel for en-core-sci-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-sci-lg: filename=en_core_sci_lg-0.2.4-cp36-none-any.whl size=501343162 sha256=bea0201dc66c0c36cfd4bd92081e182a3eb6f123678f089ca33f4b801e8039da
  Stored in directory: /root/.cache/pip/wheels/ea/ab/e5/fa667519032799529ce6a50944a82d6ae3603819cd07836aa2
Successfully built en-core-sci-lg
Installing collected packages: en-core-sci-lg
Successfully installed en-core-sci-lg-0.2.4


In [None]:
import en_core_sci_lg
import scispacy
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.candidate_generation import CandidateGenerator
from scispacy.umls_utils import UmlsKnowledgeBase

candidate_generator = CandidateGenerator()

# kb = UmlsKnowledgeBase() # 2017 UMLS database

def get_aliases(entity,neigbhours):
    concept_id = candidate_generator.__call__([entity],k=neigbhours)[0][0].concept_id
    print(kb.cui_to_entity[concept_id].definition)
    return list(set([x.lower() for x in kb.cui_to_entity[concept_id].aliases]))

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmp35ra76s9
Finished download, copying /tmp/tmp35ra76s9 to cache at /root/.scispacy/datasets/ea855fd121a193f03190a91417c209d4cd97e63d3ce4b456c248ef7c13a4ca77.03518aabd12de2103a27a50302f37c3d87b0f313a8be08b5ec306c9c4334b9b1.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin not found in cache, downloading to /tmp/tmpty_31x7z
Finished download, copying /tmp/tmpty_31x7z to cache at /root/.scispacy/datasets/5f620d1bd549a98c005ed601a73806ea2cd1a86ae6c54bbc62bcb3b452ca2630.27a7ac6807fde6628311ff7d70b86fefc640d0eb70637b544c591722a2c16c2a.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpygpjea7i
Finished download, copying /tmp/tmpygpjea7i to cache at /root/.scispacy/datasets/ffb7a77cdcb3c9233c1e400



https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json not found in cache, downloading to /tmp/tmpts8f39mh
Finished download, copying /tmp/tmpts8f39mh to cache at /root/.scispacy/datasets/0f064d20aefab965d5772b2100f8436b3541e7d5313c76cfe5fe070902f149fe.31df9cdb04729860a81bd6c980224ed2bff582586c398d0c9b96ae4e257b9da2.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json not found in cache, downloading to /tmp/tmpsih1m6c7
Finished download, copying /tmp/tmpsih1m6c7 to cache at /root/.scispacy/datasets/13b30cd31cd37c1b52f3df6ea023061172d16e9941660e677fdbb29489af7410.4ad71d86ce780e00cab131c7e3b81acfd2f11dd80ccd61125c8bcde506f2ab8a.umls_2017_aa_cat0129.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpbvd7mvg1
Finished download, copying /tmp/tmpbvd7mvg1 to cache at /root/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f