<a href="https://colab.research.google.com/github/lucyquirant/text_mining/blob/master/Text_Mining_Project_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the notebook's dependencies with shell commands (`!` runs each line in a subshell).
# Note: numpy is installed here and then upgraded again by the `-U numpy scipy scikit-learn` line.
# The last line downloads the small English spaCy model `en_core_web_sm`.
!pip3 install nltk 
!pip3 install wget
!pip3 install numpy
!pip3 install tqdm
!pip3 install matplotlib
!pip3 install -U numpy scipy scikit-learn # maybe we need this, has a library which allows for tf-idf
!pip3 install spacy
!python3 -m spacy download en_core_web_sm

In [0]:
import nltk
import numpy as np
import wget
import tarfile
import xml.etree.ElementTree as ET
import os
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Resolve the working directory once and derive both corpus folders from it.
path = os.getcwd()
directory_annotated = path + '/train_xml'
directory_unannotated = path + '/unannotated_xml'

In [0]:
# Download the two corpus archives (annotated training set and unannotated set)
# into the working directory and unpack them next to the notebook.
for archive_name in ('train_xml.tar.gz', 'unannotated_xml.tar.gz'):
    # Skip the download when the archive is already present so that
    # "Restart & Run All" does not fetch the same file again.
    if not os.path.exists(archive_name):
        wget.download('https://bionlp.nlm.nih.gov/tac2017adversereactions/' + archive_name)
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it on archives from a source you trust.
    # The context manager closes the tar file handle once extraction is done.
    with tarfile.open(archive_name) as archive:
        archive.extractall()

In [0]:
# Create one pandas DataFrame per corpus (annotated / unannotated) holding the raw
# text of the three label sections, and persist each frame as a pickle.
def read_label_sections(xml_directory):
    """Read every ``*.xml`` file in ``xml_directory`` into a list of row dicts.

    Each row has the keys ``file_name``, ``adverse_reactions``,
    ``warnings_and_precautions`` and ``boxed_warnings``. A section that is
    missing from the file, or whose element has no text, is stored as ``""``
    so that later string processing never receives ``None``.

    Files are visited in sorted order; non-XML directory entries are ignored.
    """
    # Maps the <Section name="..."> attribute to the DataFrame column it fills.
    column_for_section = {
        "adverse reactions": "adverse_reactions",
        "warnings and precautions": "warnings_and_precautions",
        "boxed warnings": "boxed_warnings",
    }
    rows = []
    for filename in tqdm(sorted(os.listdir(xml_directory))):
        if not filename.endswith(".xml"):
            continue
        root = ET.parse(os.path.join(xml_directory, filename)).getroot()
        row = {"file_name": filename, "adverse_reactions": "", "warnings_and_precautions": "", "boxed_warnings": ""}
        for section in root.findall('./Text/Section'):
            column = column_for_section.get(section.get('name'))
            if column is not None:
                # Element.text is None for an element without text content.
                row[column] = section.text or ""
        rows.append(row)
    return rows


df_cols_annotated = ['file_name',"adverse_reactions","warnings_and_precautions","boxed_warnings"]
df_rows_annotated = read_label_sections(directory_annotated)

out_df = pd.DataFrame(df_rows_annotated,columns = df_cols_annotated)
out_df.to_pickle("./drug_labels_annotated.pkl")

df_cols_unannotated = ['file_name',"adverse_reactions","warnings_and_precautions","boxed_warnings"]
df_rows_unannotated = read_label_sections(directory_unannotated)

out_df = pd.DataFrame(df_rows_unannotated,columns = df_cols_unannotated)
out_df.to_pickle("./drug_labels_unannotated.pkl")

In [0]:
# Tokenize the text in the pandas DataFrames using NLTK (maybe try spaCy and Gensim as well)
from nltk import word_tokenize

def span(text):
    """Yield ``[token, start, end]`` for each NLTK word token of ``text``.

    ``start``/``end`` are character offsets into ``text`` (``end`` exclusive),
    found by searching for each token from the end of the previous match.

    * ``None`` or empty text yields nothing.
    * The tokenizer may emit `` or '' for a double quote in the input; for
      those tokens a literal ``"`` is also searched and the earliest match wins,
      so the span covers the single quote character actually in the text.
    * A token that cannot be located is yielded as ``[token, -1, -1]`` and the
      search position is left untouched, so one miss cannot shift every
      following offset.
    """
    if not text:
        return
    cursor = 0
    for token in nltk.word_tokenize(text):
        surface_forms = [token]
        if token in ("``", "''"):
            surface_forms.append('"')
        best_start, best_length = -1, 0
        for form in surface_forms:
            position = text.find(form, cursor)
            if position != -1 and (best_start == -1 or position < best_start):
                best_start, best_length = position, len(form)
        if best_start == -1:
            # Not alignable: report it without moving the cursor.
            yield [token, -1, -1]
            continue
        yield [token, best_start, best_start + best_length]
        cursor = best_start + best_length
  

span_text = lambda x : span(x)

# Tokenize the three section columns of the annotated frame, then persist it.
drug_labels_annotated = pd.read_pickle('./drug_labels_annotated.pkl')
for section_column in ('adverse_reactions', 'warnings_and_precautions', 'boxed_warnings'):
    # span() is a generator, so the second apply materializes each one into a list
    lazy_spans = drug_labels_annotated[section_column].apply(span_text)
    drug_labels_annotated[section_column] = lazy_spans.apply(list)
drug_labels_annotated.to_pickle("./drug_labels_annotated_tokens.pkl")

# Same treatment for the unannotated frame.
drug_labels_unannotated = pd.read_pickle('./drug_labels_unannotated.pkl')
for section_column in ('adverse_reactions', 'warnings_and_precautions', 'boxed_warnings'):
    lazy_spans = drug_labels_unannotated[section_column].apply(span_text)
    drug_labels_unannotated[section_column] = lazy_spans.apply(list)
drug_labels_unannotated.to_pickle("./drug_labels_unannotated_tokens.pkl")

In [0]:
# Spot-check: show the tokenized adverse-reactions entry stored under index label 10.
drug_labels_annotated['adverse_reactions'][10]

In [0]:
# # Lemmatize with POS Tag
# from nltk.corpus import wordnet
# drug_labels_annotated =  pd.read_pickle('./drug_labels_annotated_tokens.pkl')
# drug_labels_unannotated = pd.read_pickle('./drug_labels_unannotated_tokens.pkl')
# def get_wordnet_pos(tag):
#     """Map POS tag to first character lemmatize() accepts"""
#     tag = tag[0].upper()
#     tag_dict = {"J": wordnet.ADJ,
#                 "N": wordnet.NOUN,
#                 "V": wordnet.VERB,
#                 "R": wordnet.ADV}

#     return tag_dict.get(tag, wordnet.NOUN)
  
# def lemmatize_text(words):
#     return [lemmatizer.lemmatize(word[0],get_wordnet_pos(word[1])) for word in words]

# lemmatizer = WordNetLemmatizer()

# drug_labels_annotated['adverse_reactions'] = drug_labels_annotated['adverse_reactions'].apply(lemmatize_text) # each xml gets tokenized
# drug_labels_annotated['warnings_and_precautions'] = drug_labels_annotated['warnings_and_precautions'].apply(lemmatize_text) # each xml gets tokenized
# drug_labels_annotated['boxed_warnings'] = drug_labels_annotated['boxed_warnings'].apply(lemmatize_text) # each xml gets tokenized

# drug_labels_unannotated['adverse_reactions'] = drug_labels_unannotated['adverse_reactions'].apply(lemmatize_text) # each xml gets tokenized
# drug_labels_unannotated['warnings_and_precautions'] = drug_labels_unannotated['warnings_and_precautions'].apply(lemmatize_text) # each xml gets tokenized
# drug_labels_unannotated['boxed_warnings'] = drug_labels_unannotated['boxed_warnings'].apply(lemmatize_text) # each xml gets tokenized

In [0]:
# back to the xml file, we want to add the tags from the training file to the tokens
drug_labels_annotated = pd.read_pickle('./drug_labels_annotated.pkl')
drug_labels_annotated_tokens = pd.read_pickle('./drug_labels_annotated_tokens.pkl')

# <Section name="..."> -> DataFrame column holding that section's tokens.
_SECTION_NAME_TO_COLUMN = {
    "adverse reactions": 'adverse_reactions',
    "warnings and precautions": 'warnings_and_precautions',
    "boxed warnings": 'boxed_warnings',
}
# Fallback for the Mention "section" attribute when a file does not say which
# section an id belongs to (this is the order the notebook originally assumed).
_DEFAULT_SECTION_ID_TO_COLUMN = {
    'S1': 'adverse_reactions',
    'S2': 'warnings_and_precautions',
    'S3': 'boxed_warnings',
}


def _label_overlapping_tokens(tokens, start, end, tag):
    """Set the label slot (index 3) of every token overlapping ``[start, end)``.

    Tokens are ``[text, tok_start, tok_end, label]`` lists and are updated in
    place. Only tokens that actually share characters with the interval are
    labelled, so a token lying entirely after the mention is never tagged.
    """
    for token in tokens:
        if token[1] < end and token[2] > start:
            token[3] = tag


for row_label in tqdm(drug_labels_annotated_tokens.index):
    filename = drug_labels_annotated_tokens.at[row_label, 'file_name']
    root = ET.parse(os.path.join(directory_annotated, filename)).getroot()

    # Give every token a default label of 0 (no mention) in a fourth slot.
    tokens_by_column = {}
    for column in _SECTION_NAME_TO_COLUMN.values():
        section_tokens = drug_labels_annotated_tokens.at[row_label, column]
        for token in section_tokens:
            token.append(0)
        tokens_by_column[column] = section_tokens

    # NOTE(review): assumes <Section> elements carry an `id` attribute with the
    # same values as Mention/@section - confirm against the corpus. When no id
    # is present the hard-coded S1/S2/S3 order above is used unchanged.
    section_id_to_column = dict(_DEFAULT_SECTION_ID_TO_COLUMN)
    for section in root.findall('./Text/Section'):
        section_id = section.get('id')
        column = _SECTION_NAME_TO_COLUMN.get(section.get('name'))
        if section_id and column:
            section_id_to_column[section_id] = column

    for mention in root.findall('./Mentions/Mention'):
        column = section_id_to_column.get(mention.get('section'))
        if column is None:
            continue
        tag = mention.get('type')
        # 'start' and 'len' are comma-separated, one entry per piece of the mention.
        starts = mention.get('start').split(',')
        lengths = mention.get('len').split(',')
        for start, length in zip(starts, lengths):
            begin = int(start)
            _label_overlapping_tokens(tokens_by_column[column], begin, begin + int(length), tag)

100%|██████████| 101/101 [00:01<00:00, 51.83it/s]


In [0]:
# simplify the tags a bit (we don't need the intervals any more)
def reduceTokens(tokens):
    """Collapse each ``[text, start, end, label]`` token to ``[text, label]`` in place.

    Tokens that do not have exactly four elements (for example ones already
    reduced by an earlier run of this cell) are left untouched, and processing
    continues with the remaining tokens. Returns ``None``; the lists inside
    ``tokens`` are mutated.
    """
    for token in tokens:
        if len(token) != 4:
            continue
        # Replace everything after the text with the label, by position.
        token[1:] = [token[3]]

# reduceTokens mutates the token lists in place, so the Series returned by apply is discarded.
for section_column in ('adverse_reactions', 'warnings_and_precautions', 'boxed_warnings'):
    drug_labels_annotated_tokens[section_column].apply(reduceTokens)

# Last expression of the cell: render the updated frame.
drug_labels_annotated_tokens

Unnamed: 0,file_name,adverse_reactions,warnings_and_precautions,boxed_warnings
0,ADCETRIS.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
1,ADREVIEW.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [EXCERP...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
2,AFINITOR.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
3,AMPYRA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [Becaus...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
4,AMYVID.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [EXCERP...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
...,...,...,...,...
96,YERVOY.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
97,ZERBAXA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
98,ZYDELIG.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
99,ZYKADIA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]


In [0]:
# Bert experiment 

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

Using TensorFlow backend.


'1.4.0'

In [0]:
# Tunable constants for the BERT experiment.
# NOTE(review): presumably the maximum sequence length and the batch size - confirm where they are used.
MAX_LEN, bs = 75, 32

In [0]:
# Count the visible CUDA devices and pick the torch device: CUDA when available, CPU otherwise.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [0]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint (fetched on first use);
# do_lower_case=False is passed so the input text is not lowercased.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




In [0]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split every word with the module-level ``tokenizer`` and repeat its label per piece.

    ``sentence`` and ``text_labels`` are walked in parallel (stopping at the
    shorter one). Returns ``(pieces, piece_labels)``: the flat list of
    sub-tokens and a list of the same length in which each word's label is
    repeated once for every sub-token that word produced.
    """
    pieces, piece_labels = [], []
    for word, label in zip(sentence, text_labels):
        sub_tokens = tokenizer.tokenize(word)
        pieces += sub_tokens
        # one copy of the word's label for each sub-token it was split into
        piece_labels += [label] * len(sub_tokens)
    return pieces, piece_labels

In [0]:
from nltk.tokenize.treebank import TreebankWordDetokenizer


def detokenize_tokens(tokens):
    """Join a list of word tokens back into one string.

    Delegates to ``TreebankWordDetokenizer().detokenize`` and returns its
    result (previously the result was computed but dropped, so callers always
    received ``None``).
    """
    return TreebankWordDetokenizer().detokenize(tokens)

# Last expression of the cell: render the token/label frame for a visual check.
drug_labels_annotated_tokens


Unnamed: 0,file_name,adverse_reactions,warnings_and_precautions,boxed_warnings
0,ADCETRIS.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
1,ADREVIEW.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [EXCERP...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
2,AFINITOR.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
3,AMPYRA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [Becaus...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
4,AMYVID.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [EXCERP...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
...,...,...,...,...
96,YERVOY.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
97,ZERBAXA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
98,ZYDELIG.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...","[[BOXED, 0], [WARNING, 0], [:, 0], [WARNING, 0..."
99,ZYKADIA.xml,"[[6, 0], [ADVERSE, 0], [REACTIONS, 0], [The, 0...","[[5, 0], [WARNINGS, 0], [AND, 0], [PRECAUTIONS...",[]
