This notebook contains the code to preprocess the dataset and train a recurrent neural network to classify epidemiology studies.

In [1]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
import requests
import xml.etree.ElementTree as ET
import pickle

from sklearn.metrics import f1_score, precision_score, recall_score
import spacy
import tensorflow as tf
from tensorflow.keras import metrics
import random
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_lg')
# load scispaCy models
nlpSci = spacy.load("en_ner_bc5cdr_md")
nlpSci2 = spacy.load('en_ner_bionlp13cg_md')

## RNN

Read in curator evaluation set (to exclude from the positive and negative sets)

In [2]:
curator_labeled = pd.read_excel('curator_labeled_dataset.xlsx')
curator_pmids = list(curator_labeled['PMID'])
for i in range(len(curator_pmids)):
    curator_pmids[i] = str(curator_pmids[i])

Set preprocessing parameters

In [3]:
vocab_size = 5000
embedding_dim = 64
max_length = 300
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [4]:
abstracts = []
labels = []
pmids = []

# Read in negative dataset

with open("negative_dataset.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        abstract = row[1]
        # Remove stopwords
        for word in STOPWORDS:
            token = ' ' + word + ' '
            abstract = abstract.replace(token, ' ')
            abstract = abstract.replace(' ', ' ')
        # Only keep the article if the abstract has more than 5 characters
        if len(abstract)>5:
            abstracts.append(abstract)
            labels.append(0)
            pmids.append(row[0])

# Read in positive dataset
            
with open("orphanet_epi_mesh.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        abstract = row[1]
        # Remove stopwords
        for word in STOPWORDS:
            token = ' ' + word + ' '
            abstract = abstract.replace(token, ' ')
            abstract = abstract.replace(' ', ' ')
        # Only keep the article if the abstract has more than 5 chars, and it's not one of the curator articles
        if len(abstract)>5 and row[0] not in curator_pmids:
            abstracts.append(abstract)
            labels.append(1)
            pmids.append(row[0])

print(len(labels), len(abstracts), len(pmids))
combined = list(zip(labels, abstracts, pmids))
random.shuffle(combined)
labels, abstracts, pmids = zip(*combined)
print(len(labels))

in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
in curator
26375 26375 26375
26375


Create a dictionary mapping pmids to indices in the dataset

In [6]:
pmid_to_indices = {}
for i in range(len(pmids)):
    pmid = pmids[i]
    pmid_to_indices[pmid] = i

Split data into train and validation sets

In [7]:
train_num_abs = int(len(labels) * training_portion) # determine size of training set
train_pmids = pmids[0 : train_num_abs]
validation_pmids = pmids[train_num_abs:]

train_abstracts = []
train_labels = []
validation_abstracts = []
validation_labels = []

for pmid in train_pmids:
    i = pmid_to_indices[pmid]
    train_abstracts.append(abstracts[i])
    train_labels.append(labels[i])
        
for pmid in validation_pmids:
    i = pmid_to_indices[pmid]
    validation_abstracts.append(abstracts[i])
    validation_labels.append(labels[i])
        
combined = list(zip(train_abstracts, train_labels, train_pmids))
random.shuffle(combined)
train_abstracts, train_labels, train_pmids = zip(*combined)

In [46]:
print(len(train_labels))
print(len(validation_labels))

21100
5275


Determine positive/negative composition in train set

In [48]:
pos = 0
neg = 0
for l in train_labels:
    if l==1:
        pos+=1
    else:
        neg +=1
print(pos,neg)

1119 19981


In [10]:
train_num_abs

21100

In [11]:
len(pmids)

26375

In [12]:
# Standardize the text of the abstract by replacing all named entities with their entity label
# Eg: 3 patients were seen in a clinic in England --> CARDINAL patients were seen in a clinic in GPE
def standardizeAbstract(abstract):
    doc = nlp(abstract)
    newAbstract = abstract
    # interate through the entities in the abstract
    for e in reversed(doc.ents):
        # replace entities with their label
        if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
            start = e.start_char
            end = start + len(e.text)
            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
    return newAbstract

# Same process but include scientific entities from scispaCy models
def standardizeSciTerms(abstract):
    doc = nlpSci(abstract)
    newAbstract = abstract
    for e in reversed(doc.ents):
        start = e.start_char
        end = start + len(e.text)
        newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
        
    doc = nlpSci2(newAbstract)
    for e in reversed(doc.ents):
        start = e.start_char
        end = start + len(e.text)
        newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
    return newAbstract

In [13]:
train_abstracts_standard = [standardizeAbstract(standardizeSciTerms(abstract)) for abstract in train_abstracts]
val_abstracts_standard = [standardizeAbstract(standardizeSciTerms(abstract)) for abstract in validation_abstracts]

Fit a tokenizer on the data. Uncommon words are replaced with an OOV (out of vocabulary) token

In [14]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_abstracts_standard)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])


{'<OOV>': 1,
 'disease': 2,
 'gene': 3,
 'cardinal': 4,
 'organism': 5,
 'or': 6,
 'product': 7,
 'chemical': 8,
 'tissue': 9,
 'the': 10}

Save the tokenizer to a pickle file

In [15]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Using the tokenizer, convert the texts to matrices, and pad them to a constant length

In [16]:
train_sequences = tokenizer.texts_to_sequences(train_abstracts_standard)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [17]:
validation_sequences = tokenizer.texts_to_sequences(val_abstracts_standard)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(val_abstracts_standard))

5275


In [18]:
training_label_seq = np.array(train_labels)
validation_label_seq = np.array(validation_labels)

Save and reload the training and validation data

In [34]:
np.save('train_padded.npy', train_padded)
np.save('validation_padded.npy', validation_padded)
np.save('train_abstracts.npy', train_abstracts)
np.save('validation_abstracts.npy', validation_abstracts)
np.save('training_label_seq.npy', training_label_seq)
np.save('validation_label_seq.npy', validation_label_seq)
np.save('validation_pmids.npy', validation_pmids)
np.save('train_pmids.npy', train_pmids)

In [8]:
train_padded = np.load('train_padded.npy')
validation_padded = np.load('validation_padded.npy')
train_abstracts = np.load('train_abstracts.npy')
validation_abstracts = np.load('validation_abstracts.npy')
training_label_seq = np.load('training_label_seq.npy')
validation_label_seq = np.load('validation_label_seq.npy')
validation_pmids = np.load('validation_pmids.npy')
train_pmids = np.load('train_pmids.npy')

Print an example preprocessed abstract

In [9]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_abstract(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_abstract(train_padded[5877]))
print('---')
print(train_abstracts[5877])

NameError: name 'word_index' is not defined

Define the model. It consists of: an embedding layer, 2 LSTM layers, one dense layer with ReLU activation, and one dense layer with softmax activation

In [37]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 431,554
Trainable params: 431,554
Non-trainable params: 0
_________________________________________________________________


Compile and train the model. Early stopping halts training if there is an increase in validation loss

In [38]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam'
              , metrics=['accuracy'])
num_epochs = 10
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2, callbacks=[es])


Epoch 1/10
660/660 - 374s - loss: 0.0971 - accuracy: 0.9714 - val_loss: 0.0563 - val_accuracy: 0.9881
Epoch 2/10
660/660 - 369s - loss: 0.0542 - accuracy: 0.9858 - val_loss: 0.0457 - val_accuracy: 0.9898
Epoch 3/10
660/660 - 433s - loss: 0.0341 - accuracy: 0.9911 - val_loss: 0.1121 - val_accuracy: 0.9750
Epoch 00003: early stopping


Save and reload the model architecture and weights

In [39]:
!mkdir -p saved_model
model.save('saved_model/my_model_orphanet_final') 

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved_model/my_model_orphanet_final/assets


In [10]:
loaded_model = tf.keras.models.load_model('saved_model/my_model_orphanet_final')

Determine predictions on the validation set and calculate precision, recall, F1 score, and AUC

In [11]:
y_pred1 = loaded_model.predict(validation_padded)
y_pred = np.argmax(y_pred1, axis=1)

print(precision_score(validation_label_seq, y_pred , average="macro"))
print(recall_score(validation_label_seq, y_pred , average="macro"))
print(f1_score(validation_label_seq, y_pred , average="macro"))

0.8459980286087543
0.9373757336743485
0.8855156874583201


In [12]:
m = tf.keras.metrics.AUC()
_ = m.update_state(validation_label_seq, y_pred)
m.result().numpy()

0.9373757

In [13]:
pmid_to_indices_val = {}
for i in range(len(validation_pmids)):
    pmid = validation_pmids[i]
    pmid_to_indices_val[pmid] = i

Print example results

In [25]:
for i in range(1000):
    if y_pred[i] == 1 and validation_label_seq[i] == 0: # false positives
        print('\nprediction:',y_pred[i], y_pred1[i])
        print('label:',validation_label_seq[i])
        print('pmid:',validation_pmids[i])
        print(validation_abstracts[i])


prediction: 1 [0.33761504 0.6623849 ]
label: 0
pmid: 32471399
BACKGROUND:Ramadan month within Islamic lunar calendar Muslims required fast (abstain food drink) daytime (from sunrise sunset) entire month. Due established connection fasting dehydration acute sialadenitis, aim study determine higher frequency sialadenitis among Muslim population Ramadan months year. METHODS:We conducted retrospective study using medical records 120 Muslim patients admitted emergency room (ER) diagnosed acute sialadenitis 5-year period Baruch Padeh Medical Center, Poriya, St. Vincent de Paul (French) Hospital, Nazareth, located Israel. The study group Muslim patients, aforementioned diagnosis, admitted Ramadan, control group included patients diagnosed sialadenitis rest year. We analyzed overall admission frequency well descriptive diagnostic data, including age, sex, gland involved several blood test results. RESULTS:During month Ramadan, admission Muslims diagnosis acute sialadenitis double months year 