In [None]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
import requests
import xml.etree.ElementTree as ET
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
import spacy
# Load the spaCy pipelines once; the sentence-standardisation helpers later in
# the notebook call them as module-level globals.
nlp = spacy.load('en_core_web_lg')  # general English pipeline, used by standardizeSent
nlpSci = spacy.load("en_ner_bc5cdr_md")  # first NER pass in standardizeSciTerms (presumably the scispaCy BC5CDR model — confirm)
nlpSci2 = spacy.load('en_ner_bionlp13cg_md')  # second NER pass in standardizeSciTerms (presumably the scispaCy BioNLP13CG model — confirm)

## Prepare sentence datasets

In [None]:
# Sentence-level dataset: one row per (label, pmid, sentence); rows with any
# missing field are discarded.
# NOTE(review): no row is skipped here, whereas the csv.reader pass over this
# same file later in the notebook discards its first row — confirm whether the
# file starts with a header line.
filename = 'epidemiology_classifications_sents.csv'
df = pd.read_csv(
    filename,
    header=None,
    skiprows=None,
    names=['label', 'pmid', 'sent'],
).dropna()

In [None]:
# Shuffle all rows (frac=1 samples the whole frame) and renumber the index from 0.
# No random_state is passed, so the order differs from run to run.
df_shuffled = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Abstract-level dataset: one row per (label, pmid, abstract). The first line of
# the file is skipped and rows with any missing field are discarded.
filename = 'epidemiology_classifications.csv'
df_abs = pd.read_csv(
    filename,
    header=None,
    skiprows=[0],
    names=['label', 'pmid', 'abs'],
).dropna()

## RNN

In [None]:
import tensorflow as tf
from tensorflow.keras import metrics
import random
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
# NLTK's English stop-word list, materialised as a set; the preprocessing loops
# below iterate over it to strip these words from each sentence.
STOPWORDS = set(stopwords.words('english'))

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
vocab_size = 5000        # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 64       # embedding width; also reused as the LSTM and hidden Dense unit count
max_length = 200         # maxlen handed to pad_sequences
trunc_type = 'post'      # `truncating` argument of pad_sequences
padding_type = 'post'    # `padding` argument of pad_sequences
oov_tok = '<OOV>'        # out-of-vocabulary token given to the Tokenizer
training_portion = .8    # fraction of PMIDs (abstracts) assigned to the training split

In [None]:
# Read the sentence-level CSV, strip stop words from every sentence and build
# three parallel sequences: labels (0/1), sentences and PMIDs.
sents = []
labels = []
pmids = []

with open("epidemiology_classifications_sents.csv", 'r') as csvfile:
    rows = csv.reader(csvfile, delimiter=',')
    next(rows)  # the first row is discarded
    for row in rows:
        sent = row[2]
        for stop_word in STOPWORDS:
            # Only occurrences surrounded by spaces on both sides are removed.
            sent = sent.replace(' ' + stop_word + ' ', ' ')
            # NOTE(review): as written this swaps a single space for a single
            # space; it may have been meant to collapse double spaces — confirm.
            sent = sent.replace(' ', ' ')
        if len(sent) <= 5:
            # Sentences of five characters or fewer are dropped.
            continue
        sents.append(sent)
        labels.append(int(row[0] == 'True'))
        pmids.append(row[1])

# Shuffle the three sequences together so they stay aligned; after unpacking
# they are tuples rather than lists.
combined = list(zip(labels, sents, pmids))
random.shuffle(combined)
labels, sents, pmids = zip(*combined)
print(len(labels))

In [None]:
# Map each PMID to the list of positions (into sents/labels/pmids) of its sentences.
pmid_to_indices = {}
for i, pmid in enumerate(pmids):
    pmid_to_indices.setdefault(pmid, []).append(i)

In [None]:
# Split by abstract (PMID) rather than by sentence, so that every sentence of a
# given abstract ends up on the same side of the train/validation boundary.
pmid_list = list(pmid_to_indices.keys())
train_num_abs = int(len(pmid_to_indices) * training_portion)
train_pmid_list = pmid_list[:train_num_abs]
validation_pmid_list = pmid_list[train_num_abs:]

# Validation data stays in PMID order, as three parallel lists.
validation_sents, validation_labels, validation_pmids = [], [], []
for pmid in validation_pmid_list:
    indices = pmid_to_indices[pmid]
    validation_sents.extend(sents[i] for i in indices)
    validation_labels.extend(labels[i] for i in indices)
    validation_pmids.extend([pmid] * len(indices))

# Training data is gathered as (sentence, label, pmid) triples, shuffled as a
# unit, and then unpacked into three aligned tuples.
combined = [
    (sents[i], labels[i], pmid)
    for pmid in train_pmid_list
    for i in pmid_to_indices[pmid]
]
random.shuffle(combined)
train_sents, train_labels, train_pmids = zip(*combined)

In [None]:
# Sanity check: number of sentences in the training and validation splits.
print(len(train_labels))
print(len(validation_labels))

In [None]:
# Number of abstracts (PMIDs) in the training and validation splits.
len(train_pmid_list), len(validation_pmid_list)

In [None]:
# Class balance of the validation split: labels equal to 1 are positives,
# every other label counts as a negative.
pos = sum(1 for label in validation_labels if label == 1)
neg = len(validation_labels) - pos
print(pos,neg)

In [None]:
def standardizeSent(sent):
    """Replace selected named entities in ``sent`` with their entity label.

    The sentence is run through the module-level ``nlp`` pipeline. Every entity
    labelled PERCENT, CARDINAL, GPE, LOC, DATE, TIME, QUANTITY or ORDINAL has
    its surface text replaced by the label itself; all other text is kept.

    Args:
        sent: The sentence to standardise.

    Returns:
        A new string with the matching entities replaced by their labels.
    """
    replaceable_labels = {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}
    result = sent
    # Go right-to-left so the character offsets of entities not yet processed
    # are unaffected by the splices already made.
    for ent in reversed(nlp(sent).ents):
        if ent.label_ not in replaceable_labels:
            continue
        begin = ent.start_char
        stop = begin + len(ent.text)
        result = result[:begin] + ent.label_ + result[stop:]
    return result

def standardizeSciTerms(sent):
    """Replace entities found by the two scientific NER pipelines with their labels.

    The sentence is first processed by ``nlpSci``; every entity it finds is
    replaced by its label. The rewritten sentence is then processed the same
    way by ``nlpSci2``.

    Args:
        sent: The sentence to standardise.

    Returns:
        A new string with all detected entities replaced by their labels.
    """
    newSent = sent
    for pipeline in (nlpSci, nlpSci2):
        # Each pass works on the output of the previous one.
        entities = pipeline(newSent).ents
        # Right-to-left so offsets of remaining entities stay valid.
        for ent in reversed(entities):
            begin = ent.start_char
            stop = begin + len(ent.text)
            newSent = newSent[:begin] + ent.label_ + newSent[stop:]
    return newSent

In [None]:
# Standardise every sentence in both splits: scientific entities are replaced
# first (standardizeSciTerms), then the general entities (standardizeSent).
train_sents_standard = [standardizeSent(standardizeSciTerms(sent)) for sent in train_sents]
val_sents_standard = [standardizeSent(standardizeSciTerms(sent)) for sent in validation_sents]

In [None]:
# Build the tokenizer with vocab_size as num_words and oov_tok as the
# out-of-vocabulary token, fitting it on the training sentences only.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sents_standard)
word_index = tokenizer.word_index
# Show the first ten (word, index) entries as a quick check.
dict(list(word_index.items())[0:10])


In [None]:
# Persist the fitted tokenizer; the "Predict on one example" cell reloads it
# from this same file.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Convert the training sentences to integer sequences, then pad/truncate each
# one to max_length using the configured padding and truncation modes.
train_sequences = tokenizer.texts_to_sequences(train_sents_standard)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Same conversion for the validation sentences, using the tokenizer fitted on
# the training split.
validation_sequences = tokenizer.texts_to_sequences(val_sents_standard)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Number of validation sentences.
print(len(val_sents_standard))

In [None]:
# The labels are already integers (0/1), so they are wrapped in NumPy arrays
# directly. The trailing comments keep the earlier label-tokenizer variant.
training_label_seq = np.array(train_labels) #np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(validation_labels) #np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [None]:
# Inverse of the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sent(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are not present in ``reverse_word_index`` are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(token_id, '?') for token_id in text)

# Compare one encoded-and-decoded training example with its source sentence.
print(decode_sent(train_padded[1]))
print('---')
print(train_sents[1])

In [None]:
# Sentence classifier: token ids -> embeddings -> bidirectional LSTM ->
# hidden dense layer -> softmax over the two classes.
model = tf.keras.Sequential([
    # Embedding table with vocab_size rows and embedding_dim columns.
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # LSTM with embedding_dim units, wrapped so it runs in both directions.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    # Two outputs; downstream cells read index 1 as the epidemiology probability.
    tf.keras.layers.Dense(2, activation='softmax')
])
model.summary()

In [None]:
# Sparse categorical cross-entropy matches the integer 0/1 labels paired with
# the two-way softmax output.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam'
              , metrics=['accuracy'])
num_epochs = 10
# Early stopping watches the validation loss; no patience value is given, so
# the callback's default applies.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2, callbacks=[es])


In [None]:
# Create the output directory (shell command) and save the trained model; the
# "Predict on one example" cell reloads it from this path.
!mkdir -p saved_model
model.save('saved_model/my_model') 

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# y_pred1 holds the per-class softmax outputs; y_pred is the index of the
# larger of the two, i.e. the predicted class.
y_pred1 = model.predict(validation_padded)
y_pred = np.argmax(y_pred1, axis=1)

# Macro-averaged precision, recall and F1 on the validation split, in that order.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(validation_label_seq, y_pred, average="macro"))

In [None]:
# ROC AUC on the validation split.
# AUC is a ranking metric and must be fed continuous scores. Passing the
# argmax'd hard labels (y_pred) collapses the ROC curve to a single operating
# point, so the positive-class probability (column 1 of the softmax output)
# is used instead.
m = tf.keras.metrics.AUC()
_ = m.update_state(validation_label_seq, y_pred1[:, 1])
m.result().numpy()

In [None]:
# Map each validation PMID to the positions of its sentences within the
# validation arrays (validation_sents / validation_label_seq / y_pred).
pmid_to_indices_val = {}
for i, pmid in enumerate(validation_pmids):
    pmid_to_indices_val.setdefault(pmid, []).append(i)

In [None]:
# Print the false negatives (predicted 0, labelled 1) among the first 1000
# validation sentences. The bound is clamped to the number of predictions so
# the cell does not raise IndexError on a validation set smaller than 1000.
for i in range(min(1000, len(y_pred))):
    if y_pred[i]==0 and validation_label_seq[i]==1:
        print('\nprediction:',y_pred[i], y_pred1[i])
        print('label:',validation_label_seq[i])
        print(validation_sents[i])

In [None]:
# For every validation abstract, print the abstract text followed by the
# prediction, label and text of each of its sentences, then the number of
# sentences predicted as epidemiology.
for pmid in pmid_to_indices_val:
    print('\n',pmid)
    epi_count = 0
    abstract_rows = df_abs.loc[df_abs['pmid'] == int(pmid), 'abs']
    if len(abstract_rows) == 1:
        print(abstract_rows.item())
    else:
        # Series.item() raises ValueError unless exactly one row matches.
        # df_abs had rows with missing values dropped, so a PMID present in
        # the sentence file may have no (or a duplicated) abstract here;
        # report that instead of aborting the whole loop.
        print('[abstract not shown: {} matching rows in df_abs]'.format(len(abstract_rows)))
    for i in pmid_to_indices_val[pmid]:
        if y_pred[i] == 1:
            epi_count +=1
        print('\nprediction:',y_pred[i], y_pred1[i])
        print('label:',validation_label_seq[i])
        print(validation_sents[i])
    print('NUMBER OF EPI SENTS:',epi_count)

## Predict on one example

In [None]:
# Fetch one abstract from Europe PMC by PMID, classify each of its sentences
# with the saved model, and label the abstract as epidemiology if at least one
# sentence is predicted as class 1.
# NOTE(review): this cell rebinds `pmid`, `sents`, `y_pred` and `y_pred1`,
# which earlier cells also use; re-run the notebook from the top before going
# back to the training/evaluation cells.
pmid = 26795590
url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'+str(pmid)+'&resulttype=core'
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page as XML.
r = requests.get(url, timeout=30)
r.raise_for_status()
root = ET.fromstring(r.content)
new_model = tf.keras.models.load_model('saved_model/my_model')
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pickle
# that was written by this notebook.
with open('tokenizer.pickle', 'rb') as handle:
    new_tokenizer = pickle.load(handle)
abstract = ''
isEpi = False
for child in root.iter('abstractText'):
    # An empty <abstractText/> element has text None, which nlp() cannot
    # process; keep the last non-empty text only.
    if child.text:
        abstract = child.text
if not abstract:
    print('Warning: no abstract text found for PMID', pmid)
doc = nlp(abstract)
sents = [sent.text for sent in doc.sents]

for sent in sents:
    # Same stop-word stripping as applied to the training data.
    for word in STOPWORDS:
        token = ' ' + word + ' '
        sent = sent.replace(token, ' ')
        sent = sent.replace(' ', ' ')
    if len(sent)>5:
        sent_standard = [standardizeSent(standardizeSciTerms(sent))]
        sequence = new_tokenizer.texts_to_sequences(sent_standard)
        padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
        y_pred1 = new_model.predict(padded)
        y_pred = np.argmax(y_pred1, axis=1)
        # One sentence per batch, so the prediction is the single element.
        if y_pred[0] == 1:
            isEpi = True
        print(sent)
        print('Probability of epidemiology:', round(y_pred1[0][1],3))

if isEpi:
    print("Abstract classification: Epidemiology")
else:
    print("Abstract classification: Not epidemiology")