In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [2]:
import os
import re

datadir = './abstrct/AbstRCT_corpus/data/train/neoplasm_train'
annotated = [f for f in os.listdir(datadir) if f.endswith('.ann')]

testdir = './abstrct/AbstRCT_corpus/data/test/neoplasm_test'
testannotated = [f for f in os.listdir(testdir) if f.endswith('.ann')]

In [3]:
def extract_annotated(fpath):
    res = []
    with open(fpath, 'r') as infile:
        for row in infile.readlines():
            row = row.strip()
            if re.match('^T\d', row):
                name, annotation, text = row.split('\t')
                fname = os.path.basename(fpath).replace('.ann', '')    
                name = f'{fname}-{name}'
                is_arg, start, end = annotation.split()
                start = int(start)
                end = int(end)
                res.append((name, is_arg, start, end, text))
    return res

In [4]:
from collections import defaultdict

train_ann_dict = defaultdict(list)
test_ann_dict = defaultdict(list)

for f in annotated:
    fname = f.replace('.ann', '')
    filepath = os.path.join(datadir, f)
    train_ann_dict[fname] = extract_annotated(filepath)
    
for f in testannotated:
    fname = f.replace('.ann', '')
    filepath = os.path.join(testdir, f)
    test_ann_dict[fname] = extract_annotated(filepath)

In [5]:
train_data = []
for k, list_ in train_ann_dict.items():
    for tup in list_:
        train_data.append((tup[1], tup[4]))

test_data = []
for k, list_ in test_ann_dict.items():
    for tup in list_:
        test_data.append((tup[1], tup[4]))

print(len(train_data))
print(len(test_data))

2267
686


In [6]:
train_data = [(tup[1], 0) if tup[0] == 'Premise' else (tup[1], 1) for tup in train_data]

In [7]:
test_data = [(tup[1], 0) if tup[0] == 'Premise' else (tup[1], 1) for tup in test_data]

In [8]:
train = pd.DataFrame(train_data, columns=['text', 'label'])
test = pd.DataFrame(test_data, columns=['text', 'label'])

In [10]:
test.head()

Unnamed: 0,text,label
0,Patients reported similar pain and satisfactio...,1
1,Bolus IV administration of G-CSF results in lo...,1
2,The mean time to neutropenia resolution was lo...,0
3,Longer neutropenia duration was observed in al...,0
4,except for patients undergoing autologous HCT.,0


In [15]:
docs = list(train['text'])

In [17]:
labels = list(train['label'])

In [37]:
test_docs = list(test['text'])
test_labels = list(test['label'])

In [18]:
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(len(encoded_docs))

2267


In [40]:
t = Tokenizer()
t.fit_on_texts(test_docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
test_encoded_docs = t.texts_to_sequences(test_docs)
print(len(test_encoded_docs))

686


In [20]:
encoded_docs[0]

[21,
 25,
 131,
 564,
 192,
 4,
 1828,
 10,
 42,
 246,
 79,
 3,
 281,
 75,
 4,
 2474,
 42,
 323,
 177,
 79,
 57,
 845,
 8,
 348,
 362,
 1244,
 5,
 6,
 475,
 3,
 139,
 25,
 4,
 2475,
 42,
 246,
 79,
 3,
 707,
 19,
 4,
 2476,
 42,
 323,
 18,
 177,
 57,
 65,
 349,
 2477,
 5,
 6,
 1245]

In [21]:
max([len(x) for x in docs])

669

In [22]:
min([len(x) for x in docs])

21

In [23]:
max_length = 500
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

2267


In [41]:
test_padded_docs = pad_sequences(test_encoded_docs, maxlen=max_length, padding='post')
print(len(test_padded_docs))

686


In [24]:
padded_docs[0]

array([  21,   25,  131,  564,  192,    4, 1828,   10,   42,  246,   79,
          3,  281,   75,    4, 2474,   42,  323,  177,   79,   57,  845,
          8,  348,  362, 1244,    5,    6,  475,    3,  139,   25,    4,
       2475,   42,  246,   79,    3,  707,   19,    4, 2476,   42,  323,
         18,  177,   57,   65,  349, 2477,    5,    6, 1245,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [26]:
embeddings_index = dict()
f = open('./glove.6B.100d.txt')  # wget http://nlp.stanford.edu/data/glove.6B.zip
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [27]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [33]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=500, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          422200    
_________________________________________________________________
flatten_1 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 50001     
Total params: 472,201
Trainable params: 50,001
Non-trainable params: 422,200
_________________________________________________________________
None


In [35]:
model.fit(padded_docs, np.array(labels), epochs=50, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7f8e5c183e20>

In [42]:
loss, accuracy = model.evaluate(test_padded_docs, np.array(test_labels), verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 58.892131
