In [1]:
# Gensim
import gensim
from gensim.models import Word2Vec

import os

import pandas as pd
import numpy as np

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from multiprocessing import cpu_count

# scikit learn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

# keras
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# read dataframes of sentences and entities
sentences_df = pd.read_csv(SENTENCE_PATH)
entities_df = pd.read_csv(ENTITY_PATH)

In [3]:
print('Entities dataframe')
entities_df.head()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d157.s0.e0,cimetidine,34-43,drug
1,DDI-DrugBank.d157.s0.e1,warfarin,49-56,drug
2,DDI-DrugBank.d157.s0.e2,Femara,97-102,brand
3,DDI-DrugBank.d157.s1.e0,Femara,48-53,brand
4,DDI-DrugBank.d157.s1.e1,tamoxifen,59-67,drug


In [4]:
print('Sentences dataframe')
sentences_df.head()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d157.s0,Clinical interaction studies with cimetidine a...
1,DDI-DrugBank.d157.s1,(See CLINICAL PHARMACOLOGY) Coadministration o...
2,DDI-DrugBank.d157.s2,There is no clinical experience to date on the...
3,DDI-DrugBank.d157.s3,Drug/Laboratory Test-Interactions None observed.
4,DDI-DrugBank.d110.s0,The administration of local anesthetic solutio...


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }

In [5]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

In [6]:
# get sentences containing at least an entity 
sentences = [sentences_df[sentences_df.sentenceID == sentenceID]['sentenceText'].values[0] 
             for sentenceID in sentenceIDs]

# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))

In [None]:
### WTF ### happens iterating from sentences_df instead of indexing by entities 
dd = sentences_df['sentenceText']
for i, sentence in zip(range(len(dd)), dd.values):
    if not isinstance(sentence, str):
        print i, True, sentence
        break # remove this to see all

In [94]:
# just check nothing is wrong (it should not print anything)
for s in sentences:
    if not isinstance(s, str):
        print True

### Tokenize sentences

In [7]:
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
tokenized_sentences[0]

['As',
 'the',
 'primary',
 'effect',
 'of',
 'adenosine',
 'is',
 'to',
 'decrease',
 'conduction',
 'through',
 'the',
 'A-V',
 'node',
 ',',
 'higher',
 'degrees',
 'of',
 'heart',
 'block',
 'may',
 'be',
 'produced',
 'in',
 'the',
 'presence',
 'of',
 'carbamazepine',
 '.']

In [64]:
### EXAMPLE STEM + POS ####
# POS could differ slightly when applied to the stemmed version or not
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('stemmed version:\n')
stemmed_s = [stemmer.stem(w) for w in s]
print (stemmed_s)

print ('\noriginal pos tags:\n')
print(pos_tag(s))

print ('\nstemmed pos tags:\n')
print(pos_tag(stemmed_s))

stemmed version:

[u'interact', u'between', u'cimetidin', u'and', u'warfarin', u'could', 'be', u'danger']

original pos tags:

[('interaction', 'NN'), ('between', 'IN'), ('cimetidine', 'NN'), ('and', 'CC'), ('warfarin', 'NN'), ('could', 'MD'), ('be', 'VB'), ('dangerous', 'JJ')]

stemmed pos tags:

[(u'interact', 'NN'), (u'between', 'IN'), (u'cimetidin', 'NN'), (u'and', 'CC'), (u'warfarin', 'NN'), (u'could', 'MD'), ('be', 'VB'), (u'danger', 'JJR')]


### POS tag

In [8]:
tokenized_sentences_pos = pos_tag_sents(tokenized_sentences, tagset=None) # tagset = None, 'universal', 'wsj', 'brown'

# concatenate the part of speach to each word (e.g. cat_NN)
tokenized_sentences_pos = [ [w + '_' + pos for w, pos in s ] for s in tokenized_sentences_pos]
tokenized_sentences_pos[0]

['As_IN',
 'the_DT',
 'primary_JJ',
 'effect_NN',
 'of_IN',
 'adenosine_NN',
 'is_VBZ',
 'to_TO',
 'decrease_VB',
 'conduction_NN',
 'through_IN',
 'the_DT',
 'A-V_NNP',
 'node_NN',
 ',_,',
 'higher_JJR',
 'degrees_NNS',
 'of_IN',
 'heart_NN',
 'block_NN',
 'may_MD',
 'be_VB',
 'produced_VBN',
 'in_IN',
 'the_DT',
 'presence_NN',
 'of_IN',
 'carbamazepine_NN',
 '._.']

### Word2Vec

In [10]:
vector_size = 20
model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=5, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

('latest loss:', 0.0)


In [11]:
# save embeddings and delete model
model.save("../word_vectors")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

In [14]:
print word_vectors["conduction_NN"]

[-0.31187963  0.03976768  0.1937149  -0.35644537  0.10583679 -0.35927674
  0.27302787 -0.10476804 -0.59796894  0.44259581  0.5399918   0.3584716
  0.22442795 -0.21453314  0.16520844  0.08902788 -0.31038392  0.13547154
  0.60578221  0.29021123]


#### create dataset

In [12]:
# create X_train, Y_train
X_train = np.array([]).reshape(0,vector_size)
Y_train = np.array([])

for sentenceID, labels in label_dict.iteritems():
    sentence = sentences_df[sentences_df.sentenceID == sentenceID]['sentenceText'].values[0]
    tok_sentence = word_tokenize(sentence)
    tok_sentence_pos = [ word + '_' + pos for word, pos in pos_tag(tok_sentence, tagset=None)]

    for word, label in zip(tok_sentence_pos, labels):
        word_vector = word_vectors[word]
        X_train = np.vstack((X_train, word_vector))
        Y_train = np.append(Y_train, label)       

In [15]:
print (X_train.shape)
print (Y_train.shape)

(128806, 20)
(128806,)


### One-hot encoding of labels
Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$

In [20]:
# encode class values as integers = B-I-O -> 0-1-2
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
Y_train = encoded_Y
# convert integers to one-hot encoding
#Y_train = np_utils.to_categorical(encoded_Y) # SVM does not need one-hot encoding

# SVM

In [21]:
from sklearn import svm

model = svm.SVC(kernel='rbf', C=1.0, class_weight=None, gamma='auto', tol=0.001, random_state=None)

In [22]:
model.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
model.score(X_train, Y_train) # todo put X_val, Y_val or X_test, Y_test

0.8745244786733537

In [14]:
help(svm.SVC)

Help on class SVC in module sklearn.svm.classes:

class SVC(sklearn.svm.base.BaseSVC)
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time complexity
 |  is more than quadratic with the number of samples which makes it hard
 |  to scale to dataset with more than a couple of 10000 samples.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  other, see the corresponding section in the narrative documentation:
 |  :ref:`svm_kernels`.
 |  
 |  Read more in the :ref:`User Guide <svm_classification>`.
 |  
 |  Parameters
 |  ----------
 |  C : float, optional (default=1.0)
 |      Penalty parameter C of the error term.
 |  
 |  kernel : string, optional (default='rbf')
 |       Specifies the kernel type to be used in the algorithm.
 |       It must be one of 'linear', 'poly

# [ignore] Define the network

In [43]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from sklearn import preprocessing
from keras.optimizers import *
from keras.initializers import *

Using TensorFlow backend.


In [None]:
# TODO to use a CNN (tedious)
# stack vectors of a sentence to form a matrix
# 0-pad matrix with as many rows needed to match maximum lenght of a sentence
# -1-pad the labels accordingly or 2-pad
# define loss function to ignore missclassification of padding

In [68]:
num_inputs = X_train.shape[1] # size of a vector
num_outputs = 3 # b-i-o tags

model = Sequential()

model.add(Dense(units=64, input_shape=(num_inputs,), activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=num_outputs, activation='softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 64)                1344      
_________________________________________________________________
dense_19 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 99        
Total params: 3,523
Trainable params: 3,523
Non-trainable params: 0
_________________________________________________________________


In [69]:
epochs = 10

for epoch in range(epochs):
    for sentence in sentences: # TODO: do this better
        tokens = word_tokenize(sentence)
        batch_size = len(tokens)
        model.fit(X_train, Y_train, epochs=1, shuffle=False, verbose=1, batch_size=batch_size, initial_epoch=epoch)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1

KeyboardInterrupt: 