In [1]:
# Gensim
import gensim
from gensim.models import Word2Vec

import os

import pandas as pd
import numpy as np

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from multiprocessing import cpu_count

# scikit learn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

In [2]:
# read dataframes of sentences and entities
sentences_df = pd.read_csv(SENTENCE_PATH)
entities_df = pd.read_csv(ENTITY_PATH)

In [3]:
print('Entities dataframe')
entities_df.head()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d157.s0.e0,cimetidine,34-43,drug
1,DDI-DrugBank.d157.s0.e1,warfarin,49-56,drug
2,DDI-DrugBank.d157.s0.e2,Femara,97-102,brand
3,DDI-DrugBank.d157.s1.e0,Femara,48-53,brand
4,DDI-DrugBank.d157.s1.e1,tamoxifen,59-67,drug


In [4]:
print('Sentences dataframe')
sentences_df.head()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d157.s0,Clinical interaction studies with cimetidine a...
1,DDI-DrugBank.d157.s1,(See CLINICAL PHARMACOLOGY) Coadministration o...
2,DDI-DrugBank.d157.s2,There is no clinical experience to date on the...
3,DDI-DrugBank.d157.s3,Drug/Laboratory Test-Interactions None observed.
4,DDI-DrugBank.d110.s0,The administration of local anesthetic solutio...


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }

In [8]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

In [37]:
# get sentences containing at least an entity
#entities = entities_df['entityID']
#sentences = [sentences_df[sentences_df.sentenceID == get_sentenceID(entity)]['sentenceText'].values[0] 
#             for entity in entities]    

sentences = [sentences_df[sentences_df.sentenceID == sentenceID]['sentenceText'].values[0] 
             for sentenceID in sentenceIDs]

# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))
sentences

['As the primary effect of adenosine is to decrease conduction through the A-V node, higher degrees of heart block may be produced in the presence of carbamazepine.',
 'In patients receiving Nalfon and a steroid concomitantly, any reduction in steroid dosage should be gradual in order to avoid the possible complications of sudden steroid withdrawal.',
 'When the STADOL NS was administered 30 minutes after the sumatriptan nasal spray, the AUC of butorphanol increased 11% and Cmax decreased 18%.',
 'Cholecystokinin octapeptide is a necessary factor for realization of this action of neuroleptics.',
 'However, when addiction is defined as compulsion, loss of control and continued use in spite of adverse consequences, cocaine drug hunger can be seen as an agent of addictive disease. ',
 'Acute effect of different antidepressants on glycemia in diabetic and non-diabetic rats.\n',
 'oxyphenbutazone;',
 'Agents Affecting Sympathetic Activity The sympathetic nervous system may be especially imp

In [None]:
### WTF ### happens iterating from sentences_df instead of indexing by entities 
dd = sentences_df['sentenceText']
for i, sentence in zip(range(len(dd)), dd.values):
    if not isinstance(sentence, str):
        print i, True, sentence
        break # remove this to see all

In [94]:
# just check nothing is wrong (it should not print anything)
for s in sentences:
    if not isinstance(s, str):
        print True

### Tokenize sentences

In [11]:
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
tokenized_sentences[0]

['As',
 'the',
 'primary',
 'effect',
 'of',
 'adenosine',
 'is',
 'to',
 'decrease',
 'conduction',
 'through',
 'the',
 'A-V',
 'node',
 ',',
 'higher',
 'degrees',
 'of',
 'heart',
 'block',
 'may',
 'be',
 'produced',
 'in',
 'the',
 'presence',
 'of',
 'carbamazepine',
 '.']

In [64]:
### EXAMPLE STEM + POS ####
# POS could differ slightly when applied to the stemmed version or not
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('stemmed version:\n')
stemmed_s = [stemmer.stem(w) for w in s]
print (stemmed_s)

print ('\noriginal pos tags:\n')
print(pos_tag(s))

print ('\nstemmed pos tags:\n')
print(pos_tag(stemmed_s))

stemmed version:

[u'interact', u'between', u'cimetidin', u'and', u'warfarin', u'could', 'be', u'danger']

original pos tags:

[('interaction', 'NN'), ('between', 'IN'), ('cimetidine', 'NN'), ('and', 'CC'), ('warfarin', 'NN'), ('could', 'MD'), ('be', 'VB'), ('dangerous', 'JJ')]

stemmed pos tags:

[(u'interact', 'NN'), (u'between', 'IN'), (u'cimetidin', 'NN'), (u'and', 'CC'), (u'warfarin', 'NN'), (u'could', 'MD'), ('be', 'VB'), (u'danger', 'JJR')]


### POS tag

In [12]:
tokenized_sentences_pos = pos_tag_sents(tokenized_sentences[:2], tagset=None) # tagset = None, 'universal', 'wsj', 'brown'

# concatenate the part of speach to each word (e.g. cat_NN)
tokenized_sentences_pos = [ [w + '_' + pos for w, pos in s ] for s in tokenized_sentences_pos]
tokenized_sentences_pos[:2]

[['As_IN',
  'the_DT',
  'primary_JJ',
  'effect_NN',
  'of_IN',
  'adenosine_NN',
  'is_VBZ',
  'to_TO',
  'decrease_VB',
  'conduction_NN',
  'through_IN',
  'the_DT',
  'A-V_NNP',
  'node_NN',
  ',_,',
  'higher_JJR',
  'degrees_NNS',
  'of_IN',
  'heart_NN',
  'block_NN',
  'may_MD',
  'be_VB',
  'produced_VBN',
  'in_IN',
  'the_DT',
  'presence_NN',
  'of_IN',
  'carbamazepine_NN',
  '._.'],
 ['In_IN',
  'patients_NNS',
  'receiving_VBG',
  'Nalfon_NNP',
  'and_CC',
  'a_DT',
  'steroid_NN',
  'concomitantly_RB',
  ',_,',
  'any_DT',
  'reduction_NN',
  'in_IN',
  'steroid_JJ',
  'dosage_NN',
  'should_MD',
  'be_VB',
  'gradual_JJ',
  'in_IN',
  'order_NN',
  'to_TO',
  'avoid_VB',
  'the_DT',
  'possible_JJ',
  'complications_NNS',
  'of_IN',
  'sudden_JJ',
  'steroid_NN',
  'withdrawal_NN',
  '._.']]

### Word2Vec

In [31]:
model = Word2Vec(tokenized_sentences_pos, size=20, window=5, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

('latest loss:', 0.0)


In [32]:
# save embeddings and delete model
model.save("../word_vectors")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

In [35]:
print word_vectors["conduction_NN"]

[ 0.00978385 -0.00398002  0.00176986  0.01003654  0.01661979 -0.00537594
  0.015536    0.00830257  0.02409437  0.02358754 -0.01696781  0.01079735
  0.01331737  0.0105362   0.00786868  0.01158823  0.01171584  0.02198743
  0.0171968   0.01311889]


In [36]:
# create X_train, Y_train
X_train = []
Y_train = []

for sentenceID, labels in label_dict.iteritems():
    sentence = sentences_df[sentences_df.sentenceID == sentenceID]['sentenceText'].values[0]
    tok_sentence = word_tokenize(sentence)
    tok_sentence_pos = [ word + '_' + pos for word, pos in pos_tag(tok_sentence, tagset=None)]
# Concurrent use of phenothiazines may antagonize the anorectic effect of diethylpropion.
# ['O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
    print tok_sentence_pos
    for word, label in zip(tok_sentence_pos, labels):
        word_vector = word_vectors[word]
        X_train.append(word_vector)
        Y_train.append(label)       

['Concurrent_NNP', 'use_NN', 'of_IN', 'phenothiazines_NNS', 'may_MD', 'antagonize_VB', 'the_DT', 'anorectic_JJ', 'effect_NN', 'of_IN', 'diethylpropion_NN', '._.']


KeyError: "word 'Concurrent_NNP' not in vocabulary"

In [38]:
#word_vectors['interaction']
#print(word_vectors.vocab.keys())