In [2]:
# Gensim
import gensim
from gensim.models import Word2Vec

# to make nbs importable
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np

from multiprocessing import cpu_count

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents



### Reading sentences from structured data frames

In [3]:
# read dataframes of sentences and entities

# TRAIN SET
sentences_df_train = pd.read_csv(SENTENCE_PATH_train)
entities_df_train = pd.read_csv(ENTITY_PATH_train)

#TEST SET
sentences_df_test1 = pd.read_csv(SENTENCE_PATH_test1)
entities_df_test1 = pd.read_csv(ENTITY_PATH_test1)

#TEST2 SET
sentences_df_test2 = pd.read_csv(SENTENCE_PATH_test2)
entities_df_test2 = pd.read_csv(ENTITY_PATH_test2)

print(len(sentences_df_train), len(sentences_df_test1), len(sentences_df_test2))

6832 664 1299


In [4]:
# Concatenating training and test data for the word2vec training!
sentences_df = pd.concat([sentences_df_train,
                          sentences_df_test1,
                          sentences_df_test2]).drop_duplicates().reset_index(drop=True)

entities_df = pd.concat([entities_df_train,
                         entities_df_test1,
                         entities_df_test2]).drop_duplicates().reset_index(drop=True)

### Sentences dataframe

In [5]:
print('Sentences dataframe')
sentences_df_train.head()
#sentences_df_test2.info()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d157.s0,Clinical interaction studies with cimetidine a...
1,DDI-DrugBank.d157.s1,(See CLINICAL PHARMACOLOGY) Coadministration o...
2,DDI-DrugBank.d157.s2,There is no clinical experience to date on the...
3,DDI-DrugBank.d157.s3,Drug/Laboratory Test-Interactions None observed.
4,DDI-DrugBank.d110.s0,The administration of local anesthetic solutio...


### Entities dataframe

In [6]:
print('Entities dataframe')
entities_df_train.head()
#entities_df.info()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d157.s0.e0,cimetidine,34-43,drug
1,DDI-DrugBank.d157.s0.e1,warfarin,49-56,drug
2,DDI-DrugBank.d157.s0.e2,Femara,97-102,brand
3,DDI-DrugBank.d157.s1.e0,Femara,48-53,brand
4,DDI-DrugBank.d157.s1.e1,tamoxifen,59-67,drug


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }


In [24]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

n_words = 0
for k, v in label_dict.items():
    n_words += len(v)
print("number of words:", n_words)

number of words: 192768


### Tokenize sentences

In [8]:
sentences = [row['sentenceText'] for index, row in sentences_df.iterrows()]
# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))
#print(sentences)
print(len(sentences))
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

8492


### Stemming + POS

In [11]:
# POS needs to be extracted before stemming appended afterwards
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('original: ', s)
print('stemmed:  ', [stemmer.stem(w) for w in s])
print('Conclusion: It stemms the drug names too.')

original:  ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']
stemmed:   ['interact', 'between', 'cimetidin', 'and', 'warfarin', 'could', 'be', 'danger']
Conclusion: It stemms the drug names too.


### Appending POS tags
- tokens will have the form: stemmed-word_POS if argument stem=True
- tokens will have the form: word_POS         if argument stem=False

In [161]:
def tokenize_with_POS(sentences, pos=True, stem=True):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tokenized_pos = pos_tag_sents(tokenized_sentences, tagset=None)
    
    if pos is False and stem is False:
        print('original')
        return tokenized_sentences
    if stem and pos:
        print('stem + pos')
        tokenized_pos = [ [stemmer.stem(w) + '_' + pos for w, pos in s ] for s in tokenized_pos]
    if stem and pos is False:
        print('stem')
        tokenized_pos = [ [stemmer.stem(w) for w in s ] for s in tokenized_sentences]
    else:
        print('pos')
        tokenized_pos = [ [w + '_' + pos for w, pos in s ] for s in tokenized_pos]
    return tokenized_pos  

### Training word2vec with dimensions= vectorsize

In [162]:
vector_size = 20
window = 5
pos = False
stem = True

tokenized_sentences_pos = tokenize_with_POS(sentences, pos=pos, stem=stem)
print(tokenized_sentences_pos[0])
print(len(tokenized_sentences_pos))  

model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=window, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

# save embeddings and delete model
model.save("../word_vectors_stem_20")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

stem
['acut', 'effect', 'of', 'differ', 'antidepress', 'on', 'glycemia', 'in', 'diabet', 'and', 'non-diabet', 'rat', '.']
8492
latest loss: 0.0


In [163]:
#print(word_vectors["conduction_NN"]) 
#print(word_vectors["conduction"]) 
#print(word_vectors["conduct_NN"]) #stemmed + pos
print(word_vectors["conduct"])
#print(word_vectors['105_CD'])

[-1.1643752  -2.1342478   0.67933977 -0.60886997 -0.72474104  0.30156285
 -1.3156426  -0.8812983   0.5948269  -0.5462615   1.602318   -0.8439801
  0.7875856  -0.21448128  0.19546854  0.14384465 -0.15514095 -1.0530926
  0.0846498   0.38285807]


### Creating the dataset

In [164]:
def load_data(df, pos=True, stem=True):    
    sentences = [row['sentenceText'] for index, row in df.iterrows()]
    sentenceIDs = [row['sentenceID'] for index, row in df.iterrows()]
    tokenized_sentences_pos = tokenize_with_POS(sentences, pos, stem)
    print(len(sentences), len(sentenceIDs), len(tokenized_sentences_pos))
    
    # compute size of dataset 
    count = 0
    for sen, ID in (zip(tokenized_sentences_pos, sentenceIDs)):
        count += len(label_dict[ID])
        
    # allocate memory
    X = np.empty((count, vector_size))
    Y = np.chararray((count))
    
    i = 0
    for sen, ID in (zip(tokenized_sentences_pos, sentenceIDs)):
        #print(sen, ID)
        labels = label_dict[ID]
        #print(labels)
        for word, label in zip(sen, labels):
                word_vector = word_vectors[word]
                #print(word, label)
                X[i] = word_vector
                Y[i] = label
                i += 1
                #X = np.vstack((X, word_vector))
                #Y = np.append(Y, label)
    print(X.shape, Y.shape)
    return X, Y

In [165]:
X_train, Y_train = load_data(sentences_df_train, pos, stem)
X_test1, Y_test1 = load_data(sentences_df_test1, pos, stem)
X_test2, Y_test2 = load_data(sentences_df_test2, pos, stem)

stem
6832 6832 6832
(148031, 20) (148031,)
stem
664 664 664
(14896, 20) (14896,)
stem
1299 1299 1299
(29841, 20) (29841,)


In [166]:
data_path = os.path.join(ROOT_DIR, 'XY/STEM_20')

np.save(os.path.join(data_path, 'X_train.npy'), X_train)
np.save(os.path.join(data_path, 'X_test1.npy'), X_test1)
np.save(os.path.join(data_path, 'X_test2.npy'), X_test2)
np.save(os.path.join(data_path, 'Y_train.npy'), Y_train)
np.save(os.path.join(data_path, 'Y_test1.npy'), Y_test1)
np.save(os.path.join(data_path, 'Y_test2.npy'), Y_test2)

In [167]:
print (X_train.shape, Y_train.shape)
print (X_test1.shape, Y_test1.shape)
print (X_test2.shape, Y_test2.shape)

(148031, 20) (148031,)
(14896, 20) (14896,)
(29841, 20) (29841,)


### Encode labels
- Convert labels from B-I-O to $0, 1, 2$ for SVM
- Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$ for ANN

In [None]:
# encode class values as integers = B-I-O -> 0-1-2
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
Y_train = encoded_Y
# convert integers to one-hot encoding
Y_train_one_hot = np_utils.to_categorical(encoded_Y) # SVM does not need one-hot encoding