In [1]:
# Gensim
import gensim
from gensim.models import Word2Vec

# to make nbs importable
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np

from multiprocessing import cpu_count

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents



### Reading sentences from structured data frames

In [2]:
# read dataframes of sentences and entities

# TRAIN SET
sentences_df_train = pd.read_csv(SENTENCE_PATH_train)
entities_df_train = pd.read_csv(ENTITY_PATH_train)

#TEST SET
sentences_df_test1 = pd.read_csv(SENTENCE_PATH_test1)
entities_df_test1 = pd.read_csv(ENTITY_PATH_test1)

#TEST2 SET
sentences_df_test2 = pd.read_csv(SENTENCE_PATH_test2)
entities_df_test2 = pd.read_csv(ENTITY_PATH_test2)

print(len(sentences_df_train), len(sentences_df_test1), len(sentences_df_test2))

6832 664 1299


In [3]:
# Concatenating training and test data for the word2vec training!
sentences_df = pd.concat([sentences_df_train,
                          sentences_df_test1,
                          sentences_df_test2]).drop_duplicates().reset_index(drop=True)

entities_df = pd.concat([entities_df_train,
                         entities_df_test1,
                         entities_df_test2]).drop_duplicates().reset_index(drop=True)

### Sentences dataframe

In [4]:
print('Sentences dataframe')
sentences_df_train.head()
#sentences_df_test2.info()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d281.s0,Probenecid may decrease renal tubular secretio...
1,DDI-DrugBank.d281.s1,Drug/Laboratory Test Interactions A false posi...
2,DDI-DrugBank.d281.s2,Positive direct and indirect antiglobulin (Coo...
3,DDI-DrugBank.d281.s3,these may also occur in neonates whose mothers...
4,DDI-DrugBank.d384.s0,Interactions for vitamin D analogues (Vitamin ...


### Entities dataframe

In [5]:
print('Entities dataframe')
entities_df_train.head()
#entities_df.info()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d281.s0.e0,Probenecid,0-9,drug
1,DDI-DrugBank.d281.s0.e1,cephalosporins,51-64,group
2,DDI-DrugBank.d281.s0.e2,cephalosporin,132-144,group
3,DDI-DrugBank.d281.s3.e0,cephalosporins,56-69,group
4,DDI-DrugBank.d384.s0.e0,vitamin D analogues,17-35,group


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }


In [6]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

### Tokenize sentences

In [7]:
sentences = [row['sentenceText'] for index, row in sentences_df.iterrows()]
# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))
#print(sentences)
print(len(sentences))
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

8492


### Stemming + POS

In [8]:
# POS needs to be extracted before stemming appended afterwards
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('original: ', s)
print('stemmed:  ', [stemmer.stem(w) for w in s])
print('Conclusion: It stemms the drug names too.')

original:  ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']
stemmed:   ['interact', 'between', 'cimetidin', 'and', 'warfarin', 'could', 'be', 'danger']
Conclusion: It stemms the drug names too.


### Appending POS tags
- tokens will have the form: stemmed-word_POS if argument stem=True
- tokens will have the form: word_POS         if argument stem=False

In [9]:
def tokenize_with_POS(sentences, stem=True):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tokenized_pos = pos_tag_sents(tokenized_sentences, tagset=None)
    if stem:
        tokenized_pos = [ [stemmer.stem(w) + '_' + pos for w, pos in s ] for s in tokenized_pos]
    else:
        tokenized_pos = [ [w + '_' + pos for w, pos in s ] for s in tokenized_pos]
    return tokenized_pos  

### Training word2vec with dimensions= vectorsize

In [10]:
vector_size = 20

tokenized_sentences_pos = tokenize_with_POS(sentences)
print(tokenized_sentences_pos[0])
print(len(tokenized_sentences_pos))  

model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=5, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

# save embeddings and delete model
model.save("../word_vectors")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

['the_DT', 'effect_NN', 'of_IN', 'gabapentin_NN', 'on_IN', 'cimetidin_NN', 'was_VBD', 'not_RB', 'evalu_VBN', '._.']
8492
latest loss: 0.0


In [11]:
#print(word_vectors["conduction_NN"]) 
print(word_vectors["conduct_NN"]) #stemmed
print(word_vectors['105_CD'])

[-0.64336133 -0.13537563 -0.12868695 -0.13165009 -0.10287455  0.5218419
 -0.193023   -0.02495184  0.3221799  -0.35481405  0.76113516  0.0760758
 -0.44677114  0.24988987  0.35520503  0.47339836  0.32494453  0.57354724
 -0.15979838 -0.18083699]
[-0.03398504 -0.01368822 -0.01854203  0.04567573  0.01171889  0.03160236
  0.01385679 -0.02284383  0.00455418  0.0305319   0.01115585  0.01047091
 -0.0091681   0.04327663 -0.01575882 -0.01894831  0.03560774  0.01607608
 -0.03072158 -0.05516386]


### Creating the dataset

In [19]:
def load_data(df):
    X = np.array([]).reshape(0,vector_size)
    Y = np.array([])
    sentences = [row['sentenceText'] for index, row in df.iterrows()]
    sentenceIDs = [row['sentenceID'] for index, row in df.iterrows()]
    tokenized_sentences_pos = tokenize_with_POS(sentences)
    print(len(sentences), len(sentenceIDs), len(tokenized_sentences_pos))
    for sen, ID in (zip(tokenized_sentences_pos, sentenceIDs)):
        #print(sen, ID)
        labels = label_dict[ID]
        #print(labels)
        for word, label in zip(sen, labels):
                word_vector = word_vectors[word]
                #print(word, label)
                X = np.vstack((X, word_vector))
                Y = np.append(Y, label)
    print(X.shape, Y.shape)
    return X, Y

In [20]:
# create X_train, Y_train
X_train = np.array([]).reshape(0,vector_size)
Y_train = np.array([])

X_test1 = np.array([]).reshape(0,vector_size)
Y_test1 = np.array([])

X_test2 = np.array([]).reshape(0,vector_size)
Y_test2 = np.array([])

In [29]:
#X_train, Y_train = load_data(sentences_df_train)
X_test1, Y_test1 = load_data(sentences_df_test1)
#X_test2, Y_test2 = load_data(sentences_df_test2)


664 664 664
(14896, 20) (14896,)


In [30]:
data_path = os.path.join(ROOT_DIR, 'XY')

#np.save(os.path.join(data_path, 'X_train.npy'), X_train)
np.save(os.path.join(data_path, 'X_test1.npy'), X_test1)
#np.save(os.path.join(data_path, 'X_test2.npy'), X_test2)
#np.save(os.path.join(data_path, 'Y_train.npy'), Y_train)
np.save(os.path.join(data_path, 'Y_test1.npy'), Y_test1)
#np.save(os.path.join(data_path, 'Y_test2.npy'), Y_test2)


In [32]:
#print (X_train.shape, Y_train.shape)
print (X_test1.shape, Y_test1.shape)
#print (X_test2.shape, Y_test2.shape)

(14896, 20) (14896,)


### Encode labels
- Convert labels from B-I-O to $0, 1, 2$ for SVM
- Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$ for ANN

In [None]:
# encode class values as integers = B-I-O -> 0-1-2
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
Y_train = encoded_Y
# convert integers to one-hot encoding
Y_train_one_hot = np_utils.to_categorical(encoded_Y) # SVM does not need one-hot encoding