In [3]:
# Gensim
import gensim
from gensim.models import Word2Vec

# to make nbs importable
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np

from multiprocessing import cpu_count

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, StandardScaler

from keras.utils import np_utils




### Reading sentences from structured data frames

In [4]:
# read dataframes of sentences and entities

# TRAIN SET
sentences_df_train = pd.read_csv(SENTENCE_PATH_train)
entities_df_train = pd.read_csv(ENTITY_PATH_train)

#TEST SET
sentences_df_test1 = pd.read_csv(SENTENCE_PATH_test1)
entities_df_test1 = pd.read_csv(ENTITY_PATH_test1)

#TEST2 SET
sentences_df_test2 = pd.read_csv(SENTENCE_PATH_test2)
entities_df_test2 = pd.read_csv(ENTITY_PATH_test2)

print(len(sentences_df_train), len(sentences_df_test1), len(sentences_df_test2))

6832 664 1299


###  Concatenating training and test data for the word2vec training!

In [5]:
sentences_df = pd.concat([sentences_df_train,
                          sentences_df_test1,
                          sentences_df_test2]).drop_duplicates().reset_index(drop=True)

entities_df = pd.concat([entities_df_train,
                         entities_df_test1,
                         entities_df_test2]).drop_duplicates().reset_index(drop=True)

### Sentences dataframe

In [6]:
print('Sentences dataframe')
sentences_df_train.head()
#sentences_df_test2.info()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d281.s0,Probenecid may decrease renal tubular secretio...
1,DDI-DrugBank.d281.s1,Drug/Laboratory Test Interactions A false posi...
2,DDI-DrugBank.d281.s2,Positive direct and indirect antiglobulin (Coo...
3,DDI-DrugBank.d281.s3,these may also occur in neonates whose mothers...
4,DDI-DrugBank.d384.s0,Interactions for vitamin D analogues (Vitamin ...


### Entities dataframe

In [7]:
print('Entities dataframe')
entities_df_train.head()
#entities_df.info()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d281.s0.e0,Probenecid,0-9,drug
1,DDI-DrugBank.d281.s0.e1,cephalosporins,51-64,group
2,DDI-DrugBank.d281.s0.e2,cephalosporin,132-144,group
3,DDI-DrugBank.d281.s3.e0,cephalosporins,56-69,group
4,DDI-DrugBank.d384.s0.e0,vitamin D analogues,17-35,group


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }


In [8]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

### Tokenize sentences

In [9]:
sentences = [row['sentenceText'] for index, row in sentences_df.iterrows()]
# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))
#print(sentences)
print(len(sentences))
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

8492


### Stemming + POS

In [10]:
# POS needs to be extracted before stemming appended afterwards
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('original: ', s)
print('stemmed:  ', [(stemmer.stem(w), pos_tag([w])) for w in s])
print('Conclusion: It stemms the drug names too.')

original:  ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']
stemmed:   [('interact', [('interaction', 'NN')]), ('between', [('between', 'IN')]), ('cimetidin', [('cimetidine', 'NN')]), ('and', [('and', 'CC')]), ('warfarin', [('warfarin', 'NN')]), ('could', [('could', 'MD')]), ('be', [('be', 'VB')]), ('danger', [('dangerous', 'JJ')])]
Conclusion: It stemms the drug names too.


### Appending POS tags
- tokens will have the form: stemmed-word_POS if argument stem=True
- tokens will have the form: word_POS         if argument stem=False

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

punctuation = [".",",", ":", ";", "!", "?", "(", ")", "%", "[","]", "-", "e.g."]
stop_words = set(stopwords.words('english'))

def isNumber(inputString):
    try:
        int(inputString)
        return True
    except ValueError:
        return False

def tokenize(sentences, stem=True, POS=True, remove_stopwords=False, remove_punctuation=True, lower=False, remove_nums=False):
    tokenized = [word_tokenize(sentence) for sentence in sentences]
    #print(tokenized)
    if lower:
        tokenized = [[w.lower() for w in s] for s in tokenized]
    if remove_punctuation:
        tokenized = [[w for w in s if w not in punctuation] for s in tokenized]
    if remove_nums:
        tokenized = [[w for w in s if not isNumber(w)] for s in tokenized]
        #print(tokenized)
    if remove_stopwords:
        tokenized = [[w for w in s if w.lower() not in stop_words] for s in tokenized]
        #print(tokenized)
    if POS:
        tokenized = pos_tag_sents(tokenized, tagset=None)
        #print(tokenized)
    if stem and POS:
        tokenized = [ [stemmer.stem(w) + '_' + pos for w, pos in s ] for s in tokenized]
    if stem and not POS:
        tokenized = [ [stemmer.stem(w) for w in s ] for s in tokenized]
        #filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return tokenized

In [12]:
tokenize(["This is, my first dinner at Barcelona. Having a 7 blast!", "Hey you!"],
         POS=True,
         stem=False,
         remove_stopwords=True,
         lower=True,
         remove_nums=True)


[[('first', 'RB'), ('dinner', 'NN'), ('barcelona', 'NN'), ('blast', 'NN')],
 [('hey', 'NN')]]

### Training word2vec with dimensions= vectorsize

In [13]:
vector_size = 20
window = 5
pos = False
stem = True
remove_stopwords=False
remove_punctuation=True
lower=False
remove_nums=False

tokenized_sentences_pos = tokenize(sentences,
                                   POS=pos,
                                   stem=stem,
                                   remove_stopwords=remove_stopwords,
                                   remove_punctuation=remove_punctuation,
                                   lower=lower,
                                   remove_nums=remove_nums)
print(tokenized_sentences_pos[0])
print(len(tokenized_sentences_pos))  

model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=window, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

# save embeddings and delete model
model.save("../word_vectors_stem_20")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

['the', 'elev', 'plasma', 'alprazolam', 'concentr', 'result', 'in', 'decreas', 'psychomotor', 'perform', 'and', 'memori']
8492
latest loss: 0.0


In [14]:
#print(word_vectors["conduction_NN"]) 
#print(word_vectors["conduction"]) 
#print(word_vectors["conduct_NN"]) #stemmed + pos
print(word_vectors["conduct"])
#print(word_vectors['105_CD'])

[ 1.3487974   0.14439005 -0.28479916  1.5563012   0.64845455 -0.96170324
 -0.12372573  0.6140399  -0.82571924  1.0679381  -0.34881476  0.94606185
 -0.19111978 -0.22339945  0.9301789  -0.870071   -0.45964292 -0.33546418
  2.1522143  -1.1935843 ]


### Creating the dataset

In [15]:
vector_size = 20
window = 5
pos = False
stem = True
remove_stopwords=False
remove_punctuation=True
lower=False
remove_nums=False


def load_data(df, pos=True, stem=True, remove_stopwords=True, remove_punctuation=True, lower=False, remove_nums=True):    
    sentences = [row['sentenceText'] for index, row in df.iterrows()]
    sentenceIDs = [row['sentenceID'] for index, row in df.iterrows()]
    tokenized_sentences_pos = tokenize(sentences, POS=pos, stem=stem,
                                      remove_stopwords=remove_stopwords,
                                      remove_punctuation=remove_punctuation,
                                      lower=lower, remove_nums=False)
    print(len(sentences), len(sentenceIDs), len(tokenized_sentences_pos))
    
    # compute size of dataset 
    count = 0
    for sen, ID in (zip(tokenized_sentences_pos, sentenceIDs)):
        count += len(label_dict[ID])
        
    # allocate memory
    X = np.empty((count, vector_size))
    Y = np.chararray((count))
    
    i = 0
    for sen, ID in (zip(tokenized_sentences_pos, sentenceIDs)):
        #print(sen, ID)
        labels = label_dict[ID]
        #print(labels)
        for word, label in zip(sen, labels):
                word_vector = word_vectors[word]
                #print(word, label)
                X[i] = word_vector
                Y[i] = label
                i += 1
                #X = np.vstack((X, word_vector))
                #Y = np.append(Y, label)
    print(X.shape, Y.shape)
    return X, Y

In [16]:
X_train, Y_train = load_data(sentences_df_train, pos, stem)
X_test1, Y_test1 = load_data(sentences_df_test1, pos, stem)
X_test2, Y_test2 = load_data(sentences_df_test2, pos, stem)

6832 6832 6832
(148031, 20) (148031,)
664 664 664
(14896, 20) (14896,)
1299 1299 1299
(29839, 20) (29839,)


In [17]:
data_path = os.path.join(ROOT_DIR, 'XY/STEM_20')

np.save(os.path.join(data_path, 'X_train.npy'), X_train)
np.save(os.path.join(data_path, 'X_test1.npy'), X_test1)
np.save(os.path.join(data_path, 'X_test2.npy'), X_test2)
np.save(os.path.join(data_path, 'Y_train.npy'), Y_train)
np.save(os.path.join(data_path, 'Y_test1.npy'), Y_test1)
np.save(os.path.join(data_path, 'Y_test2.npy'), Y_test2)

In [18]:
print (X_train.shape, Y_train.shape)
print (X_test1.shape, Y_test1.shape)
print (X_test2.shape, Y_test2.shape)

(148031, 20) (148031,)
(14896, 20) (14896,)
(29839, 20) (29839,)


### Encode labels
- Convert labels from B-I-O to $0, 1, 2$ for SVM
- Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$ for ANN

In [19]:
# encode class values as integers = B-I-O -> 0-1-2
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
Y_train = encoded_Y

Y_test = encoder.fit_transform(Y_test1)

### Creating the feature matrix

In [20]:
def is_capitalized(word):
    if not word:
        return 0
    if word[0].isupper():
        return 0.5
    if word.isupper():
        return 1
    return 0

In [21]:
def has_POS_NN(word):
    if not word:
        return 0
    if pos_tag([word]) == 'NN':
        return 1
    else:
        return 0
#pos_tag(["cat"])   

In [22]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def has_numbers(word):
    if isNumber(word):
        return 0
    else:
        if hasNumbers(word):
            return 1
        else:
            return 0.5
        
        

In [23]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [24]:
surrounding = ['test', 'dosage', 'concentrations', 'induce', 'inhibit', 'treatment', 'coadministration',
               'studies', 'supplements', 'intake', 'therapy', 'doses', 'use', 'given', 'administration']

def possible_pos(sentence, word_position):
    mx = min(len(sentence) - 1, word_position + 2)
    mn = max(0, word_position - 2)
    return range(mn, mx+1)

def is_trigger(word, sentence, word_position):
    if not word:
        return 0
    for word_pos in possible_pos(sentence, word_position):
        if np.argmax([similar(sentence[word_pos], w) for w in surrounding])> 0.7:
            return 1
    return 0

In [25]:
vowels = ["a", "e", "i", "o", "u"]
def has_more_consonants(word):
    number_of_vowels = sum(word.count(c) for c in vowels)
    number_of_consonants = len(word) - number_of_vowels
    if number_of_consonants > number_of_vowels:
        return 1
    else:
        return 0

In [26]:
surrounding = ['test', 'dosage', 'concentrations', 'induce', 'inhibit', 'treatment', 'coadministration',
               'studies', 'supplements', 'intake', 'therapy', 'doses', 'use', 'given', 'administration', 'indicate']

words = list()
sentenceIDs = list()

def prepare_words(df):
    sentences_train1 = [row['sentenceText'] for index, row in df.iterrows()]
    sentenceIDs_train = [row['sentenceID'] for index, row in df.iterrows()]
    sentences_train = tokenize(sentences_train1, POS=False, stem=False, remove_stopwords=True, remove_nums=True)

    for ID, sentence in zip(sentences_train1, sentences_train):
        for word in sentence:
            words.append(word)
            sentenceIDs.append(ID)
    
    s = {'sentence':sentenceIDs, 'word':words}
    prep = pd.DataFrame(data=s)
    return prep

train = prepare_words(sentences_df_train)
train.info()
print(len(sentences_df_train))

test = prepare_words(sentences_df_test1)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79252 entries, 0 to 79251
Data columns (total 2 columns):
sentence    79252 non-null object
word        79252 non-null object
dtypes: object(2)
memory usage: 1.2+ MB
6832
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87484 entries, 0 to 87483
Data columns (total 2 columns):
sentence    87484 non-null object
word        87484 non-null object
dtypes: object(2)
memory usage: 1.3+ MB


In [27]:
X_tr = np.zeros([train.shape[0], 25])

for index, row in train.iterrows():
    word = row['word']
    sentence = word_tokenize(row['sentence'])
    word_position = sentence.index(word)
    #print(word, word_position, sentence[word_position])
    vector = [is_capitalized(word), is_trigger(word, sentence, word_position), has_numbers(word), has_POS_NN(word), has_more_consonants(word)] # + add embedding
    #print(vector)similar("Apple","Appel")
    sentence = tokenize([row['sentence']], POS=pos, stem=stem,
                        remove_stopwords=remove_stopwords,
                        remove_punctuation=remove_punctuation,
                        lower=lower, remove_nums=remove_nums)
    #print(sentence, word_position)
    if word_position >=len(sentence[0]):
        word_position = len(sentence[0]) - 1
    vector.extend(word_vectors[sentence[0][word_position]].tolist())
    #print(vector)
    vector = np.array(vector)
    #print(embed, vector)
    #vector = list(vector + embed.T)
    #print(len(vector), X_ts.shape)
    vector = np.array(vector).reshape(1, -1)
    #print(vector.shape, X_ts.shape)
    X_tr[index,:] = vector
    
print(X_tr.shape)

(79252, 25)


In [32]:
print(Y_train.shape)

(148031,)


In [28]:
X_ts = np.zeros([test.shape[0], 25])

for index, row in test.iterrows():
    word = row['word']
    sentence = word_tokenize(row['sentence'])
    word_position = sentence.index(word)
    #print(word, word_position, sentence[word_position])
    vector = [is_capitalized(word), is_trigger(word, sentence, word_position), has_numbers(word), has_POS_NN(word), has_more_consonants(word)] # + add embedding
    #print(vector)similar("Apple","Appel")
    sentence = tokenize([row['sentence']], POS=pos, stem=stem,
                        remove_stopwords=remove_stopwords,
                        remove_punctuation=remove_punctuation,
                        lower=lower, remove_nums=remove_nums)
    #print(sentence, word_position)
    if word_position >=len(sentence[0]):
        word_position = len(sentence[0]) - 1
    vector.extend(word_vectors[sentence[0][word_position]].tolist())
    #print(vector)
    vector = np.array(vector)
    #print(embed, vector)
    #vector = list(vector + embed.T)
    #print(len(vector), X_ts.shape)
    vector = np.array(vector).reshape(1, -1)
    #print(vector.shape, X_ts.shape)
    X_ts[index,:] = vector
    
print(X_ts.shape)

(87484, 25)


In [29]:
from sklearn import svm

# one-vs-all classifier
model = svm.SVC(kernel='rbf', 
                C=1.0,
                class_weight=None,
                gamma='auto',
                #penalty='l2',
                #loss='squared_hinge',
                tol=0.001, random_state=None)

In [30]:
model.fit(X_tr, Y_train)
predictions = model.predict(X_ts, Y_ts)

ValueError: Found input variables with inconsistent numbers of samples: [79252, 148031]

In [None]:
print(X_tr.shape, Y_train.shape)

In [None]:
print(X_ts.shape, Y_test.shape)