In [2]:
# Gensim
import gensim
from gensim.models import Word2Vec

import os

import pandas as pd
import numpy as np

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from multiprocessing import cpu_count

# scikit learn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score

### Reading sentences from structured data frames

In [8]:
# read dataframes of sentences and entities

# TRAIN SET
sentences_df_train = pd.read_csv(SENTENCE_PATH_train)
entities_df_train = pd.read_csv(ENTITY_PATH_train)

#TEST SET
sentences_df_test1 = pd.read_csv(SENTENCE_PATH_test1)
entities_df_test1 = pd.read_csv(ENTITY_PATH_test1)

#TEST2 SET
sentences_df_test2 = pd.read_csv(SENTENCE_PATH_test2)
entities_df_test2 = pd.read_csv(ENTITY_PATH_test2)

In [9]:
# Concatenating training and test data for the word2vec training!
sentences_df = pd.concat([sentences_df_train,
                          sentences_df_test1,
                          sentences_df_test2]).drop_duplicates().reset_index(drop=True)

entities_df = pd.concat([entities_df_train,
                         entities_df_test1,
                         entities_df_test2]).drop_duplicates().reset_index(drop=True)

### Sentences dataframe

In [6]:
print('Sentences dataframe')
sentences_df_train.head()
#sentences_df_test2.info()

Sentences dataframe


Unnamed: 0,sentenceID,sentenceText
0,DDI-DrugBank.d281.s0,Probenecid may decrease renal tubular secretio...
1,DDI-DrugBank.d281.s1,Drug/Laboratory Test Interactions A false posi...
2,DDI-DrugBank.d281.s2,Positive direct and indirect antiglobulin (Coo...
3,DDI-DrugBank.d281.s3,these may also occur in neonates whose mothers...
4,DDI-DrugBank.d384.s0,Interactions for vitamin D analogues (Vitamin ...


### Entities dataframe

In [5]:
print('Entities dataframe')
entities_df_train.head()
#entities_df.info()

Entities dataframe


Unnamed: 0,entityID,name,position,type
0,DDI-DrugBank.d281.s0.e0,Probenecid,0-9,drug
1,DDI-DrugBank.d281.s0.e1,cephalosporins,51-64,group
2,DDI-DrugBank.d281.s0.e2,cephalosporin,132-144,group
3,DDI-DrugBank.d281.s3.e0,cephalosporins,56-69,group
4,DDI-DrugBank.d384.s0.e0,vitamin D analogues,17-35,group


### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }


In [10]:
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy').item()

sentenceIDs = label_dict.keys()

In [23]:
sentences = [row['sentenceText'] for index, row in sentences_df.iterrows()]
# remove duplicates from sentence list (sentences with e.g. 2 entities appeared twice)
sentences = list(set(sentences))

8795
8492


### Tokenize sentences

In [13]:
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
len(tokenized_sentences)

8492

### Lemma (todo)


In [14]:
### EXAMPLE STEM + POS ####
# POS could differ slightly when applied to the stemmed version or not
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = ['interaction', 'between', 'cimetidine', 'and', 'warfarin', 'could', 'be', 'dangerous']

print ('stemmed version:\n')
stemmed_s = [stemmer.stem(w) for w in s]
print (stemmed_s)

print ('\noriginal pos tags:\n')
print(pos_tag(s))

print ('\nstemmed pos tags:\n')
print(pos_tag(stemmed_s))

stemmed version:

['interact', 'between', 'cimetidin', 'and', 'warfarin', 'could', 'be', 'danger']

original pos tags:

[('interaction', 'NN'), ('between', 'IN'), ('cimetidine', 'NN'), ('and', 'CC'), ('warfarin', 'NN'), ('could', 'MD'), ('be', 'VB'), ('dangerous', 'JJ')]

stemmed pos tags:

[('interact', 'NN'), ('between', 'IN'), ('cimetidin', 'NN'), ('and', 'CC'), ('warfarin', 'NN'), ('could', 'MD'), ('be', 'VB'), ('danger', 'JJR')]


### POS tag

In [16]:
tokenized_sentences_pos = pos_tag_sents(tokenized_sentences, tagset=None) # tagset = None, 'universal', 'wsj', 'brown'

# concatenate the part of speach to each word (e.g. cat_NN)
tokenized_sentences_pos = [ [w + '_' + pos for w, pos in s ] for s in tokenized_sentences_pos]
tokenized_sentences_pos[0]

['Fourteen_NNP',
 'days_NNS',
 'later_RB',
 ',_,',
 'all_DT',
 'animals_NNS',
 'were_VBD',
 'challenged_VBN',
 'with_IN',
 'a_DT',
 'single_JJ',
 'hypnotic_JJ',
 'dose_NN',
 'of_IN',
 'ethanol_NN',
 '(_(',
 '3.5_CD',
 'g/kg_NN',
 'IP_NNP',
 ')_)',
 '._.']

In [17]:
vector_size = 20
model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=5, min_count=1, workers=cpu_count(), compute_loss=True)
model.train(sentences, total_examples=len(sentences), epochs=10)
print ('latest loss:', model.get_latest_training_loss())

# save embeddings and delete model
model.save("../word_vectors")
#model = Word2Vec.load('../word_vectors')
word_vectors = model.wv
del model

latest loss: 0.0


In [18]:
print(word_vectors["conduction_NN"])
print(word_vectors['105_CD'])

[ 0.25848603  0.08273632 -0.08718783  0.27140656  0.27854267 -0.02318844
 -0.25571734 -0.17348506 -0.20984288 -0.2621599  -0.22950882 -0.3967506
  0.35958096  0.24631514  0.01595954  0.10744167  0.12884106 -0.6890047
  0.25285932 -0.4622117 ]
[ 0.00748834  0.03906478  0.0067243   0.01321432  0.02106611  0.02975854
 -0.03512239  0.00209284 -0.00719997 -0.0222114  -0.01517265 -0.00194615
  0.01409792  0.03328705  0.01150845  0.01233158 -0.01857619 -0.04219748
  0.01378438 -0.03474882]


### Creating the dataset

In [20]:
def load_data(df):
    X = np.array([]).reshape(0,vector_size)
    words = np.array([])
    Y = np.array([])
    for sentenceID, labels in label_dict.items():
        if df[df.sentenceID == sentenceID].empty:
            #print('empty')
            continue
        else: 
            sentence = df[df.sentenceID == sentenceID]['sentenceText'].values[0] 
        tok_sentence = word_tokenize(sentence)
        tok_sentence_pos = [ word + '_' + pos for word, pos in pos_tag(tok_sentence, tagset=None)]

        for word, label in zip(tok_sentence_pos, labels):
            word_vector = word_vectors[word]
            X = np.vstack((X, word_vector))
            Y = np.append(Y, label)       
            words = np.append(words, word)
            #print(word)
    return X, Y, words

In [21]:
# create X_train, Y_train
X_train = np.array([]).reshape(0,vector_size)
Y_train = np.array([])
words_train = np.array([])

X_test1 = np.array([]).reshape(0,vector_size)
Y_test1 = np.array([])
words_test1 = np.array([])

#X_test2 = np.array([]).reshape(0,vector_size)
#Y_test2 = np.array([])

In [None]:
X_train, Y_train, words_train = load_data(sentences_df_train)
X_test1, Y_test1, words_test1 = load_data(sentences_df_test1)
#X_test2, Y_test2, words_test2 = load_data(sentences_df_test2)

In [None]:
print (X_train.shape, Y_train.shape)
print (X_test1.shape, Y_test1.shape)
#print (X_test2.shape, Y_test2.shape)

### Encode labels
- Convert labels from B-I-O to $0, 1, 2$ for SVM
- Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$ for ANN

In [None]:
# encode class values as integers = B-I-O -> 0-1-2
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
Y_train = encoded_Y
# convert integers to one-hot encoding
Y_train_one_hot = np_utils.to_categorical(encoded_Y) # SVM does not need one-hot encoding