In [None]:
# Gensim
import gensim
from gensim.models import Word2Vec

# to make nbs importable
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

import pandas as pd
import numpy as np

from multiprocessing import cpu_count

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents



### Reading sentences from structured data frames

In [None]:
# Load the sentence and entity tables of every split from CSV.
# The *_PATH_* constants are not defined in this notebook; presumably they
# come from the star import of `paths`.

# Training split
sentences_df_train, entities_df_train = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_train, ENTITY_PATH_train)
)

# First test split
sentences_df_test1, entities_df_test1 = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_test1, ENTITY_PATH_test1)
)

# Second test split
sentences_df_test2, entities_df_test2 = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_test2, ENTITY_PATH_test2)
)

# Number of rows in each sentence table (train, test1, test2).
print(*(len(frame) for frame in (sentences_df_train, sentences_df_test1, sentences_df_test2)))

In [None]:
# Pool the training split and both test splits into a single corpus for
# word2vec training: stack the frames, drop exact duplicate rows, then
# renumber the index from 0.
sentences_df = (
    pd.concat([sentences_df_train, sentences_df_test1, sentences_df_test2])
    .drop_duplicates()
    .reset_index(drop=True)
)

entities_df = (
    pd.concat([entities_df_train, entities_df_test1, entities_df_test2])
    .drop_duplicates()
    .reset_index(drop=True)
)

### Sentences dataframe

In [None]:
# Preview the first rows of the training sentences table; the bare
# `.head()` expression is what the cell displays.
print('Sentences dataframe')
sentences_df_train.head()
# Optional: column dtypes / non-null counts of the second test split.
#sentences_df_test2.info()

### Entities dataframe

In [None]:
# Preview the first rows of the training entities table; the bare
# `.head()` expression is what the cell displays.
print('Entities dataframe')
entities_df_train.head()
# Optional: column dtypes / non-null counts of the pooled entities table.
#entities_df.info()

### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }


In [None]:
# Load the {sentenceID: [BIO labels]} mapping stored in a .npy file.
# `.item()` unwraps the 0-d object array that holds the Python object.
# NumPy >= 1.16.3 defaults to allow_pickle=False and refuses to load object
# arrays, so the flag has to be passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy', allow_pickle=True).item()

# Live view of the sentence IDs that have labels.
sentenceIDs = label_dict.keys()

### Tokenize sentences

In [None]:
# Raw sentence strings of the pooled corpus (column access instead of a
# row-by-row iterrows loop).
sentences = sentences_df['sentenceText'].tolist()
# Remove duplicates from the sentence list (sentences with e.g. 2 entities
# appeared twice). dict.fromkeys keeps first-occurrence order, so the corpus
# order is identical on every run; set() ordering of strings changes between
# interpreter sessions because of hash randomisation.
sentences = list(dict.fromkeys(sentences))
print(len(sentences))
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

### Stemming + POS

In [None]:
# POS tags have to be extracted from the unstemmed words; the tag is
# appended to the token after stemming.
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = [
    'interaction', 'between', 'cimetidine', 'and',
    'warfarin', 'could', 'be', 'dangerous',
]

# Small demonstration of what the stemmer does to an example sentence.
print ('original: ', s)
print('stemmed:  ', list(map(stemmer.stem, s)))
print('Conclusion: It stemms the drug names too.')

### Appending POS tags
- tokens will have the form: stemmed-word_POS if argument stem=True
- tokens will have the form: word_POS         if argument stem=False

In [None]:
def tokenize_with_POS(sentences, stem=True):
    """Tokenize sentences and suffix every token with its POS tag.

    Args:
        sentences: iterable of raw sentence strings.
        stem: if True, each word is passed through the notebook-level
            `stemmer` before the tag is appended ("stem_POS"); if False the
            word is kept unchanged ("word_POS").

    Returns:
        A list with one entry per sentence, each a list of
        "<token>_<POS>" strings.
    """
    # Tagging runs on the original (unstemmed) words.
    tagged_sentences = pos_tag_sents(
        [word_tokenize(text) for text in sentences], tagset=None
    )
    if not stem:
        return [
            ['_'.join((word, tag)) for word, tag in tagged]
            for tagged in tagged_sentences
        ]
    return [
        ['_'.join((stemmer.stem(word), tag)) for word, tag in tagged]
        for tagged in tagged_sentences
    ]

### Training word2vec with dimensions = vector_size

In [None]:
# Dimensionality of the word embeddings.
vector_size = 20

# Corpus as lists of "<stem>_<POS>" tokens, one list per sentence.
tokenized_sentences_pos = tokenize_with_POS(sentences)
print(tokenized_sentences_pos[0])
print(len(tokenized_sentences_pos))

# Passing the corpus to the constructor builds the vocabulary and runs the
# initial training pass.
# NOTE(review): `size=` is the pre-4.0 gensim keyword; newer gensim releases
# call it `vector_size=` -- adjust if the installed version rejects it.
model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=5, min_count=1, workers=cpu_count(), compute_loss=True)
# Continue training on the SAME tokenised corpus. Feeding the raw `sentences`
# strings here would make gensim iterate over single characters, none of
# which are in the "<stem>_<POS>" vocabulary, so nothing would be learned.
# compute_loss must be repeated because train() resets it to its default.
model.train(tokenized_sentences_pos,
            total_examples=len(tokenized_sentences_pos),
            epochs=10,
            compute_loss=True)
print ('latest loss:', model.get_latest_training_loss())

# save embeddings and delete model
model.save("../word_vectors")
#model = Word2Vec.load('../word_vectors')
# Keep only the keyed vectors for look-ups; the full model is released.
word_vectors = model.wv
del model

In [None]:
# Sanity check: look up the embeddings of two tokens. Keys have the
# "<token>_<POS>" form produced by tokenize_with_POS.
# Unstemmed variant of the first key, kept for reference:
#print(word_vectors["conduction_NN"]) 
print(word_vectors["conduct_NN"]) #stemmed
print(word_vectors['105_CD'])

### Creating the dataset

In [None]:
def load_data(df):
    """Build the token-level feature matrix and label vector for one split.

    Relies on notebook-level names: `tokenize_with_POS`, `label_dict`,
    `word_vectors` and `vector_size`.

    Args:
        df: DataFrame with 'sentenceText' and 'sentenceID' columns.

    Returns:
        (X, Y) where X has one row per token and `vector_size` columns, and
        Y is a 1-D array holding the label of each row of X.

    Raises:
        Whatever `label_dict[ID]` / `word_vectors[word]` raise for an unknown
        sentence ID or token (presumably KeyError -- confirm).
    """
    sentences = df['sentenceText'].tolist()
    sentenceIDs = df['sentenceID'].tolist()
    tokenized_sentences_pos = tokenize_with_POS(sentences)
    print(len(sentences), len(sentenceIDs), len(tokenized_sentences_pos))

    # Collect rows in plain lists and build the arrays once at the end;
    # growing X/Y with np.vstack/np.append per token copies the whole array
    # on every step (quadratic in the number of tokens).
    token_vectors = []
    token_labels = []
    for sen, ID in zip(tokenized_sentences_pos, sentenceIDs):
        # zip stops at the shorter sequence: if the token count and the label
        # count of a sentence differ, the surplus items are silently ignored.
        for word, label in zip(sen, label_dict[ID]):
            token_vectors.append(word_vectors[word])
            token_labels.append(label)

    # Start from the same empty float64 bases as before so that the result
    # dtypes and the empty-input shapes stay unchanged.
    X = np.empty((0, vector_size))
    if token_vectors:
        X = np.vstack((X, np.vstack(token_vectors)))
    Y = np.append(np.array([]), token_labels)
    print(X.shape, Y.shape)
    return X, Y

In [None]:
# Placeholders for every split: an empty feature matrix with `vector_size`
# columns and an empty label vector.
X_train, Y_train = np.empty((0, vector_size)), np.array([])

X_test1, Y_test1 = np.empty((0, vector_size)), np.array([])

X_test2, Y_test2 = np.empty((0, vector_size)), np.array([])

In [None]:
# Build the (features, labels) arrays for each split; load_data prints the
# intermediate counts and the resulting shapes as it goes.
X_train, Y_train = load_data(sentences_df_train)
X_test1, Y_test1 = load_data(sentences_df_test1)
X_test2, Y_test2 = load_data(sentences_df_test2)


In [None]:
# Persist the feature matrices and label vectors of every split as .npy files.
data_path = os.path.join(ROOT_DIR, 'XY')
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(data_path, exist_ok=True)

for _file_stem, _array in (
    ('X_train', X_train),
    ('X_test1', X_test1),
    ('X_test2', X_test2),
    ('Y_train', Y_train),
    ('Y_test1', Y_test1),
    ('Y_test2', Y_test2),
):
    np.save(os.path.join(data_path, _file_stem + '.npy'), _array)

In [None]:
# Report the (features, labels) shapes of each split: train, test1, test2.
for _features, _labels in ((X_train, Y_train), (X_test1, Y_test1), (X_test2, Y_test2)):
    print (_features.shape, _labels.shape)

### Encode labels
- Convert labels from B-I-O to $0, 1, 2$ for SVM
- Convert labels from B-I-O to $[1 0 0, 0 1 0, 0 0 1]$ for ANN

In [None]:
# encode class values as integers = B-I-O -> 0-1-2
# NOTE(review): `LabelEncoder` and `np_utils` are not imported in any cell
# visible above -- presumably sklearn.preprocessing.LabelEncoder and
# keras.utils.np_utils; confirm and add them to the import cell, otherwise
# this cell raises NameError on a fresh kernel.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y_train)
# NOTE(review): this rebinds Y_train to the integer codes, so re-running the
# cell fits the encoder on already-encoded values instead of the raw labels.
Y_train = encoded_Y
# convert integers to one-hot encoding
Y_train_one_hot = np_utils.to_categorical(encoded_Y) # SVM does not need one-hot encoding