In [None]:
# Gensim
import gensim
from gensim.models import Word2Vec

import os

import pandas as pd
import numpy as np

# custom
from analize_text import get_sentenceID
from paths import *

# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from multiprocessing import cpu_count

# scikit learn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder

# keras
from keras.utils import np_utils

from feature_transformer import *

In [None]:
# Load the sentence table and the entity table of every split (train, test1, test2).
sentences_df_train, entities_df_train = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_train, ENTITY_PATH_train)
)

sentences_df_test1, entities_df_test1 = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_test1, ENTITY_PATH_test1)
)

sentences_df_test2, entities_df_test2 = (
    pd.read_csv(csv_path) for csv_path in (SENTENCE_PATH_test2, ENTITY_PATH_test2)
)

In [None]:
# Pool the training and both test splits: word2vec is trained on all of them.
# Exact duplicate rows are dropped and the index is renumbered from 0.
sentences_df = (
    pd.concat([sentences_df_train, sentences_df_test1, sentences_df_test2])
    .drop_duplicates()
    .reset_index(drop=True)
)

entities_df = (
    pd.concat([entities_df_train, entities_df_test1, entities_df_test2])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the training entities table.
print('Entities dataframe')
entities_df_train.head(n=5)

In [None]:
# Print the pandas summary of the training and test2 sentence tables.
print('Sentences dataframe')
for sentences_frame in (sentences_df_train, sentences_df_test2):
    sentences_frame.info()

### Load label dictionary {sentenceID: [ 'B', 'I', ..., 'O'] }

In [None]:
# Load the label dictionary {sentenceID: ['B', 'I', ..., 'O']} saved with np.save.
# The file holds a Python dict wrapped in a 0-d object array (hence `.item()`), so it must
# be unpickled: NumPy >= 1.16.3 defaults to allow_pickle=False and would raise ValueError.
# NOTE: unpickling can execute arbitrary code -- only load label files you produced yourself.
label_dict_path = os.path.join(ROOT_DIR, 'Train', 'bio_labels')
label_dict = np.load(label_dict_path + '.npy', allow_pickle=True).item()

# IDs of the labelled sentences (a live view on the dictionary keys).
sentenceIDs = label_dict.keys()

In [None]:
# Text of every labelled sentence (the sentences that have an entry in label_dict).
# Index the text by sentenceID once -- keeping the first row per ID, like a `.values[0]`
# lookup would -- instead of re-filtering the whole frame for each ID.
text_by_id = (
    sentences_df
    .drop_duplicates(subset='sentenceID')
    .set_index('sentenceID')['sentenceText']
)
# A sentenceID missing from sentences_df raises KeyError here.
sentences = [text_by_id.loc[sentenceID] for sentenceID in sentenceIDs]

# Remove repeated texts while keeping first-seen order. A plain set() would reorder the
# corpus differently on every run (string hashing is randomised per process), which makes
# the word2vec training and every downstream preview non-reproducible.
sentences = pd.Series(sentences, dtype=object).drop_duplicates().tolist()

In [None]:
# Sanity check over the *whole* sentence table (not only the labelled sentences):
# print the position and value of every sentenceText entry that is not a string.
dd = sentences_df['sentenceText']
for i, sentence in enumerate(dd.values):
    if isinstance(sentence, str):
        continue
    print(i, True, sentence)

In [None]:
# Sanity check on the labelled sentences: prints True once per non-string entry,
# so a clean corpus produces no output at all.
for s in sentences:
    if isinstance(s, str):
        continue
    print(True)

### Tokenize sentences

In [None]:
# Tokenize every labelled sentence with NLTK, then preview the first result.
tokenized_sentences = list(map(word_tokenize, sentences))
tokenized_sentences[0]

In [None]:
# Example: POS tags computed on the original tokens versus on their stemmed forms.
# The two taggings can differ slightly.
# TODO: try which one performs better
stemmer = EnglishStemmer()
s = [
    'interaction', 'between', 'cimetidine', 'and',
    'warfarin', 'could', 'be', 'dangerous',
]
stemmed_s = list(map(stemmer.stem, s))

print ('stemmed version:\n')
print (stemmed_s)

print ('\noriginal pos tags:\n')
print(pos_tag(s))

print ('\nstemmed pos tags:\n')
print(pos_tag(stemmed_s))

### POS tag

In [None]:
# POS-tag every tokenized sentence (tagset may be None, 'universal', 'wsj' or 'brown')
# and glue the part of speech onto each word, e.g. cat_NN.
tokenized_sentences_pos = [
    ['_'.join((token, tag)) for token, tag in tagged_sentence]
    for tagged_sentence in pos_tag_sents(tokenized_sentences, tagset=None)
]
tokenized_sentences_pos[0]

### Word2Vec

In [None]:
# Dimensionality of the word embeddings (also used later to shape the feature matrices).
vector_size = 20

# Passing the corpus to the constructor builds the vocabulary and runs the initial training.
model = Word2Vec(tokenized_sentences_pos, size=vector_size, window=5, min_count=1, workers=cpu_count(), compute_loss=True)

# Continue training on the SAME POS-annotated token lists the vocabulary was built from.
# Feeding the raw `sentences` strings would make gensim iterate over single characters,
# none of which are vocabulary entries, so those epochs would learn nothing.
# compute_loss must be repeated here: train() resets loss tracking to its own argument.
model.train(tokenized_sentences_pos,
            total_examples=len(tokenized_sentences_pos),
            epochs=10,
            compute_loss=True)
print ('latest loss:', model.get_latest_training_loss())

In [None]:
# Keep only the trained vectors (model.wv) in memory, persist the full model to disk
# and release the model object.
# The model can be restored later with Word2Vec.load('../word_vectors').
word_vectors = model.wv
model.save("../word_vectors")
del model

In [None]:
# Smoke test: show the embedding of one POS-annotated token.
sample_token = "conduction_NN"
print(word_vectors[sample_token])

### Creating the dataset

In [None]:
def load_data(df):
    """Build per-token embeddings, labels and tokens for the labelled sentences in ``df``.

    Iterates over the global ``label_dict`` ({sentenceID: [label, ...]}). Sentence IDs that
    do not occur in ``df`` are skipped. Every remaining sentence is tokenized, POS-tagged,
    and each ``word_POS`` token is looked up in the global ``word_vectors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``sentenceID`` and ``sentenceText``. When an ID matches
        several rows, the first one is used.

    Returns
    -------
    X : numpy.ndarray, shape (n_tokens, vector_size), float64
        One embedding per token.
    Y : numpy.ndarray, shape (n_tokens,)
        Label of each token.
    words : numpy.ndarray, shape (n_tokens,)
        The ``word_POS`` tokens, aligned with ``X`` and ``Y``.

    Notes
    -----
    * Tokens and labels are paired with ``zip``: if their counts differ for a sentence,
      the longer side is silently truncated.
    * A token missing from ``word_vectors`` makes the lookup raise (presumably KeyError);
      the error is not swallowed.
    """
    # Accumulate in Python lists and stack once at the end: growing the arrays with
    # np.vstack / np.append inside the loop copies everything on every token (quadratic).
    vectors, token_labels, tokens = [], [], []

    for sentenceID, labels in label_dict.items():
        # Filter the frame once per sentence and reuse the result.
        matching_text = df.loc[df.sentenceID == sentenceID, 'sentenceText']
        if matching_text.empty:
            continue
        sentence = matching_text.values[0]

        tok_sentence = word_tokenize(sentence)
        tok_sentence_pos = [word + '_' + pos for word, pos in pos_tag(tok_sentence, tagset=None)]

        for word, label in zip(tok_sentence_pos, labels):
            vectors.append(word_vectors[word])
            token_labels.append(label)
            tokens.append(word)

    # The empty (0, vector_size) seed keeps the float64 dtype and the 2-D shape even when
    # no sentence matched, and makes vstack reject vectors of the wrong length.
    X = np.vstack([np.empty((0, vector_size))] + vectors)
    Y = np.asarray(token_labels)
    words = np.asarray(tokens)
    return X, Y, words

In [None]:
# Empty placeholders for each split: a (0, vector_size) feature matrix plus empty
# label and token arrays. They are replaced once load_data is called for the split.
X_train, Y_train, words_train = np.empty((0, vector_size)), np.array([]), np.array([])

X_test1, Y_test1, words_test1 = np.empty((0, vector_size)), np.array([]), np.array([])

X_test2, Y_test2, words_test2 = np.empty((0, vector_size)), np.array([]), np.array([])

In [None]:
# Build embeddings, labels and tokens for the train, test1 and test2 splits (in that order).
(
    (X_train, Y_train, words_train),
    (X_test1, Y_test1, words_test1),
    (X_test2, Y_test2, words_test2),
) = [
    load_data(split_df)
    for split_df in (sentences_df_train, sentences_df_test1, sentences_df_test2)
]

In [None]:
# Report (features shape, labels shape) for train, test1 and test2.
for split_X, split_Y in ((X_train, Y_train), (X_test1, Y_test1), (X_test2, Y_test2)):
    print (split_X.shape, split_Y.shape)


### One-hot encoding of labels
Convert the B-I-O labels to one-hot vectors: B → $[1\ 0\ 0]$, I → $[0\ 1\ 0]$, O → $[0\ 0\ 1]$.

In [None]:
# Encode the string tags as integers (B-I-O -> 0-1-2); the integer labels replace Y_train
# and are what the SVM is trained on.
encoder = LabelEncoder()
Y_train = encoded_Y = encoder.fit_transform(Y_train)
# One-hot version of the same labels, for the neural network only.
Y_train_one_hot = np_utils.to_categorical(encoded_Y)

### Creating feature transformers

In [None]:
class Embedder(FeatureTransformer):
    """Feature transformer that maps every token to its word embedding."""

    def transform(self, X, y=None, **fit_params):
        """Return one embedding row per token in ``X``.

        ``X`` may be a flat sequence of token strings or an (n, 1) column of them.
        It is flattened first so that every lookup receives a single token: iterating
        a 2-D column would hand a length-1 array to ``word_vectors`` instead of a
        string, which presumably yields a 2-D result per row and a 3-D output overall.
        """
        tokens = np.ravel(X)
        return np.array([word_vectors[token] for token in tokens])

In [None]:
# Quick check of the embedder on two tokens: prints the shape of the result.
emb = Embedder()
e = emb.transform(X=['conduction_NN', words_train[7]])
print(e.shape)

In [None]:
class Capitalizer(FeatureTransformer):
    """Binary feature: 1 when a token starts with an upper-case character, else 0."""

    def transform(self, X, y=None, **fit_params):
        """Return an (n, 1) integer column with the capitalisation flag of each token.

        ``X`` may be a flat sequence of token strings or an (n, 1) column of them.
        It is flattened first: on a 2-D column, ``x[0]`` would be the whole token rather
        than its first character, so ``isupper()`` would test the entire word.
        ``token[:1]`` (instead of ``token[0]``) keeps an empty token from raising.
        """
        tokens = np.ravel(X)
        flags = [1 if token[:1].isupper() else 0 for token in tokens]
        return np.array(flags).reshape(-1, 1)

In [None]:
# Quick check of the capitalisation feature on two tokens: prints the shape of the result.
cap = Capitalizer()
c = cap.transform(X=['conduction_NN', words_train[7]])
print(c.shape)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC, LinearSVC

clf = LinearSVC()

# Concatenate the embedding features and the capitalisation flag column-wise.
pipe = FeatureUnion([
    ('emb', Embedder()),
    ('cap', Capitalizer()),
])

print(words_train.shape)
# Pass the flat (1-D) token array. Both transformers iterate over X and treat each item
# as a token string; an (n, 1) column would hand them length-1 arrays instead, which
# breaks the embedding lookup shape and the first-character capitalisation test.
data = pipe.transform(words_train)
print(data.shape)
# TODO: fit `clf` on `data` together with the matching labels.

## split train / validation set

In [None]:
# Train/validation split for the SVM: the first 80% of the rows are used for training,
# the remaining rows for validation (sequential split, no shuffling).
train_perc = 0.8
train_size = int(train_perc * len(X_train))

X_tr, X_vl = np.split(X_train, [train_size])
Y_tr, Y_vl = np.split(Y_train, [train_size])

print (X_tr.shape, X_vl.shape)
print (Y_tr.shape, Y_vl.shape)

In [None]:
# Train/validation split of the one-hot labels (for the neural network),
# cut at the same row as the feature split.
Y_tr_nn, Y_vl_nn = np.split(Y_train_one_hot, [train_size])

# Using SVM

In [None]:
from sklearn import svm

# RBF-kernel support vector classifier; every hyper-parameter is spelled out explicitly.
model = svm.SVC(
    kernel='rbf',
    C=1.0,
    gamma='auto',
    tol=0.001,
    class_weight=None,
    random_state=None,
)

In [None]:
# Fit the SVM on the training part of the split (integer-encoded labels).
model.fit(X=X_tr, y=Y_tr)

In [None]:
# Score the SVM on the validation part of the split, rounded to four decimals.
svm_validation_accuracy = round(model.score(X_vl, Y_vl), 4)
print ('SVM validation accuracy:', svm_validation_accuracy)

# Using ANN

In [None]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from sklearn import preprocessing
from keras.optimizers import *
from keras.initializers import *

import matplotlib.pyplot as plt

In [None]:
# Feed-forward classifier: embedding -> 64 relu -> 32 relu -> softmax over the tags.
num_inputs = X_train.shape[1]  # size of a vector
num_outputs = 3  # b-i-o tags

model = Sequential([
    Dense(units=64, input_shape=(num_inputs,), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=num_outputs, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Training hyper-parameters for the neural network.
epochs = 10
batch_size = 10

# Train on the one-hot labels and evaluate on the validation split after every epoch.
# `epochs` is passed through (it used to be hard-coded in the call, so editing the
# variable above had no effect).
history = model.fit(X_tr, Y_tr_nn,
                    epochs=epochs,
                    shuffle=True, verbose=1,
                    batch_size=batch_size,
                    validation_data=(X_vl, Y_vl_nn))

In [None]:
# Learning curves: loss per epoch for the training and validation splits.
plt.plot(history.history['loss'], label='tr loss', linestyle='--', marker='o')
plt.plot(history.history['val_loss'], label='vl loss', linestyle='-', marker='o')
plt.xlabel('Epoch')
plt.ylabel('Categorical cross-entropy')
plt.legend(loc='best')

plt.show()

# Keras < 2.3 logs the metric as 'acc' / 'val_acc'; newer releases use the name passed to
# compile(), i.e. 'accuracy' / 'val_accuracy'. Support both instead of raising KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

plt.plot(history.history[acc_key], label='tr accuracy', linestyle='--', marker='o')
plt.plot(history.history['val_' + acc_key], label='vl accuracy', linestyle='-', marker='o')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='best')

plt.show()

# Test