In [None]:
# Gensim
import gensim
from gensim.models import Word2Vec


# to make nbs importable
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell
#from nbs_import import NotebookLoader

import pandas as pd
import numpy as np

# custom
from analize_text import get_sentenceID
from paths import *


# nltk
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag, pos_tag_sents

from multiprocessing import cpu_count

# scikit learn
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score

# keras
from keras.utils import np_utils

from feature_transformer import *

### Loading the dataset (embeddings_POS)

In [None]:
# Folder containing the pre-computed feature matrices (X_*) and label arrays (Y_*).
data_path = os.path.join(ROOT_DIR, 'XY', 'STEM_20')

# Feature matrices for the training split and the two test splits.
X_train, X_test1, X_test2 = (
    np.load(os.path.join(data_path, fname))
    for fname in ('X_train.npy', 'X_test1.npy', 'X_test2.npy')
)

# Matching label arrays, loaded in the same split order.
Y_train, Y_test1, Y_test2 = (
    np.load(os.path.join(data_path, fname))
    for fname in ('Y_train.npy', 'Y_test1.npy', 'Y_test2.npy')
)

# Quick sanity check on the training split's dimensions.
print(X_train.shape, Y_train.shape)

In [None]:
# Load the saved Word2Vec model and keep a handle on its keyed vectors;
# `word_vectors[token]` is what the transformers and output cells below look up.
# NOTE(review): this path is relative to the working directory, unlike the
# ROOT_DIR-based paths used for the XY arrays — confirm it resolves as intended.
w2vmodel = Word2Vec.load('../word_vectors_stem_20')
word_vectors = w2vmodel.wv

### Encode labels
- Convert labels from B-I-O to the integers $0, 1, 2$ for the SVM
- Convert labels from B-I-O to the one-hot vectors $[1\ 0\ 0]$, $[0\ 1\ 0]$, $[0\ 0\ 1]$ for the ANN

In [None]:
# Map the B-I-O tags to integer class ids (B-I-O -> 0-1-2).
# The output-generation cells further down rely on 0 meaning B and 1 meaning I.
encoder = LabelEncoder()
Y_train = encoded_Y = encoder.fit_transform(Y_train)

### Creating feature transformers

In [None]:
class Embedder(FeatureTransformer):
    """Feature transformer that maps each word to its embedding.

    Lookups go through the notebook-level ``word_vectors`` object.
    """

    def transform(self, X, y=None, **fit_params):
        """Return an array holding one ``word_vectors`` lookup per item of ``X``.

        ``y`` and ``fit_params`` are accepted but not used.
        """
        embeddings = []
        for token in X:
            embeddings.append(word_vectors[token])
        return np.array(embeddings)

In [None]:
# Smoke test: embed two tokens and print the shape of the resulting array.
# NOTE(review): `words_train` is not defined in any cell visible in this notebook —
# presumably it comes from a star import or a previous kernel session; confirm,
# otherwise this cell fails on Restart & Run All.
# NOTE(review): 'conduction_NN' looks like a word_POS key while the loaded model
# file is named "word_vectors_stem_20" — verify the key exists in that vocabulary.
emb = Embedder()
e = emb.transform(['conduction_NN', words_train[7]])
print(e.shape)

In [None]:
class Capitalizer(FeatureTransformer):
    """Feature transformer producing a binary "starts with an uppercase letter" flag."""

    def transform(self, X, y=None, **fit_params):
        """Return a column vector with 1 where ``x[0].isupper()`` holds, else 0.

        The result is reshaped to ``(-1, 1)`` so it has one column.
        ``y`` and ``fit_params`` are accepted but not used.
        """
        flags = []
        for token in X:
            if token[0].isupper():
                flags.append(1)
            else:
                flags.append(0)
        return np.array(flags).reshape(-1, 1)

In [None]:
# Smoke test: compute the capitalization flag for two tokens and print the shape
# (Capitalizer reshapes its output to a single column).
# NOTE(review): `words_train` is not defined in any cell visible in this notebook —
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
cap = Capitalizer()
c = cap.transform(['conduction_NN', words_train[7]])
print(c.shape)

In [None]:
# Combine the two custom transformers with a FeatureUnion and run them over the
# training words.
# NOTE(review): Pipeline/FeatureUnion are already imported in the first cell; these
# imports are repeated here and SVC is not used in this cell.
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC, LinearSVC

# Classifier instance; it is not fitted in this cell (the fit call below is commented out).
clf = LinearSVC()

# Feature union of the embedding lookup and the capitalization flag.
pipe = FeatureUnion([
    ('emb', Embedder()),
    ('cap', Capitalizer()),
])

# NOTE(review): `words_train` is not defined in any visible cell — confirm its origin.
# NOTE(review): reshape(-1, 1) hands each transformer rows that are length-1 arrays
# rather than plain strings, whereas the smoke-test cells above pass a flat list of
# strings; `x[0]` in Capitalizer would then be the whole word, not its first
# character — verify that this input shape is intended.
print(words_train.reshape(-1, 1).shape)
data = pipe.transform(words_train.reshape(-1, 1))
print(data.shape)
#clf.fit(data)

# Data preprocessing
Experiment with:

- the original (unscaled) data
- `MinMaxScaler`
- `StandardScaler`

In [None]:
#min_max_scaler = MinMaxScaler()
#X_train = min_max_scaler.fit_transform(X_train)

#standard_scaler = StandardScaler()
#X_train = standard_scaler.fit_transform(X_train)

## split train / validation set

In [None]:
# Train / validation split (SVM): the first 90% of the rows are used for
# training, the remaining rows for validation. The order of the rows is kept.
train_perc = 0.9
train_size = int(len(X_train) * train_perc)

# Training part.
X_tr = X_train[:train_size, :]
Y_tr = Y_train[:train_size]

# Validation part.
X_vl = X_train[train_size:, :]
Y_vl = Y_train[train_size:]

print("X train/validation shapes:", X_tr.shape, X_vl.shape)
print("Y train/validation shapes (svm):", Y_tr.shape, Y_vl.shape)


# Using SVM

In [None]:
from sklearn import svm

# GridSearchCV lives in sklearn.model_selection; the legacy sklearn.grid_search
# module was deprecated and later removed from scikit-learn. The fallback keeps
# the cell working on very old installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Base estimator handed to the grid search below. `kernel` and `C` given here are
# only starting values: the grid search overrides them with the values in
# `param_grid`. Note that SVC handles the multi-class case internally with a
# one-vs-one scheme.
model = svm.SVC(kernel='rbf',
                C=1.0,
                class_weight=None,
                gamma='auto',
                tol=0.001,
                random_state=None)

In [None]:
# Hyper-parameter grid for the SVM: every kernel is tried with every value of C.
param_grid = [
    dict(
        kernel=['linear', 'rbf'],
        C=[0.01, 0.1, 1, 10],
    ),
]

In [None]:
# Exhaustive search over `param_grid` with 2-fold cross-validation on the training
# part; `fit` returns the search object itself, so construction and fitting are chained.
gs_clf = GridSearchCV(model, param_grid, cv=2, verbose=50).fit(X_tr, Y_tr)

#### Compute metrics
- accuracy
- f1-score (micro, macro, weighted)
- precision (micro, macro, weighted)
- recall (micro, macro, weighted)

In [None]:
# Predict with the fitted grid search (it delegates to the best estimator, refit
# on the whole training part). The bare `model` must not be used here: the grid
# search fits clones of it, so `model` itself is never fitted.
predictions = gs_clf.predict(X_vl)

# micro:
# computes metrics globally - considering total number of TP, FP, FN
f1_micro = f1_score(Y_vl, predictions, average='micro')
recall_micro = recall_score(Y_vl, predictions, average='micro')
precision_micro = precision_score(Y_vl, predictions, average='micro')

# macro:
# compute metrics for each label, then take their unweighted average -
# does not take class imbalance into account
f1_macro = f1_score(Y_vl, predictions, average='macro')
recall_macro = recall_score(Y_vl, predictions, average='macro')
precision_macro = precision_score(Y_vl, predictions, average='macro')

# weighted:
# as macro, but each label's score is weighted by its true class frequency
f1_weighted = f1_score(Y_vl, predictions, average='weighted')
recall_weighted = recall_score(Y_vl, predictions, average='weighted')
precision_weighted = precision_score(Y_vl, predictions, average='weighted')

# Plain fraction of correctly classified validation samples.
accuracy = accuracy_score(Y_vl, predictions)

In [None]:
# Validation report: accuracy first, then one group of lines per averaging mode,
# each group preceded by a dashed separator. All values are rounded to 4 decimals.
_metric_groups = [
    [
        ('SVM validation f1-score (micro):', f1_micro),
        ('SVM validation precision (micro):', precision_micro),
        ('SVM validation recall (micro):', recall_micro),
    ],
    [
        ('SVM validation f1-score (macro):', f1_macro),
        ('SVM validation precision (macro):', precision_macro),
        ('SVM validation recall (macro):', recall_macro),
    ],
    [
        ('SVM validation f1-score (weighted):', f1_weighted),
        ('SVM validation precision (weighted):', precision_weighted),
        ('SVM validation recall (weighted):', recall_weighted),
    ],
]

print('SVM validation accuracy:', round(accuracy, 4))
for _group in _metric_groups:
    print('-'*20)
    for _caption, _value in _group:
        print(_caption, round(_value, 4))

# Output
Each predicted entity is written on its own line, in the format:

`IdSentence|startOffset-endOffset|text|null`

In [None]:
# Load the sentences and entities of test set 1 into DataFrames.
# The output cells below read the 'sentenceID' and 'sentenceText' columns of
# `sentences_df_test1`.
# NOTE(review): SENTENCE_PATH_test1 / ENTITY_PATH_test1 are not defined in this
# notebook — presumably they come from `from paths import *`; confirm.
sentences_df_test1 = pd.read_csv(SENTENCE_PATH_test1)
entities_df_test1 = pd.read_csv(ENTITY_PATH_test1)

In [None]:
def token_spans(txt):
    """Tokenize ``txt`` and locate every token inside the original string.

    Parameters
    ----------
    txt : str
        The sentence to tokenize.

    Returns
    -------
    tokens : list of str
        Tokens as produced by ``nltk.word_tokenize``.
    token_offset : list of (str, int, int)
        One ``(token, start, end)`` triple per token, where ``start`` and ``end``
        are character offsets into ``txt`` and ``end`` is inclusive.
    """
    tokens = nltk.word_tokenize(txt)
    token_offset = []
    cursor = 0
    for token in tokens:
        # The tokenizer rewrites a double quote as `` or '', so the token text may
        # not occur literally in txt; accept the raw '"' as an alternative form.
        surface_forms = [token]
        if token in ('``', "''"):
            surface_forms.append('"')

        hits = []
        for form in surface_forms:
            position = txt.find(form, cursor)
            if position != -1:
                hits.append((position, len(form)))

        if hits:
            # Earliest occurrence wins, so a later look-alike is never matched
            # ahead of the quote character that is actually next in the text.
            start, length = min(hits)
        else:
            # Token cannot be located: never fall back to find()'s -1, which would
            # yield a negative span and rewind the cursor for every later token.
            # Best effort: anchor at the next non-space character instead.
            start = cursor
            while start < len(txt) and txt[start].isspace():
                start += 1
            length = min(len(token), len(txt) - start)

        token_offset.append((token, start, start + length - 1))
        cursor = start + length
    return tokens, token_offset

def find_end_entity_index(labels_list, current_word_index):
    """Return the index of the last word of the entity starting at ``current_word_index``.

    The entity is extended over every directly following position whose label
    equals 1 (the "I" tag); it stops at the first other label or at the end of
    ``labels_list``. If nothing follows, ``current_word_index`` itself is returned.
    """
    last_index = current_word_index
    total = len(labels_list)
    # Advance while the next position exists and is labelled I (== 1).
    while last_index + 1 < total and labels_list[last_index + 1] == 1:
        last_index += 1
    return last_index

# Shared Snowball (English) stemmer; the output-generation cell below calls
# `stemmer.stem` on every token before looking up its word vector.
stemmer = EnglishStemmer()

In [None]:
# Build the submission text for test set 1 with the SVM: one line per predicted
# entity, formatted as  sentenceId|start-end|text|null .
output_lines = []
vector_size = 20      # dimensionality of the loaded word vectors
oov_token_count = 0   # tokens without a word vector (mapped to zero vectors)

for index, row in sentences_df_test1.iterrows():
    sentenceId = row['sentenceID']
    sentenceText = row['sentenceText']
    # 1. tokenize sentence (tokens plus their character offsets)
    tok_sentence, token_offset = token_spans(sentenceText)
    if not tok_sentence:
        # Nothing to classify; predicting on an empty matrix would raise.
        continue
    # 2. stem every token. The POS tags were computed but never used for the
    #    stem vocabulary, so the tagging step is skipped altogether.
    tok_sentence_pos = [stemmer.stem(word) for word in tok_sentence]
    # 3. get word vectors. The matrix is pre-allocated, one row per token, so the
    #    rows stay aligned with token_offset; unknown tokens keep a zero vector
    #    instead of aborting the whole run with a KeyError.
    vectors_to_predict = np.zeros((len(tok_sentence_pos), vector_size))
    for position, word in enumerate(tok_sentence_pos):
        if word in word_vectors:
            vectors_to_predict[position] = word_vectors[word]
        else:
            oov_token_count += 1
    # 4. predict with the fitted grid search; the bare `model` is never fitted
    #    because the grid search trains clones of it.
    predictions = gs_clf.predict(vectors_to_predict)
    predicted_labels = predictions
    # 5. generate output: every B label (== 0) opens an entity that extends over
    #    the I labels (== 1) directly following it.
    for i in range(len(predicted_labels)):
        if predicted_labels[i] == 0:
            end_entity_index = find_end_entity_index(predicted_labels, i)
            start = token_offset[i][1]
            end = token_offset[end_entity_index][2]
            output_lines.append(
                '{}|{}-{}|{}|null\n'.format(sentenceId, start, end, sentenceText[start:end + 1])
            )

# Join once at the end instead of growing a string inside the loop.
output_string = ''.join(output_lines)

if oov_token_count:
    print('Warning: %d out-of-vocabulary tokens were mapped to zero vectors' % oov_token_count)

In [None]:
# Write the SVM predictions to the task 9.1 results file.
output_file_task_1 = '../results/task9.1_GROUP_1_SVM.txt'
# Explicit UTF-8: the entity text is copied from the input sentences and may
# contain non-ASCII characters, which the platform default encoding may not
# be able to represent.
with open(output_file_task_1, "w", encoding="utf-8") as out_file:
    out_file.write(output_string)

In [None]:
# Build the submission text for test set 1 using word_POS tokens: one line per
# predicted entity, formatted as  sentenceId|start-end|text|null .
# NOTE(review): this cell looks carried over from a notebook whose model returns
# per-class scores. The vectors loaded above come from a file named
# "word_vectors_stem_20", so word_POS keys may all be out of vocabulary here —
# confirm which embedding/model pair this cell is meant to use.
output_lines = []
vector_size = 20      # dimensionality of the loaded word vectors
oov_token_count = 0   # tokens without a word vector (mapped to zero vectors)

for index, row in sentences_df_test1.iterrows():
    sentenceId = row['sentenceID']
    sentenceText = row['sentenceText']
    # 1. tokenize sentence (tokens plus their character offsets)
    tok_sentence, token_offset = token_spans(sentenceText)
    if not tok_sentence:
        # Nothing to classify; predicting on an empty matrix would raise.
        continue
    # 2. add part of speech: tokens become "<word>_<POS tag>"
    tok_sentence_pos = [word + '_' + pos for word, pos in pos_tag(tok_sentence, tagset=None)]
    # 3. get word vectors. The matrix is pre-allocated, one row per token, so the
    #    rows stay aligned with token_offset; unknown tokens keep a zero vector
    #    instead of aborting the whole run with a KeyError.
    vectors_to_predict = np.zeros((len(tok_sentence_pos), vector_size))
    for position, word in enumerate(tok_sentence_pos):
        if word in word_vectors:
            vectors_to_predict[position] = word_vectors[word]
        else:
            oov_token_count += 1
    # 4. predict with the fitted grid search; the bare `model` is never fitted
    #    because the grid search trains clones of it.
    predictions = gs_clf.predict(vectors_to_predict)
    # An SVM returns a 1-D array of class labels, on which argmax(axis=1) raises;
    # only reduce with argmax when the model returned one score per class.
    if np.ndim(predictions) > 1:
        predicted_labels = np.argmax(predictions, axis=1)
    else:
        predicted_labels = predictions
    # 5. generate output: every B label (== 0) opens an entity that extends over
    #    the I labels (== 1) directly following it.
    for i in range(len(predicted_labels)):
        if predicted_labels[i] == 0:
            end_entity_index = find_end_entity_index(predicted_labels, i)
            start = token_offset[i][1]
            end = token_offset[end_entity_index][2]
            output_lines.append(
                '{}|{}-{}|{}|null\n'.format(sentenceId, start, end, sentenceText[start:end + 1])
            )

# Join once at the end instead of growing a string inside the loop.
output_string = ''.join(output_lines)

if oov_token_count:
    print('Warning: %d out-of-vocabulary tokens were mapped to zero vectors' % oov_token_count)

In [None]:
# Write the predictions built in the previous cell to the task 9.1 results file.
output_file_task_1 = '../results/task9.1_GROUP_1.txt'
# Explicit UTF-8: the entity text is copied from the input sentences and may
# contain non-ASCII characters, which the platform default encoding may not
# be able to represent.
with open(output_file_task_1, "w", encoding="utf-8") as out_file:
    out_file.write(output_string)