In [None]:
import sys

sys.path.append('../libs/')

import os

import keras

from tensorflow.python.client import device_lib

import numpy as np
np.random.seed(31)

def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'.

    The device list comes from tensorflow's device_lib.list_local_devices().
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Show the detected GPUs as the cell output.
get_available_gpus()

### Data loading

In [None]:
import json 

# Location of the annotated corpus (JSON), relative to the notebook's working directory.
input_data_path = '../data/annotated_corpus_fixed+syntaxnet.json'

# Load the whole corpus into memory.
with open(input_data_path, 'r') as f:
    data = json.load(f)

# Sanity check: size of the loaded corpus.
print 'Number of examples: ', len(data)

In [None]:
from framebank_preprocessing.convert_corpus_to_brat import make_text, create_verb_example_index

# Index of the corpus by verb; iterated below as (verb, examples) pairs.
verb_index = create_verb_example_index(data)

print 'Original number of verbs: ', len(verb_index)

# (verb, number of examples) pairs, the verb with the most examples first.
stat = sorted([(verb, len(examples)) for verb, examples in verb_index.iteritems()], 
              key = lambda x: x[1], reverse=True)

# Keep only the verbs that have at least 10 examples.
verbs_to_keep = [verb for verb, count in stat if count >= 10]
print 'Number of selected verbs: ', len(verbs_to_keep)

In [None]:
examples = list()

for verb in verbs_to_keep:
    indexes = verb_index[verb]
    
    for ind in indexes:
        examples.append((ind, data[ind]))

print 'Number of framebank text examples for selected verbs:', len(examples)

In [None]:
# Sanity check: pass the first selected example through make_text and print
# the first element of the pair it returns.
text, _ = make_text(examples[0][1], 0)

print text

In [None]:
examples[0][1][0][0]

### Feature construction

In [None]:
import re

# Marker appended by get_words_around for a context slot past the end of the sentence.
MAX_VALUE = 50000

def get_words_around(address, example, n):
    """Pick a window of token positions around a word.

    address: (sentence index, word index) of the centre word.
    example: list of sentences, each a list of token dicts.
    n: number of slots to fill on each side.

    The left scan starts at the centre word itself and walks backwards, so the
    centre word (when it has a 'feat' key) takes one of the n left slots. The
    right scan starts at the next word. Tokens without a 'feat' key are skipped.
    A slot that falls before the sentence start is filled with -1, a slot past
    the sentence end with MAX_VALUE.

    Returns the 2 * n collected positions in ascending order.
    """
    picked = list()

    # Left side: offsets 0, 1, 2, ... backwards from the centre word.
    offset = 0
    while len(picked) < n:
        position = address[1] - offset
        offset += 1
        if position < 0:
            picked.append(-1)
        elif 'feat' in example[address[0]][position]:
            picked.append(position)

    # Right side: offsets 1, 2, ... forwards from the centre word.
    offset = 1
    while len(picked) < 2 * n:
        position = address[1] + offset
        offset += 1
        if position >= len(example[address[0]]):
            picked.append(MAX_VALUE)
        elif 'feat' in example[address[0]][position]:
            picked.append(position)

    picked.sort()
    return picked


def get_pos(feat_vec):
    """Return the part-of-speech tag stored in the first element of feat_vec.

    feat_vec: sequence of tags; only element 0 is read (stripped of surrounding
        whitespace). An empty or None value yields an empty string.

    The hyphenated pronoun tags are mapped to their hyphen-less spelling;
    every other tag is returned unchanged.
    """
    if not feat_vec:
        tag = str()
    else:
        tag = feat_vec[0].strip()

    hyphenated_to_plain = {
        u'S-PRO': u'SPRO',
        u'A-PRO': u'APRO',
        u'ADV-PRO': u'ADVPRO',
        u'PRAEDIC-PRO': 'PRAEDICPRO',
    }

    return hyphenated_to_plain.get(tag, tag)

    
# When True, lemmas are emitted as "<lemma>_<tag>"; otherwise as the utf8-encoded lemma.
RUSVECTORES_EMBEDDINGS = True


def get_seq_lemmas(seq, ex_sent, word_num, special_tag):
    """Convert a window of token positions into embedding lookup keys.

    seq: positions as produced by get_words_around; negative values and
        MAX_VALUE are padding and yield an empty string.
    ex_sent: the sentence (list of token dicts) the positions refer to.
    word_num: position of the centre word; its tag is replaced by special_tag.
    special_tag: tag used instead of the part of speech for the centre word.

    Returns one string per entry of seq. With RUSVECTORES_EMBEDDINGS the string
    is "<lemma>_<tag>", otherwise the utf8-encoded lemma. The token's 'form' is
    used when it has no 'lemma'.
    """
    result = list()

    for e in seq:
        if e < 0 or e == MAX_VALUE:
            # Padding slot outside the sentence.
            result.append(u'')
            continue

        token = ex_sent[e]
        lemma = token['lemma'] if 'lemma' in token else token['form']

        if e == word_num:
            morph = special_tag
        elif 'feat' in token:
            # 'feat' is a raw tag string. get_pos reads element 0 of its argument,
            # so the string has to be split first; passing it directly would
            # return only its first character.
            morph = get_pos(split_feats(token['feat']))
        else:
            morph = str()

        if RUSVECTORES_EMBEDDINGS:
            result.append(lemma + '_' + morph)
        else:
            result.append(lemma.encode('utf8'))

    return result


def split_feats(feat):
    """Split a raw morphological tag string into its individual tags.

    The string is cut at every '=', ',', ' ' or '|'. Adjacent separators
    produce empty strings in the result, and an empty input yields [''].
    """
    # Raw literal: '\|' in a normal string literal is an invalid escape sequence.
    return re.split(r'=|,| |\|', feat)


# Tags passed to get_seq_lemmas as `special_tag` to mark the position of the
# argument / predicate itself inside its context window.
ARG_SPECIAL_TAG = 'ARGSPECIAL'
PRED_SPECIAL_TAG = 'PREDSPECIAL'


def parse_morph_features(feat_str):
    """Turn a raw morphological tag string into a dict of named features.

    The string is split with split_feats and each feature is derived from the
    presence of whole tags in that list:

    * two-valued features ('anim', 'vform', 'shform', 'pform') are floats:
      1. when the first tag is present, -1. when the second is, 0. otherwise;
    * features with more than two values ('case', 'zform', 'vvform', 'nform',
      'time') are the first listed tag that is present, or u'ABSENT';
    * 'pos' is get_pos applied to the split tags.
    """
    tags = split_feats(feat_str)

    def binary(positive, negative):
        if positive in tags:
            return 1.
        if negative in tags:
            return -1.
        return 0.

    def categorical(options):
        for option in options:
            if option in tags:
                return option
        return u'ABSENT'

    def encode(options):
        # More than two alternatives -> categorical value, otherwise a signed flag.
        if len(options) > 2:
            return categorical(options)
        return binary(options[0], options[1])

    return {
        'pos' : get_pos(tags),
        'case' : encode(['nom', 'gen', 'dat', 'dat2', 'acc', 'ins',
                         'loc', 'gen2', 'acc2', 'loc2', 'voc', 'adnum']),
        'anim' : encode([u'anim', u'inan']),
        'vform' : encode([u'pf', u'ipf']),
        'zform' : encode([u'act', u'pass', u'med']),
        'shform' : encode([u'brev', u'plen']),
        'pform' : encode([u'intr', u'tran']),
        'vvform' : encode([u'inf', u'partcp', u'ger']),
        'nform' : encode([u'indic', u'imper', u'imper2']),
        'time' : encode([u'praet', u'praes', u'fut'])
    }


def extract_preposition(arg, example):
    """Return the lemma of the first adposition attached to the argument word.

    arg: (sentence index, word index) address of the argument.
    example: list of sentences, each a list of token dicts.

    Scans the argument's sentence in order and returns the 'lemma' of the first
    token whose 'parent' equals the argument's word index and whose 'postag_p'
    is u'ADP'. Tokens lacking 'parent', 'postag_p' or 'lemma' are skipped.
    Returns u'ABSENT' when no such token exists.
    """
    sent = example[arg[0]]
    arg_index = arg[1]

    for word in sent:
        if 'parent' not in word or word['parent'] != arg_index:
            continue

        # .get: a dependent without a 'postag_p' entry is simply not a preposition.
        if word.get('postag_p') != u'ADP':
            continue

        if 'lemma' not in word:
            continue

        return word['lemma']

    return u'ABSENT'


def process_arg_pred(ex_id, pred, args, example):
    """Build one feature dictionary per argument of a predicate.

    ex_id: identifier stored unchanged under 'ex_id'.
    pred: (sentence index, word index) address of the predicate.
    args: list of (sentence index, word index) addresses of the arguments.
    example: list of sentences, each a list of token dicts.

    Returns a list with one dict per argument. Each dict holds the lemmas,
    context-window lemmas, role label ('rolepred1' of the argument token),
    relative position, distance, preposition, link name, the argument address,
    and the morphological features of both words prefixed with 'arg_' and
    'pred_'. Returns an empty list when `args` is empty.
    """
    feature_sets = list()
    if not args:
        # Nothing to pair with the predicate, so the predicate is never touched.
        return feature_sets

    def tag_features(prefix, feat_str):
        # Prefix every morphological feature name with '<prefix>_'.
        return {prefix + u'_' + k : v
                for k, v in parse_morph_features(feat_str).items()}

    def lemma_feature(word, pos):
        # Lemma in the form used for embedding lookup; empty when the token has no lemma.
        if 'lemma' not in word:
            return str()
        if RUSVECTORES_EMBEDDINGS:
            return word['lemma'] + u'_' + pos
        return word['lemma'].encode('utf8')

    # Predicate-side values do not depend on the argument: compute them once.
    pred_sent = example[pred[0]]
    pred_word = pred_sent[pred[1]]
    pred_context_lemmas = get_seq_lemmas(get_words_around(pred, example, 5),
                                         pred_sent,
                                         pred[1],
                                         PRED_SPECIAL_TAG)
    pred_feat = pred_word['feat'] if 'feat' in pred_word else str()
    pred_lemma = lemma_feature(pred_word, get_pos(split_feats(pred_feat)))
    pred_feat_res = tag_features('pred', pred_feat)

    for arg in args:
        arg_sent = example[arg[0]]
        arg_word = arg_sent[arg[1]]

        arg_context_lemmas = get_seq_lemmas(get_words_around(arg, example, 1),
                                            arg_sent,
                                            arg[1],
                                            ARG_SPECIAL_TAG)

        arg_feat = arg_word['feat'] if 'feat' in arg_word else str()
        arg_lemma = lemma_feature(arg_word, get_pos(split_feats(arg_feat)))

        # Word distance inside one sentence; a fixed 10. across sentences.
        dist = 1. * abs(arg[1] - pred[1]) if pred[0] == arg[0] else 10.

        features = {'pred_lemma' : pred_lemma,
                    'arg_lemma' : arg_lemma,
                    'arg_context_lemmas' : arg_context_lemmas,
                    # Copy so that rows do not share one mutable list.
                    'pred_context_lemmas' : list(pred_context_lemmas),
                    'role' : arg_word['rolepred1'],
                    # Address tuples compare by (sentence, word).
                    'rel_pos' : 1. if arg < pred else -1.,
                    'dist' : dist,
                    'arg_prep' : extract_preposition(arg, example),
                    'link_name' : arg_word['link_name'] if 'link_name' in arg_word else str(),
                    'ex_id' : ex_id,
                    'arg_address' : arg
                   }
        features.update(tag_features('arg', arg_feat))
        features.update(pred_feat_res)

        feature_sets.append(features)

    return feature_sets


In [None]:
def process_example(ex_id, sentences):
    """Locate the predicate and its arguments in an example and extract features.

    A token whose 'rank' equals u'Предикат' is the predicate (the last such
    token wins); any other token that has a 'rolepred1' key is an argument.
    The addresses are (sentence index, word index) tuples and are handed to
    process_arg_pred, whose result is returned.
    """
    pred = None
    args = []

    for sent_num, sent in enumerate(sentences):
        for word_num, word in enumerate(sent):
            address = (sent_num, word_num)
            is_predicate = 'rank' in word and word['rank'] == u'Предикат'
            if is_predicate:
                pred = address
            elif 'rolepred1' in word:
                args.append(address)

    return process_arg_pred(ex_id, pred, args, sentences)

In [None]:
import pandas as pd

def extract_features_and_make_dataframe(examples):
    feature_sets = list()
    for ex_id, ex in examples:
        feature_sets += process_example(ex_id, ex)

    print 'Number of dataset objects:', len(feature_sets)
    
    pd_data = pd.DataFrame(feature_sets)
    pd_data = pd_data.sample(frac=1) # Shuffeling data for neural network
    
    return pd_data

pd_data = extract_features_and_make_dataframe(examples)
pd_data[:10]

In [None]:
del data

### Preprocessing

#### Label preprocessing 

In [None]:
# Number of rows per role label.
y_stat = pd_data.loc[:, 'role'].value_counts()
# Roles with fewer than 180 rows are treated as too rare.
drop_ys = y_stat[y_stat < 180].index
# Remove every row labelled with one of the rare roles.
clear_data = pd_data.drop(pd_data[pd_data.loc[:, 'role'].isin(drop_ys)].index)

In [None]:
# Role names to merge: each key is replaced by its value in the 'role' column
# (see the loop over repl_roles below).
repl_roles = {
    u'агенс - субъект восприятия' : u'субъект восприятия',
    u'агенс - субъект ментального состояния' : u'субъект ментального состояния',
    u'результат / цель' : u'результат',
    u'место - пациенс' : u'место',
    u'говорящий - субъект психологического состояния' : u'субъект психологического состояния'
}


def normalize_single_region(data, rep, val):
    """Replace `rep` with `val` inside the 'role' column of `data`, in place.

    The replacement is done with the pandas string method str.replace, so it
    applies to every occurrence of `rep` within a role string. Returns None.
    """
    roles = data.loc[:, 'role']
    data.loc[:, 'role'] = roles.str.replace(rep, val)


# Merge the role variants listed in repl_roles into their canonical names.
# NOTE(review): this runs after the rare-role filter, so the "< 180" threshold
# was applied to the counts before merging - confirm that this is intended.
for rep, val in repl_roles.iteritems():
    normalize_single_region(clear_data, rep, val)
    
# Number of distinct roles left; used as the size of the models' output layer.
number_of_roles = len(clear_data.loc[:, 'role'].value_counts().index)
print 'Number of roles: ', number_of_roles
clear_data.loc[:, 'role'].value_counts()

In [None]:
# Labels (role names) and the remaining feature columns of the filtered data.
y_orig = clear_data.loc[:, 'role']
X_orig = clear_data.drop('role', axis = 1)

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the role names with LabelBinarizer; the fitted encoder is kept in
# label_encoder. Later cells take argmax over axis 1 of y to get class indices.
label_encoder = LabelBinarizer()
y = label_encoder.fit_transform(y_orig)

In [None]:
X_orig.shape

#### Embedding

In [None]:
from gensim.models.word2vec import Word2Vec

# Pre-trained word vectors in binary word2vec format, relative to the notebook directory.
embeddings_path = '../data/ruscorpora_mean_hs.model.bin'
# NOTE(review): Word2Vec.load_word2vec_format looks like the older gensim entry
# point for this format - confirm it exists in the installed gensim version.
embeddings = Word2Vec.load_word2vec_format(embeddings_path, binary=True)

In [None]:
import multiprocessing as mp
import numpy as np


def get_embeddings_length(embeddings):
    """Return the dimensionality of the embedding vectors.

    The size is read from the vector of a fixed probe word, which must be
    present in `embeddings`: u'стоять_V' when RUSVECTORES_EMBEDDINGS is set,
    otherwise the utf8-encoded u'бежать'.
    """
    probe = u'стоять_V' if RUSVECTORES_EMBEDDINGS else u'бежать'.encode('utf8')
    return embeddings[probe].shape[0]


def make_embeded_form(word):
    """Format a two-element sequence as u"<word[1]>_<word[0]>".

    A falsy `word` (empty sequence, empty string, None) is returned unchanged.
    """
    if not word:
        return word
    return u"{}_{}".format(word[1], word[0])


class Embedder_map(object):
    """Callable that embeds one row of context lemmas; used with Pool.map.

    X is indexed by label (the labels come from X.index in embed); each
    element is a sequence of lookup keys as produced by get_seq_lemmas.
    Reads the notebook-level `embeddings` object.
    """

    def __init__(self, X):
        self.X_ = X
        self.embeddings_length_ = get_embeddings_length(embeddings)

    def __call__(self, i):
        """Return a (context length, embedding length) array for the row labelled i.

        A slot is all ones when its key carries the argument or predicate
        special tag, the embedding vector when the key is known, and zeros
        otherwise (padding or out-of-vocabulary).
        """
        # Use the row itself for the length: label 0 is not guaranteed to exist
        # once the data has been shuffled and filtered.
        context = self.X_[i]
        result = np.zeros((len(context), self.embeddings_length_))

        # get_seq_lemmas marks the centre word as "<lemma>_<special tag>".
        # Both tags are checked (the argument tag was previously tested twice).
        special_suffixes = ('_' + ARG_SPECIAL_TAG, '_' + PRED_SPECIAL_TAG)

        for j, word in enumerate(context):
            if not word:
                continue
            if word.endswith(special_suffixes):
                result[j, :] = np.ones(self.embeddings_length_)
            elif word in embeddings:
                result[j, :] = embeddings[word]

        return result


def embed(X):
    """Embed every row of context lemmas in X using 4 worker processes.

    X: pandas object whose index labels are passed to Embedder_map one by one
        (in chunks of 1000 per task).

    Returns the per-row results stacked with np.asarray, in the order of X.index.
    """
    pool = mp.Pool(4)
    try:
        result = pool.map(Embedder_map(X), X.index, 1000)
    finally:
        # Release the workers even when a task raises.
        pool.close()
        pool.join()

    return np.asarray(result)


In [None]:
%%time

arg_context_embedded = embed(X_orig.loc[:, 'arg_context_lemmas'])

In [None]:
%%time

pred_context_embedded = embed(X_orig.loc[:, 'pred_context_lemmas'])

In [None]:
class Embedder_single_map(object):
    """Callable that embeds a single lookup key per row; used with Pool.map.

    X is indexed by label. Reads the notebook-level `embeddings` object.
    """

    def __init__(self, X):
        self.X_ = X
        self.embeddings_length_ = get_embeddings_length(embeddings)

    def __call__(self, i):
        """Return the embedding of the key stored under label i, or zeros if unknown."""
        key = self.X_[i]
        if key not in embeddings:
            return np.zeros((self.embeddings_length_,))
        return embeddings[key]

        
def embed_single(X):
    """Embed one lookup key per row of X using 4 worker processes.

    X: pandas object whose index labels are passed to Embedder_single_map one
        by one (in chunks of 1000 per task).

    Returns the per-row vectors stacked with np.asarray, in the order of X.index.
    """
    pool = mp.Pool(4)
    try:
        result = pool.map(Embedder_single_map(X), X.index, 1000)
    finally:
        # Release the workers even when a task raises.
        pool.close()
        pool.join()

    return np.asarray(result)


In [None]:
%%time

embedded_verbs = embed_single(X_orig.pred_lemma)
print embedded_verbs.shape

In [None]:
(np.linalg.norm(embedded_verbs, axis = 1) < 0.001).sum()

In [None]:
clear_data[(np.linalg.norm(embedded_verbs, axis = 1) < 0.001)].pred_lemma.value_counts().shape

In [None]:
%%time

embedded_args = embed_single(X_orig.arg_lemma)
print embedded_args.shape

In [None]:
(np.linalg.norm(embedded_args, axis = 1) < 0.001).sum()

#### Vectorizing categorical features

In [None]:
from sklearn.feature_extraction import DictVectorizer


def vectorize_categorial_features(feat_list):   
    """One-hot encode the categorical columns of X_orig and append them to the numeric ones.

    feat_list: names of the X_orig columns to use.

    Columns whose dtype is str, unicode or object are treated as categorical
    and encoded with a DictVectorizer; the remaining columns are used as they are.

    Returns (matrix, vectorizer): the matrix has the non-categorical columns
    first, followed by the one-hot columns; vectorizer is the fitted DictVectorizer.

    Reads the notebook-level X_orig rather than taking the data as an argument.
    NOTE(review): np.concatenate raises when feat_list contains no
    non-categorical column, because it is then given an empty tuple.
    """
    categ_feats = [e for e in feat_list if X_orig.loc[:,e].dtype in [str, unicode, object]]
    not_categ = [e for e in feat_list if e not in categ_feats]
    print 'Category features:\n', categ_feats
    print 'Not category features:\n', not_categ
    
    vectorizer = DictVectorizer(sparse = False)
    one_hot_feats = vectorizer.fit_transform(X_orig.loc[:, categ_feats].to_dict(orient = 'records'))
    print one_hot_feats.shape
    
    # Each numeric column becomes an (n_rows, 1) array; they are joined side by side.
    not_categ_columns = np.concatenate(tuple(X_orig.loc[:, e].as_matrix().reshape(-1, 1) 
                                             for e in not_categ), axis =1)
    no_lemma_plain_features = np.concatenate((not_categ_columns,
                                              one_hot_feats), axis = 1)
    
    return no_lemma_plain_features, vectorizer


In [None]:
# Names of the morphological features returned by parse_morph_features.
morph_feats = ['pos', 'case', 'anim', 'vform', 'zform', 'shform', 'pform', 'vvform', 'nform', 'time']
# The same features for the argument and for the predicate ('arg_*' / 'pred_*' columns).
morph_feats_arg_pred = ['arg_' + e for e in morph_feats] + ['pred_' + e for e in morph_feats]
all_feats_no_pred_lemma = morph_feats_arg_pred + ['rel_pos', 'arg_prep', 'link_name'] 
no_lemma_plain_features, categ_feat_vecorizer = vectorize_categorial_features(all_feats_no_pred_lemma)
print 'Categorical features without predicate lemma shape:', no_lemma_plain_features.shape

# One-hot encoding of the predicate lemma, kept separate so that the
# out-of-domain split can use the features without it.
pred_lemma_vectorizer = DictVectorizer(sparse = False)
pred_lemma_feats = pred_lemma_vectorizer.fit_transform(X_orig.loc[:, ['pred_lemma']].to_dict(orient = 'records'))
print 'Pred lemma features shape:', pred_lemma_feats.shape

# Full plain feature matrix: features without the lemma followed by the lemma one-hot columns.
plain_features = np.concatenate((no_lemma_plain_features,
                                 pred_lemma_feats), axis = 1)
print 'All categorical features shape:', plain_features.shape

#### Out of domain split

In [None]:
from scipy.spatial.distance import cosine

def make_dist_matrix(pd_data, embeddings):
    """Compute pairwise cosine similarities between the predicate lemmas of pd_data.

    pd_data: DataFrame with a 'pred_lemma' column.
    embeddings: mapping from lemma to vector; must support `in` and item lookup.

    Returns (dist_matrix, verb_counts). verb_counts is
    pd_data.pred_lemma.value_counts(); row/column k of the matrix corresponds
    to verb_counts.index[k]. Despite its name the matrix holds similarities
    (1 - cosine distance). Only the diagonal and the upper triangle are filled;
    entries involving a lemma without an embedding stay 0.
    """
    verb_counts = pd_data.pred_lemma.value_counts()
    verbs = list(verb_counts.index)
    n_verbs = len(verbs)
    dist_matrix = np.zeros((n_verbs, n_verbs))

    # Look every vector up once. scipy's cosine expects 1-D vectors.
    vectors = [np.asarray(embeddings[verb]).ravel() if verb in embeddings else None
               for verb in verbs]

    for i, left_embed in enumerate(vectors):
        if left_embed is None:
            continue

        for j in range(i, n_verbs):
            right_embed = vectors[j]
            if right_embed is None:
                continue

            dist_matrix[i, j] = 1 - cosine(left_embed, right_embed)

    return dist_matrix, verb_counts

In [None]:
def sort_verb_pairs(dist_matrix, verb_counts, top_n=30):
    """Return (and print) the verb pairs with the highest score in dist_matrix.

    dist_matrix: square matrix; only entries above the diagonal are read.
    verb_counts: pandas object whose index gives the verb for each row/column.
    top_n: number of pairs to keep (30 by default).

    Returns a list of (verb, verb, score) tuples, highest score first; pairs
    with equal scores keep their row-major order.
    """
    n_verbs = dist_matrix.shape[0]
    verb_pairs = [(i, j, dist_matrix[i, j])
                  for i in range(n_verbs)
                  for j in range(i + 1, n_verbs)]

    # Highest score first gives the "good split"; sort ascending for the "bad split".
    verb_pairs.sort(key=lambda pair: pair[2], reverse=True)

    verbs = verb_counts.index
    vresult = [(verbs[i], verbs[j], score) for i, j, score in verb_pairs[:top_n]]
    for v in vresult:
        # Single-argument call: prints the same text under Python 2 and 3.
        print(u'({}, {}, {})'.format(v[0], v[1], v[2]))

    return vresult

In [None]:
def get_out_of_domain_verbs(vresult):
    """Split the verbs of the given pairs into out-of-domain and in-domain sets.

    vresult: sequence of tuples whose first two elements are verbs.

    Pairs are processed in order:
    * if the first verb is in neither set yet, it becomes out-of-domain and the
      second verb in-domain;
    * if the first verb is already out-of-domain and the second verb is not
      in-domain, the second verb becomes out-of-domain and the first verb is
      added to the in-domain set as well;
    * otherwise the pair is ignored.

    The two sets are therefore not guaranteed to be disjoint.

    Returns (list of out-of-domain verbs, set of in-domain verbs).
    """
    ood = set()
    ind = set()

    for pair in vresult:
        first, second = pair[0], pair[1]

        if first in ood:
            if second in ind:
                continue
            ood.add(second)
            ind.add(first)
        elif first not in ind:
            ood.add(first)
            ind.add(second)

    return list(ood), ind

In [None]:
# Pairwise verb similarities -> top pairs -> out-of-domain / in-domain verbs.
# Note that the verbs are taken from pd_data (before the rare-role filter),
# not from clear_data.
dist_matrix, verb_counts = make_dist_matrix(pd_data, embeddings)
srt_verbs = sort_verb_pairs(dist_matrix, verb_counts)
ood_verbs, ind_verbs = get_out_of_domain_verbs(srt_verbs)

print
print 'Out-of domain-verbs'
print len(ood_verbs)
for e in ood_verbs:
    print e

In [None]:
# Boolean mask over the rows of X_orig: True where the predicate is an out-of-domain verb.
selector = X_orig.pred_lemma.isin(ood_verbs).as_matrix()

# Every feature block is split with the same mask: `ood_*` holds the rows of
# out-of-domain verbs, `ind_*` the remaining rows.
# The plain features used here are the ones without the predicate-lemma one-hot columns.
ood_plain_features = no_lemma_plain_features[selector]
ind_plain_features = no_lemma_plain_features[~selector]

ood_y = y[selector]
ind_y = y[~selector]

ood_arg_context = arg_context_embedded[selector]
ind_arg_context = arg_context_embedded[~selector]

ood_pred_context = pred_context_embedded[selector]
ind_pred_context = pred_context_embedded[~selector]

ood_arg_embed = embedded_args[selector]
ind_arg_embed = embedded_args[~selector]

ood_pred_embed = embedded_verbs[selector]
ind_pred_embed = embedded_verbs[~selector]


In [None]:
print ind_pred_embed.shape
ood_pred_embed.shape

In [None]:
from sklearn.metrics import f1_score

# Majority-class baseline on the out-of-domain rows: always predict the class
# that is most frequent among the out-of-domain gold labels themselves.
print 'Baseline for ood evaluation:'
gold_pred = ood_y.argmax(axis = 1)
baseline_pred = pd.Series(gold_pred).value_counts().index[0] * np.ones(gold_pred.shape)

# NOTE(review): the baseline is passed in the first position and the gold labels
# in the second; check this against the argument order f1_score expects.
f1_micro = f1_score(baseline_pred, gold_pred, average = 'micro')
f1_macro = f1_score(baseline_pred, gold_pred, average = 'macro')
print 'micro', f1_micro
print 'macro', f1_macro

### Model construction

In [None]:
from keras.layers import Input
from keras.layers import TimeDistributed
from keras.layers import Activation
from keras.layers import RepeatVector
from keras.layers import Permute
from keras.layers import merge
from keras.layers import Lambda
from keras.layers import Merge
from keras import backend as K
from keras.models import Model
from keras.layers import Convolution1D
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Flatten
from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization
from keras.regularizers import l2, activity_l2, l1
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.preprocessing import sequence
from keras.layers import MaxPooling1D
from gensim.models.word2vec import Word2Vec

#### Simple model

In [None]:
def construct_plain_model(input_shape):
    """Build and compile a feed-forward classifier over one feature matrix.

    Layers: Dense(600, relu) -> Dropout(0.3) -> Dense(400) -> BatchNormalization
    -> relu -> Dropout(0.3) -> Dense(number_of_roles) -> BatchNormalization -> softmax.

    input_shape: shape of one sample, passed to the first Dense layer.

    Returns the Sequential model compiled with the adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    Reads the notebook-level `number_of_roles` for the output size.
    """
    print 'Plain model.'
    
    plain_model = Sequential()
    plain_model.add(Dense(600, 
                          #input_shape=(plain_features.shape[1],), 
                          input_shape = input_shape,
                          activation = 'relu'))
    plain_model.add(Dropout(0.3))
    
    plain_model.add(Dense(400))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    plain_model.add(Dropout(0.3))
    
    # Output layer: one unit per role.
    plain_model.add(Dense(number_of_roles))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('softmax'))
    
    plain_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return plain_model

#### Complex model

In [None]:
def construct_plain_model_sparse(input_shape, n_embedding_vecs = 2):
    """Build and compile a classifier over embedding vectors plus plain features.

    input_shape: shape of one sample of the plain feature matrix.
    n_embedding_vecs: number of embedding branches to create (2 by default).

    Each embedding branch is Dense(100) -> BatchNormalization -> relu over a
    vector of get_embeddings_length(embeddings) values; the plain branch is
    Dense(400) -> BatchNormalization -> relu. The branches are concatenated
    (embedding branches first, plain branch last) and followed by
    Dropout(0.3) -> Dense(400) -> BatchNormalization -> relu -> Dropout(0.3)
    -> Dense(number_of_roles) -> BatchNormalization -> softmax.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings` and `number_of_roles`.
    """
    print 'Complex model.'
    
    plain_model = Sequential()
    plain_model.add(Dense(400, input_shape = input_shape))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    
    def create_embed_model():
        # One branch that projects a single embedding vector to 100 units.
        embed_model = Sequential()
        embed_model.add(Dense(100, input_shape = (get_embeddings_length(embeddings), )))
        embed_model.add(BatchNormalization())
        embed_model.add(Activation('relu'))
        
        return embed_model
    
    embed_models = [create_embed_model() for i in xrange(n_embedding_vecs)]
#     arg_embed_model = create_embed_model()
#     pred_embed_model = create_embed_model()
    
    final = Sequential()
#     final.add(Merge([arg_embed_model, pred_embed_model, plain_model], 
#                     mode = 'concat', concat_axis=1))
    final.add(Merge(embed_models + [plain_model], 
                    mode = 'concat', concat_axis = 1))
    final.add(Dropout(0.3))
    final.add(Dense(400))
    final.add(BatchNormalization())
    final.add(Activation('relu'))
    final.add(Dropout(0.3))
    
    # Output layer: one unit per role.
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return final
    

#### Context model

In [None]:
from keras.layers import Masking
from keras.layers import Bidirectional


def construct_graph_lstm_model(plain_features_shape):
    """Build and compile the model that adds a predicate-context branch.

    plain_features_shape: shape of one sample of the plain feature matrix.

    Branches:
    * argument and predicate embedding branches: Dense(100) -> BatchNormalization -> relu;
    * plain feature branch: Dense(400) -> BatchNormalization -> relu;
    * predicate context branch: Convolution1D(50 filters, length 1) ->
      Bidirectional LSTM(50, summed) -> Dense(50) -> BatchNormalization -> relu.

    The two embedding branches and the plain branch are concatenated and passed
    through Dropout(0.3) -> Dense(400) -> BatchNormalization -> relu -> Dropout(0.3);
    the result is concatenated with the context branch and followed by
    Dense(number_of_roles) -> BatchNormalization -> softmax.

    The argument-context branch is commented out, so the model has four input
    branches, merged in the order: argument embedding, predicate embedding,
    plain features, predicate context.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings`, `pred_context_embedded` and `number_of_roles`.
    """
    print 'Context model.'
    
    def create_embed_model():
        # Branch that projects a single embedding vector to 100 units.
        embed_model = Sequential()
        embed_model.add(Dense(100, input_shape = (get_embeddings_length(embeddings), )))
        embed_model.add(BatchNormalization())
        embed_model.add(Activation('relu'))
        return embed_model
    
    def construct_attentional_part(context_length):
        # Sequence branch over a (context_length, embedding length) window.
        seq_model = Sequential()
        seq_model.add(Convolution1D(nb_filter=50, 
                                    filter_length=1, 
                                    border_mode='same', 
                                    activation='relu',
                                    input_shape = (context_length, 
                                                   get_embeddings_length(embeddings))))
#         seq_model.add(Masking(mask_value=0., input_shape = (context_length, 
#                                                             get_embeddings_length(embeddings))))
        #seq_model.add(Masking(mask_value=1.))
        seq_model.add(Bidirectional(LSTM(50), merge_mode='sum'))
        #seq_model.add(LSTM(100))
        seq_model.add(Dense(50))
        seq_model.add(BatchNormalization())
        seq_model.add(Activation('relu'))
        
        return seq_model
    
    ###############################
    
    #arg_context_model = construct_attentional_part(arg_context_embedded.shape[1])
    pred_context_model = construct_attentional_part(pred_context_embedded.shape[1])
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(400, input_shape = plain_features_shape))
    plain_model.add(BatchNormalization())
    plain_model.add(Activation('relu'))
    
    ###############################
    
    arg_embed_model = create_embed_model()
    pred_embed_model = create_embed_model()
    
    ###############################
    
    # First stage: embeddings + plain features.
    final1 = Sequential()
    final1.add(Merge([
  #              arg_context_model, 
                     #pred_context_model,
                     arg_embed_model,
                     pred_embed_model,
                     plain_model], 
                    mode = 'concat', concat_axis=1))
    final1.add(Dropout(0.3))
    
    final1.add(Dense(400))
    final1.add(BatchNormalization())
    final1.add(Activation('relu'))
    final1.add(Dropout(0.3))
    
    # Second stage: the context branch joins just before the output layer.
    final = Sequential()
    final.add(Merge([final1, pred_context_model], mode = 'concat', concat_axis = 1))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return final

#### Other

In [None]:
def construct_graph_attentional_model():
    """Build and compile a model with attention over both context windows.

    Each context branch is Convolution1D(200 filters, length 2) -> LSTM(100,
    returning the whole sequence); a per-step score (Dense(1, tanh)) is turned
    into softmax weights, and the branch output is the weighted sum of the
    LSTM activations over the time axis.

    The argument-context branch, the predicate-context branch and a plain
    branch (Dense(800, relu)) are concatenated in that order and followed by
    Dropout(0.3) -> Dense(400) -> BatchNormalization -> relu -> Dropout(0.3)
    -> Dense(number_of_roles) -> BatchNormalization -> softmax.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings`, `arg_context_embedded`,
    `pred_context_embedded`, `plain_features` and `number_of_roles`.
    """
    print 'Context attentional model.'
    
    def construct_attentional_part(context_length):
        _input = Input(shape = (context_length, 
                                get_embeddings_length(embeddings)), dtype = 'float')

        conv = Convolution1D(nb_filter=200, 
                            filter_length=2, 
                            border_mode='same', 
                            activation='relu')(_input)

        units = 100
        activations = LSTM(units, return_sequences=True)(conv)

        # compute importance for each step
        attention = TimeDistributed(Dense(1, activation='tanh'))(activations)  
        attention = Flatten()(attention)
        attention = Activation('softmax')(attention)
        # Repeat the weights `units` times and permute so they line up with `activations`.
        attention = RepeatVector(units)(attention)
        attention = Permute([2, 1])(attention)

        # apply the attention
        seq_repr = merge([activations, attention], mode='mul')
        seq_repr = Lambda(lambda xin: K.sum(xin, axis=1))(seq_repr)
        seq_model = Model(input=_input, output=seq_repr)
        
        return seq_model
    
    arg_context_model = construct_attentional_part(arg_context_embedded.shape[1])
    pred_context_model = construct_attentional_part(pred_context_embedded.shape[1])
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(800, 
                          input_shape=(plain_features.shape[1],), 
                          activation = 'relu'))
    
    
    ###############################
    
    final = Sequential()
    final.add(Merge([arg_context_model, pred_context_model, plain_model], 
                    mode = 'concat', concat_axis=1))
    final.add(Dropout(0.3))
    
    #final.add(Dense(300, activation = 'relu'))
    final.add(Dense(400))
    final.add(BatchNormalization())
    final.add(Activation('relu'))
    final.add(Dropout(0.3))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    #final.add(Dense(number_of_roles, activation = 'softmax'))
#    final.add(BatchNormalization())
    #final.add(Activation('softmax'), W_regularizer=l2(0.01))
    #final.add(Dense(number_of_roles, activation='softmax', W_regularizer = l2(0.01)))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return final

In [None]:
from keras.layers import Bidirectional


def construct_graph_bidirectional_model():
    """Build and compile a model with a bidirectional LSTM over the argument context.

    The argument-context branch is Convolution1D(150 filters, length 2) ->
    Bidirectional LSTM(100, summed); the plain branch is Dense(700, relu).
    They are concatenated (context first) and followed by Dropout(0.3) ->
    Dense(300) -> BatchNormalization -> relu -> Dropout(0.3) ->
    Dense(number_of_roles) -> BatchNormalization -> softmax.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings`, `arg_context_embedded`,
    `plain_features` and `number_of_roles`.
    """
    print 'Bidirectional model.'
    
    arg_context_model = Sequential()
    arg_context_model.add(Convolution1D(nb_filter=150, 
                                        filter_length=2, 
                                        border_mode='same', 
                                        activation='relu',
                                        input_shape = (arg_context_embedded.shape[1], 
                                                       get_embeddings_length(embeddings))))
    arg_context_model.add(Bidirectional(LSTM(100), merge_mode = 'sum'))
    
    ###############################
    
    plain_model = Sequential()
    plain_model.add(Dense(700, 
                          input_shape=(plain_features.shape[1],), 
                          activation = 'relu'))
    
    ###############################
    
    final = Sequential()
    final.add(Merge([arg_context_model, plain_model], mode = 'concat', concat_axis=1))
    final.add(Dropout(0.3))
    
    #final.add(Dense(300, activation = 'relu'))
    final.add(Dense(300))
    final.add(BatchNormalization())
    final.add(Activation('relu'))
    final.add(Dropout(0.3))
    
    final.add(Dense(number_of_roles))
    final.add(BatchNormalization())
    final.add(Activation('softmax'))
    
    final.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return final

In [None]:
def construct_simple_attentional_model():
    """Build and compile an attention model over the argument context only.

    Convolution1D(128 filters, length 2) -> LSTM(80, returning the whole
    sequence); a per-step score (Dense(1, tanh)) is turned into softmax
    weights, the LSTM activations are summed over the time axis with those
    weights, and a Dense(number_of_roles, softmax) layer gives the prediction.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings`, `arg_context_embedded` and `number_of_roles`.
    """
    units = 80
    _input = Input(shape = (arg_context_embedded.shape[1], 
                            get_embeddings_length(embeddings)), dtype = 'float')

    conv = Convolution1D(nb_filter=128, 
                        filter_length=2, 
                        border_mode='same', 
                        activation='relu')(_input)

    activations = LSTM(units, return_sequences=True)(conv)

    # compute importance for each step
    attention = TimeDistributed(Dense(1, activation='tanh'))(activations) 
    #attention = Dense(6, activation='tanh')(activations) 
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    # Repeat the weights `units` times and permute so they line up with `activations`.
    attention = RepeatVector(units)(attention)
    attention = Permute([2, 1])(attention)

    # apply the attention
    sent_representation = merge([activations, attention], mode='mul')
    sent_representation = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)

    #dn = Dense(100, activation = 'tanh')(sent_representation)
    #probabilities = Dense(number_of_roles, activation='softmax')(dn)
    probabilities = Dense(number_of_roles, activation='softmax')(sent_representation)

    model = Model(input=_input, output=probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def construct_simple_model():
    """Build and compile a convolution + LSTM classifier over one embedded sequence.

    Convolution1D(128 filters, length 2) -> LSTM(80) -> Dropout(0.1) ->
    Dense(60, tanh) -> Dense(number_of_roles, softmax). Prints the model summary.

    Returns the compiled model (adam, categorical cross-entropy, accuracy).
    Reads the notebook-level `embeddings` and `number_of_roles`.

    NOTE(review): the input length comes from `seq_embeded`, which is not
    defined in any of the cells shown here - confirm where it is created.
    """
    model = Sequential()
    model.add(Convolution1D(nb_filter=128, 
                            filter_length=2, 
                            border_mode='same', 
                            activation='relu', 
                            input_shape = (seq_embeded.shape[1], 
                                           get_embeddings_length(embeddings))))

    #model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(80))
    model.add(Dropout(0.1))
    model.add(Dense(60, activation='tanh'))
    model.add(Dense(number_of_roles, activation='softmax'))
    model.compile(loss='categorical_crossentropy', 
                       optimizer='adam',
                       metrics=['accuracy'])
    print(model.summary())
    
    return model

### Model tuning

#### Tuning for in-domain test

##### Simple model

In [None]:
from keras.callbacks import EarlyStopping

# In-domain run of the simple model on all plain features (including the
# predicate-lemma one-hot columns). 10% of the data is held out for validation
# and training stops once val_loss has not improved for 2 epochs.
model = construct_plain_model((plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit(plain_features, y, nb_epoch=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

##### Complex model

In [None]:
# In-domain run of the complex model. The inputs follow the branch order of
# construct_plain_model_sparse: the two embedding branches, then the plain features.
model = construct_plain_model_sparse((plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([embedded_args, embedded_verbs, plain_features], y, 
          nb_epoch=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

##### Context model

In [None]:
%%time

from keras.callbacks import EarlyStopping

# In-domain run of the context model.
# construct_graph_lstm_model has four input branches, merged in the order
# argument embedding, predicate embedding, plain features, predicate context
# (its argument-context branch is commented out). The inputs must follow that
# order, as they do in the out-of-domain run of the same model.
model = construct_graph_lstm_model((plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([embedded_args, embedded_verbs, plain_features, pred_context_embedded], y, 
          nb_epoch=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

#### Tuning for out-of-domain test

##### Simple model

In [None]:
from keras.callbacks import EarlyStopping


# Out-of-domain run of the simple model: train on the in-domain rows, evaluate
# on the rows of the held-out verbs.
# NOTE(review): evaluate_model is defined in a later cell (Evaluation section);
# on a fresh kernel this cell fails unless that cell has been run first.
model = construct_plain_model((ind_plain_features.shape[1],))
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit(ind_plain_features, ind_y, nb_epoch=15, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])

#model.evaluate(ood_plain_features, ood_y)
ev_res = evaluate_model(model, [ood_plain_features], ood_y)
print 
print pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur'])

##### Complex model

In [None]:
from keras.callbacks import EarlyStopping

model = construct_plain_model_sparse((ind_plain_features.shape[1],))
model.summary()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
model.fit([ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y, nb_epoch=20, batch_size=64, validation_split = 0.1, 
          shuffle=True, callbacks = [early_stopping])
#model.evaluate([ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)

ev_res = evaluate_model(model, [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
print 
print pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur'])

##### Context model

In [None]:
from keras.callbacks import EarlyStopping

# Stop as soon as the validation loss has gone two epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', mode='auto',
                               min_delta=0, patience=2, verbose=0)

model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()

# Four inputs, in this order. A previously tried five-input layout
# (ind_arg_context and ind_pred_context first) is not used by this cell.
# NOTE(review): the in-domain context cells pass five inputs to the same
# constructor -- confirm which layout construct_graph_lstm_model expects.
model.fit(
    [ind_arg_embed, ind_pred_embed, ind_plain_features, ind_pred_context],
    ind_y,
    batch_size=64,
    nb_epoch=6,
    shuffle=True,
    validation_split=0.1,
    callbacks=[early_stopping]
)

# Raw Keras evaluation on the out-of-domain split, same input order as fit().
model.evaluate(
    [ood_arg_embed, ood_pred_embed, ood_plain_features, ood_pred_context],
    ood_y
)

### Evaluation

#### Evaluate for in domain

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def evaluate_model(model, X_test, y_test):
    """Score an already fitted model on a test set.

    Args:
        model: object exposing ``evaluate(X, y)`` and ``predict(X)``;
            ``predict`` must return one row of per-class scores per sample.
        X_test: model input(s), handed to the model unchanged.
        y_test: 2-D array with one column per class; the true class of a
            row is taken as its ``argmax``.

    Returns:
        1-D numpy array holding the values returned by ``model.evaluate``
        followed by micro-averaged F1, macro-averaged F1 and accuracy.
    """
    keras_eval = model.evaluate(X_test, y_test)

    # Decode both sides to class indices once instead of once per metric.
    y_true = y_test.argmax(axis=1)
    y_pred = model.predict(X_test).argmax(axis=1)

    # scikit-learn metrics are declared as (y_true, y_pred); pass them in
    # that order. These three metrics give the same value either way, but
    # the warnings and any later precision/recall additions depend on it.
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accur = accuracy_score(y_true, y_pred)

    return np.array(list(keras_eval) + [f1_micro, f1_macro, accur])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

class Baseline(object):
    """Majority-class predictor with the fit/predict/evaluate/summary methods
    that the cross-validation helpers call on a model.

    After ``fit`` the instance carries:
      pred_class_ -- index of the most frequent class in the training targets
      class_num_  -- number of columns in the training targets
    """

    def __init__(self, *args, **kwargs):
        # Any constructor arguments are accepted and ignored.
        pass

    def fit(self, X_train, y_train, *args, **kwargs):
        """Remember the most frequent class of ``y_train`` (2-D, one column per class).

        ``X_train`` and any extra training options are ignored.
        """
        class_ids = pd.Series(y_train.argmax(axis=1))
        # value_counts() sorts by frequency, most frequent first.
        self.pred_class_ = class_ids.value_counts().index[0]
        self.class_num_ = y_train.shape[1]

    def predict(self, X_test, *args, **kwargs):
        """Return a one-hot matrix that marks the remembered class for every row.

        ``X_test`` is expected to be a list of arrays; only the row count of
        its first element is used.
        """
        n_rows = X_test[0].shape[0]
        scores = np.zeros((n_rows, self.class_num_))
        scores[:, self.pred_class_] = 1.0
        return scores

    def evaluate(self, X_test, y_test, *args, **kwargs):
        """Placeholder for the two values a model's ``evaluate`` reports; always zeros."""
        return (0., 0.)

    def summary(self):
        """Print the model name."""
        print('Baseline')


def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, *args, **kwargs):
    """Fit ``model`` on the training data, then score it on the test data.

    Args:
        model: object exposing ``fit``, ``evaluate`` and ``predict``.
        X_train, y_train: training inputs and targets, handed to ``model.fit``.
        X_test, y_test: test inputs and targets. ``y_test`` must be 2-D with
            one column per class; the true class of a row is its ``argmax``.
        *args, **kwargs: forwarded unchanged to ``model.fit``.

    Returns:
        list: the values returned by ``model.evaluate(X_test, y_test)``
        followed by micro-averaged F1, macro-averaged F1 and accuracy.
    """
    model.fit(X_train, y_train, *args, **kwargs)

    keras_eval = model.evaluate(X_test, y_test)

    # Decode both sides to class indices once instead of once per metric.
    y_true = y_test.argmax(axis=1)
    y_pred = model.predict(X_test).argmax(axis=1)

    # scikit-learn metrics are declared as (y_true, y_pred); pass them in
    # that order. These three metrics give the same value either way, but
    # the warnings and any later precision/recall additions depend on it.
    f1_micro = f1_score(y_true, y_pred, average='micro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    accur = accuracy_score(y_true, y_pred)

    return list(keras_eval) + [f1_micro, f1_macro, accur]
    

def custom_cross_val(cr_f, X, y, cv, *args, **kwargs):
    cr_f().summary()
    eval_res = list()
    for i, (train, test) in enumerate(cv.split(y)):
        model = cr_f()
        print "Running Fold", i+1, "/", cv.n_splits
        eval1 = train_and_evaluate_model(model, 
                                         [X[j][train] for j in xrange(len(X))], y[train], 
                                         [X[j][test] for j in xrange(len(X))], y[test], 
                                         *args, **kwargs)
        
        print
        print 'Fold result: ', eval1
        eval_res.append(eval1)
    
    return np.array(eval_res)


def describe_cv_result(cv_res):
    """Print a score matrix, then its column-wise mean and standard deviation.

    Args:
        cv_res: 2-D numpy array with one row per run and five columns in the
            order loss, Keras accuracy, micro-F1, macro-F1, accuracy.
    """
    columns = ['loss', 'keras_accur', 'micro_f1', 'macro_f1', 'accur']

    print(cv_res)

    # Both statistics are computed before either table is printed.
    summaries = (('Mean', cv_res.mean(axis=0)),
                 ('Std', cv_res.std(axis=0)))
    for title, values in summaries:
        print(title)
        print(pd.DataFrame([values], columns=columns))

    
from sklearn.model_selection import KFold
# random_state only takes effect when shuffle=True: without it older
# scikit-learn silently ignores the seed and newer releases raise a
# ValueError. Shuffling with a fixed seed gives reproducible folds that do
# not depend on the order in which the rows were generated.
# NOTE(review): this changes fold membership compared with the previous
# unshuffled split, so earlier cross-validation numbers will shift.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Feature matrix without the predicate lemma, widened with the verb embeddings.
curr_features = np.concatenate((no_lemma_plain_features, embedded_verbs), axis=1)

# The Baseline class itself serves as the zero-argument factory. The training
# options are forwarded to Baseline.fit, which ignores them.
cv_res = custom_cross_val(
    Baseline,
    [curr_features],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=13,
    shuffle=True,
    validation_split=0.,
    verbose=0
)

describe_cv_result(cv_res)

##### Simple model

###### No predicate lemma

In [None]:
# Feature matrix without the predicate lemma, widened with the verb embeddings.
curr_features = np.concatenate((no_lemma_plain_features, embedded_verbs), axis=1)

# The factory reads curr_features when it is called, i.e. once per fold.
cv_res = custom_cross_val(
    lambda: construct_plain_model((curr_features.shape[1],)),
    [curr_features],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=13,
    shuffle=True,
    validation_split=0.,
    verbose=0
)

describe_cv_result(cv_res)

###### All categorical features

In [None]:
# Cross-validate the single-input model on plain_features; validation_split
# is 0, so every training row of a fold is used for fitting.
cv_res = custom_cross_val(
    lambda: construct_plain_model((plain_features.shape[1],)),
    [plain_features],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=13,
    shuffle=True,
    validation_split=0.,
    verbose=0
)

describe_cv_result(cv_res)

###### Categorical features + embeddings

In [None]:
# Display the shape of plain_features before the next cell concatenates it
# with the embedding arrays.
plain_features.shape

In [None]:
# Put both embedding arrays and plain_features side by side in one matrix, so
# the single-input model sees everything at once.
single_chunk = np.concatenate((embedded_args, embedded_verbs, plain_features), axis=1)

# The factory reads single_chunk when it is called, i.e. once per fold.
cv_res = custom_cross_val(
    lambda: construct_plain_model((single_chunk.shape[1],)),
    [single_chunk],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=13,
    shuffle=True,
    validation_split=0.,
    verbose=0
)

describe_cv_result(cv_res)

##### Complex model

In [None]:
# Cross-validate the three-input model: both embedding arrays and
# plain_features are kept as separate inputs, in that order.
cv_res = custom_cross_val(
    lambda: construct_plain_model_sparse((plain_features.shape[1],)),
    [embedded_args, embedded_verbs, plain_features],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=13,
    shuffle=True,
    validation_split=0.,
    verbose=0
)

describe_cv_result(cv_res)

##### Context model

In [None]:
# Cross-validate the context model with five inputs: both context arrays
# first, then the two embedding arrays, then plain_features. No verbose
# option is passed here, so Keras uses its default progress output.
cv_res = custom_cross_val(
    lambda: construct_graph_lstm_model((plain_features.shape[1],)),
    [arg_context_embedded,
     pred_context_embedded,
     embedded_args,
     embedded_verbs,
     plain_features],
    y,
    cv=cv,
    batch_size=64,
    nb_epoch=6,
    shuffle=True,
    validation_split=0.
)

describe_cv_result(cv_res)

#### Evaluate for out of domain

In [None]:
def evaluate_out_of_domain(model, X_train, y_train, X_test, y_test):
    final_res = list()
    N_ITERATIONS = 5
    for i in xrange(N_ITERATIONS):
        print 'Eval iter:', i + 1, '/', N_ITERATIONS
        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, 
                                       patience=2, verbose=0, mode='auto')
        model.fit(X_train, y_train, nb_epoch=15, 
                  batch_size=64, validation_split = 0.1, 
                  shuffle=True, callbacks = [early_stopping],
                 verbose = 0)

        ev_res = evaluate_model(model, X_test, y_test)
        print 
        print pd.DataFrame([ev_res], columns = ['keras_accur', 'keras_loss', 'f1_micro', 'f1_macro', 'accur'])
        final_res.append(ev_res)
    
    return np.array(final_res)

##### Simple model

In [None]:
model = construct_plain_model((ind_plain_features.shape[1],))
model_eval = evaluate_out_of_domain(model, ind_plain_features, ind_y, ood_plain_features, ood_y)
print model_eval
describe_cv_result(model_eval)

##### Complex model

In [None]:
model = construct_plain_model_sparse((ind_plain_features.shape[1],), 2)
model.summary()
model_eval = evaluate_out_of_domain(model, 
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features], ind_y, 
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features], ood_y)
print model_eval
describe_cv_result(model_eval)

In [None]:
model = construct_plain_model_sparse((ind_plain_features.shape[1],), 1)
model.summary()
model_eval = evaluate_out_of_domain(model, 
                                    [ind_arg_embed, ind_plain_features], ind_y, 
                                    [ood_arg_embed, ood_plain_features], ood_y)
print model_eval
describe_cv_result(model_eval)

##### Context model

In [None]:
model = construct_graph_lstm_model((ind_plain_features.shape[1],))
model.summary()
model_eval = evaluate_out_of_domain(model, 
                                    [ind_arg_embed, ind_pred_embed, ind_plain_features, ind_pred_context], ind_y, 
                                    [ood_arg_embed, ood_pred_embed, ood_plain_features, ood_pred_context], ood_y)
print model_eval
describe_cv_result(model_eval)

### Training and predicting

In [None]:
from sklearn.model_selection import train_test_split


def select_from_nparray_list(nparray_list, selector):
    """Index every array in ``nparray_list`` with the same ``selector``.

    Returns a new list holding the selections in input order.
    """
    selected = []
    for array in nparray_list:
        selected.append(array[selector])
    return selected


# Split on example ids rather than on rows, so that all rows sharing an
# ex_id land on the same side of the split.
train_ids, test_ids = train_test_split(X_orig.ex_id.unique(), test_size=0.2, random_state=42)
train_ids = set(train_ids.tolist())
test_ids = set(test_ids.tolist())

# Boolean row masks over X_orig, kept as pandas Series (index-aware)...
train_selector_pd = X_orig.ex_id.isin(train_ids)
test_selector_pd = X_orig.ex_id.isin(test_ids)

# ...and as plain numpy arrays for indexing the feature matrices.
# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
train_selector = train_selector_pd.values
test_selector = test_selector_pd.values

In [None]:
# Configuration picked up by the training and saving cells below:
# single-input model over plain_features.
model_data = {
    'name': 'simple',
    'data': [plain_features],
    'model': construct_plain_model((plain_features.shape[1],)),
}

In [None]:
# Configuration picked up by the training and saving cells below:
# three-input model over both embedding arrays and plain_features.
model_data = {
    'name': 'complex',
    'data': [embedded_args, embedded_verbs, plain_features],
    'model': construct_plain_model_sparse((plain_features.shape[1],)),
}

In [None]:
# Configuration picked up by the training and saving cells below:
# three-input model using the feature matrix without the predicate lemma.
model_data = {
    'name': 'no_pred_lemma',
    'data': [embedded_args, embedded_verbs, no_lemma_plain_features],
    'model': construct_plain_model_sparse((no_lemma_plain_features.shape[1],)),
}

In [None]:
# Train whichever configuration was stored in model_data last, on the
# training rows only.
model = model_data['model']
model.summary()
model.fit(
    select_from_nparray_list(model_data['data'], train_selector),
    select_from_nparray_list([y], train_selector),
    batch_size=64,
    nb_epoch=10,
    shuffle=True,
    validation_split=0.1,
    verbose=0
)

# Score on the held-out test rows; evaluate_model wants the target array
# itself, hence the [0] on the one-element list.
evaluate_model(
    model,
    select_from_nparray_list(model_data['data'], test_selector),
    select_from_nparray_list([y], test_selector)[0]
)

In [None]:
# Run the trained model on the test rows only; `pred` keeps the raw model
# output and is indexed by test-row position in the next cell.
pred = model.predict(select_from_nparray_list(model_data['data'], test_selector))

In [None]:
# Argument address and example id of every test row, in the same row order
# as `pred`. A boolean mask keeps exactly the masked rows, one per entry of
# `pred`, even if X_orig's index labels are not unique.
test_examples_to_store = X_orig.loc[test_selector_pd, ['arg_address', 'ex_id']]

# The dict values are the objects held in `data`, so the annotations written
# below are visible through `data` as well.
test_data = {k : data[k] for k in test_ids}

# Gold targets of the test rows. The selection does not change inside the
# loop, so it is done once here instead of once per row.
test_targets = select_from_nparray_list([y], test_selector)[0]

for index, (pd_index, row) in enumerate(test_examples_to_store.iterrows()):
    ex = test_data[row['ex_id']]
    arg_addr = row['arg_address']
    # arg_address is used as (sentence position, token position) in the example.
    sent = ex[arg_addr[0]]
    token = sent[arg_addr[1]]
    cl = pred[index]
    predicted_role = label_encoder.inverse_transform(np.array([cl]))[0]
    actual_role = label_encoder.inverse_transform(np.array([test_targets[index]]))[0]

    # 'rolepred1' carries the gold role, 'rolepred2' the model's prediction.
    token['rolepred1'] = actual_role
    token['rolepred2'] = predicted_role

In [None]:
# One output file per configuration, named after model_data['name'].
model_name = model_data['name']
result_path = "../data/test_data_annotated_with_{}.json".format(model_name)

with open(result_path, 'w') as result_file:
    json.dump(test_data, result_file)

###  Saving model

In [None]:
from keras.models import model_from_json

def save_model(model, name, model_dir):
    """Write a model to ``<model_dir>/<name>.json`` and ``<model_dir>/<name>.h5``.

    Args:
        model: object exposing ``to_json()`` (returns the text written to the
            ``.json`` file) and ``save_weights(path)`` (writes the ``.h5`` file).
        name: file name stem shared by both output files.
        model_dir: target directory; created if it does not exist yet.
    """
    # Create the target directory on demand so that the first save into a
    # new location does not fail with "No such file or directory".
    if model_dir and not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    output_path = os.path.join(model_dir, name)
    with open(output_path + '.json', 'w') as f:
        f.write(model.to_json())
    
    model.save_weights(output_path + '.h5')

In [None]:
# Store the trained model under the name of the active configuration.
model_path = '../data/models'
save_model(model, name=model_data['name'], model_dir=model_path)