# Notebook Setting

In [0]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
# The pretrained embedding file is later loaded from a 'drive/My Drive/...'
# path, so this mount has to succeed before that cell runs.
# NOTE(review): that later path is relative — presumably the working directory
# is /content; confirm.
drive.mount('/content/drive')


In [0]:
# https://pypi.python.org/pypi/pydot
!apt-get -qq install -y graphviz && pip install -q pydot
import pydot

# Utilities

In [0]:
import nltk
from nltk.tokenize import WordPunctTokenizer
# Module-level tokenizer shared by preProcess().
tokenizer = WordPunctTokenizer()

In [0]:
def dataRead(fname):
    """Read a relation corpus file into parallel lists.

    The file must contain samples separated by one blank line. Each sample
    has exactly three lines:

    1. the sentence,
    2. four tab-separated fields ``e1, e1_type, e2, e2_type``,
    3. the relation label.

    Args:
        fname: path of the text file to read.

    Returns:
        ``(sent_contents, entity1_list, entity2_list, sent_lables)`` where
        ``sent_contents`` holds the lower-cased sentences, the entity lists
        hold ``[entity, entity_type]`` pairs and ``sent_lables`` holds the
        relation strings, all in file order.

    Raises:
        ValueError: if a sample does not have three lines or its entity line
            does not have four tab-separated fields (tuple unpacking fails).
    """
    print( "Input File Reading")
    # The context manager closes the handle even when parsing fails below.
    with open(fname, 'r') as fp:
        samples = fp.read().strip().split('\n\n')

    sent_contents = []   # 1-d: lower-cased sentence strings
    sent_lables = []     # 1-d: relation labels
    entity1_list = []    # 2-d: [[e1, e1_type], ...]
    entity2_list = []    # 2-d: [[e2, e2_type], ...]
    for sample in samples:
        sent, entities, relation = sample.strip().split('\n')
        e1, e1_t, e2, e2_t = entities.split('\t')
        sent_contents.append(sent.lower())
        entity1_list.append([e1, e1_t])
        entity2_list.append([e2, e2_t])
        sent_lables.append(relation)

    return sent_contents, entity1_list, entity2_list, sent_lables


In [0]:
def preProcess(sent):
    """Normalise a sentence into a space-joined token string.

    Steps, in order: lower-case, replace ``/`` with a space, delete ``.``,
    tokenise with the module-level ``tokenizer``, re-join the tokens with
    single spaces, and replace every digit character with the literal ``dg``.

    Args:
        sent: raw sentence string.

    Returns:
        The normalised sentence as a single string.
    """
    sent = sent.lower()
    sent = sent.replace('/', ' ')
    sent = sent.replace('.', '')

    sent = ' '.join(tokenizer.tokenize(sent))
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    return re.sub(r'\d', 'dg', sent)

In [0]:
def makeFeatures(sent_list, entity1_list, entity2_list):
    """Build token and relative-position features for every sentence.

    Each sentence is normalised with ``preProcess`` and split on whitespace.
    The positions of the placeholder tokens ``'druga'`` and ``'drugb'`` are
    looked up, and for every token its signed offset to each placeholder is
    recorded as a string (``'0'`` at the placeholder itself).

    Args:
        sent_list: sentence strings.
        entity1_list: per-sentence ``[entity, type]`` pairs; only used to
            bound the iteration (the three inputs are zipped).
        entity2_list: same as ``entity1_list`` for the second entity.

    Returns:
        ``(word_list, d1_list, d2_list)``: per sentence, the token list, the
        offsets to ``'druga'`` and the offsets to ``'drugb'``.

    Raises:
        ValueError: if a sentence contains no ``'druga'`` or no ``'drugb'``
            token after preprocessing.
    """
    print ('Making Features')
    word_list = []
    d1_list = []
    d2_list = []
    for sent, _ent1, _ent2 in zip(sent_list, entity1_list, entity2_list):
        tokens = preProcess(sent).split()

        s1 = tokens.index('druga')
        s2 = tokens.index('drugb')

        # The results were previously computed but never stored, so the
        # function always returned three empty lists.
        word_list.append(tokens)
        d1_list.append([str(i - s1) for i in range(len(tokens))])
        d2_list.append([str(i - s2) for i in range(len(tokens))])

    return word_list, d1_list, d2_list


In [0]:
def findSentLengths(tr_te_list):
    """Return, for each collection in ``tr_te_list``, the lengths of its items.

    Args:
        tr_te_list: iterable of collections (e.g. train and test), each an
            iterable of sized items.

    Returns:
        A list with one list of ``len(item)`` values per input collection.
    """
    return [list(map(len, collection)) for collection in tr_te_list]

In [0]:
def makeWordList(lista):
    """Build the word vocabulary from several tokenised corpora.

    Args:
        lista: iterable of corpora; each corpus is an iterable of token
            sequences.

    Returns:
        A list starting with the special entries ``'<pad>'`` and
        ``'<unkown>'``, followed by every distinct token in first-seen order.
    """
    from itertools import chain

    # dict.fromkeys de-duplicates while keeping insertion order; chaining the
    # corpora avoids the quadratic list copying of sum(lista, []).
    seen = dict.fromkeys(
        w for sent in chain.from_iterable(lista) for w in sent)
    return ['<pad>', '<unkown>'] + list(seen)

In [0]:
def makeDistanceList(lista):
    """Collect the distinct symbols occurring in several feature corpora.

    Args:
        lista: iterable of corpora; each corpus is an iterable of symbol
            sequences (e.g. distance strings or POS tags).

    Returns:
        The distinct symbols in first-seen order. No padding or unknown entry
        is reserved, so index 0 is a real symbol.
    """
    from itertools import chain

    # dict.fromkeys de-duplicates while keeping insertion order; chaining the
    # corpora avoids the quadratic list copying of sum(lista, []).
    seen = dict.fromkeys(
        w for sent in chain.from_iterable(lista) for w in sent)
    return list(seen)

In [0]:
def readWordEmb(word_list, fname, embSize):
    """Build an embedding matrix aligned with ``word_list``.

    Args:
        word_list: vocabulary; must contain ``'<pad>'``.
        fname: pretrained embedding object (despite the name, not a path). It
            must expose a ``vocab`` container of known words and return a
            word's vector via ``fname[word]``.
        embSize: dimensionality used for random and zero vectors.

    Returns:
        ``numpy`` array (``dtype=object``) with one row per entry of
        ``word_list``. Known words get their pretrained vector, unknown words
        a ``np.random.rand(embSize)`` vector, and the ``'<pad>'`` row is all
        zeros.

    Raises:
        ValueError: if ``'<pad>'`` is not in ``word_list``.
    """
    print ("Reading word vectors")
    # Look words up directly instead of copying the whole pretrained
    # vocabulary into Python lists and scanning them for every word.
    known = fname.vocab

    wordemb = []
    count = 0
    for word in word_list:
        if word in known:
            wordemb.append(fname[word])
        else:
            print(word)
            count += 1
            wordemb.append(np.random.rand(embSize))

    # The padding row must be all zeros regardless of what was drawn above.
    wordemb[word_list.index('<pad>')] = np.zeros(embSize)
    wordemb = np.asarray(wordemb, dtype='object')

    print ("number of unknown word in word embedding", count)
    return wordemb

In [0]:
def mapWordToId(sent_contents, word_list):
    """Replace every token by its index in ``word_list``.

    Args:
        sent_contents: iterable of token sequences.
        word_list: vocabulary list; a token maps to the index of its first
            occurrence.

    Returns:
        A list of index lists with the same nesting as ``sent_contents``.

    Raises:
        ValueError: if a token is not present in ``word_list``.
    """
    # One dictionary lookup per token instead of a linear list.index scan.
    index_of = {}
    for idx, word in enumerate(word_list):
        index_of.setdefault(word, idx)  # keep the first occurrence

    try:
        return [[index_of[w] for w in sent] for sent in sent_contents]
    except KeyError as err:
        # Keep the exception type that list.index raised.
        raise ValueError('{!r} is not in list'.format(err.args[0])) from None


In [0]:
def mapLabelToId(sent_lables, label_dict):
    """Convert label strings to integer class ids.

    With more than two entries in ``label_dict`` each label is looked up in
    the dictionary. Otherwise the task is treated as binary: ``'false'``
    becomes 0 and every other label becomes 1.

    Args:
        sent_lables: iterable of label strings.
        label_dict: mapping from label string to class id.

    Returns:
        List of integer ids, one per label.
    """
    is_multiclass = len(label_dict) > 2
    if is_multiclass:
        encode = label_dict.__getitem__
    else:
        encode = lambda label: int(label != 'false')
    return list(map(encode, sent_lables))

In [0]:
def paddData(listL, maxl):
    """Pad or truncate every matrix in ``listL`` to ``maxl`` columns.

    Each element of ``listL`` is passed to Keras ``pad_sequences`` with
    post-padding, post-truncation, ``int32`` output and pad value 0.

    Args:
        listL: list of sequence collections (lists of integer lists).
        maxl: target sequence length.

    Returns:
        List of ``numpy`` arrays, one per element of ``listL``, in order.
    """
    import keras
    pad_sequences = keras.preprocessing.sequence.pad_sequences
    return [
        np.array(pad_sequences(matrix, maxlen=maxl, dtype='int32',
                               padding='post', truncating='post', value=0.0))
        for matrix in listL
    ]

In [0]:
import operator
def frequentWord(sents):
    """Count token occurrences and rank them from most to least frequent.

    Args:
        sents: iterable of token sequences.

    Returns:
        List of ``(token, count)`` tuples sorted by descending count; tokens
        with equal counts keep their first-seen order.
    """
    wf = {}
    for s in sents:
        for w in s:
            # Start at 1 on first sight; the counter used to start at 0,
            # which made every reported frequency one too low.
            wf[w] = wf.get(w, 0) + 1

    return sorted(wf.items(), key=operator.itemgetter(1), reverse=True)

In [0]:
!python -m spacy download en_core_web_lg
import networkx as nx
import spacy
from nltk import Tree
# Module-level spaCy pipeline; shortest_dependency_path() parses every
# sentence with it.
nlp = spacy.load('en_core_web_lg')

In [0]:
def shortest_dependency_path(sents, e1=None, e2=None):
    """Find the shortest dependency path between two tokens in each sentence.

    Every sentence is parsed with the module-level spaCy pipeline ``nlp``.
    An undirected graph is built with one edge per (head, child) pair, using
    the token text as the node key, and ``networkx.shortest_path`` is run
    from ``e1`` to ``e2``.

    NOTE(review): nodes are keyed by token text, so repeated words in one
    sentence collapse into a single node — confirm this is acceptable.

    Args:
        sents: iterable of sentence strings.
        e1: text of the source token.
        e2: text of the target token.

    Returns:
        One entry per sentence: the list of token texts on the path, or an
        empty list when no path exists or an endpoint is missing.
    """
    paths = []
    for s in sents:
        doc = nlp(s)
        edges = [('{0}'.format(token), '{0}'.format(child))
                 for token in doc
                 for child in token.children]
        graph = nx.Graph(edges)
        try:
            shortest_path = nx.shortest_path(graph, source=e1, target=e2)
        except (nx.NetworkXException, KeyError):
            # Missing endpoint or disconnected endpoints. Anything else
            # (e.g. KeyboardInterrupt) is no longer silently swallowed.
            shortest_path = []
        paths.append(shortest_path)
    return paths

In [0]:
# doc = nlp(q)

# def to_nltk_tree(node):
#     if node.n_lefts + node.n_rights > 0:
#         return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
#     else:
#         return node.orth_


# [to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

In [0]:
# nx.shortest_path(graph, source=e1, target=e2)

# Main

## Preparation

In [0]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import numpy as np
import pandas  as pd
import sklearn as sk
import random
import csv
import re
import collections
import pickle
import sys

In [0]:
!pip install nltk
import nltk
# Fetch the tagger resource; presumably this is the model nltk.pos_tag
# (called in the feature cells) depends on — TODO confirm.
nltk.download('averaged_perceptron_tagger')

In [0]:
# Embedding / layer size constants.
# wv_embSize is passed to readWordEmb and used as a divisor inside the
# similarity attention layers.
# NOTE(review): the model-definition cell writes 200 / 10 / 10 / 200 as
# literals instead of reading these names, and type_emb_size / numfilter are
# not referenced in the visible part of the notebook — confirm they are kept
# in sync or still needed.
wv_embSize = 200
d1_emb_size=10
d2_emb_size=10
type_emb_size=10
numfilter = 200
LSTM_unit = 200

In [0]:
# Training hyperparameters.
# NOTE(review): none of these names is read in the visible part of the
# notebook; presumably they are consumed by a later training cell — confirm.
num_epochs = 18
batch_size=200
reg_para = 0.001
drop_out = 1.0

## Read Files

In [0]:
# Corpus file names handed to dataRead(); they are relative, so they are
# resolved against the current working directory.
ftrain = r'train_data.txt'
ftest = r'test_data.txt'


In [0]:
# Load both corpora: lower-cased sentences, [entity, type] pairs for each of
# the two entities, and the relation labels.
Tr_sent_contents, Tr_entity1_list, Tr_entity2_list, Tr_sent_lables = dataRead(ftrain)
Te_sent_contents, Te_entity1_list, Te_entity2_list, Te_sent_lables = dataRead(ftest)

In [0]:
def dropRareWords(sentences, n_rare):
    """Remove the ``n_rare`` least frequent whitespace tokens from a corpus.

    Args:
        sentences: list of sentence strings.
        n_rare: how many entries to take from the tail of the frequency
            table (tokens sorted by descending count).

    Returns:
        ``(frame, rare)``: a DataFrame with a ``'sents'`` column holding the
        filtered sentences, and the list of removed tokens.
    """
    frame = pd.DataFrame({'sents': sentences})
    counts = pd.Series(' '.join(frame['sents']).split()).value_counts()
    rare = list(counts.iloc[-n_rare:].index)
    rare_lookup = set(rare)  # constant-time membership test per token
    frame['sents'] = frame['sents'].apply(
        lambda sent: " ".join(w for w in sent.split() if w not in rare_lookup))
    return frame, rare


# NOTE: this cell overwrites Tr_sent_contents / Te_sent_contents, so running
# it twice removes a further batch of rare words.
train, freq = dropRareWords(Tr_sent_contents, 600)
test, freq = dropRareWords(Te_sent_contents, 300)

Tr_sent_contents=list(train['sents'])
Te_sent_contents=list(test['sents'])


In [0]:

# Shortest dependency path between the two placeholder tokens, one path per
# sentence (an empty list when shortest_dependency_path finds none).
Tr_sent_contents_SDP=shortest_dependency_path(Tr_sent_contents, e1='druga', e2='drugb')
Te_sent_contents_SDP=shortest_dependency_path(Te_sent_contents, e1='druga', e2='drugb')

## Normal Features

In [0]:
Tr_word_list, Tr_d1_list, Tr_d2_list = makeFeatures(Tr_sent_contents, Tr_entity1_list, Tr_entity2_list)

# (token, tag) pairs for every training sentence.
Tr_word_pos_t = [nltk.pos_tag(words) for words in Tr_word_list]

# Keep only the tag of each pair. Plain iteration replaces np.shape(), which
# has to convert the ragged nested list to an array and fails on NumPy
# versions that reject inhomogeneous shapes.
Tr_word_pos = [[tag for _word, tag in tagged] for tagged in Tr_word_pos_t]

In [0]:
Te_word_list, Te_d1_list, Te_d2_list = makeFeatures(Te_sent_contents, Te_entity1_list, Te_entity2_list)

# (token, tag) pairs for every test sentence.
Te_word_pos_t = [nltk.pos_tag(words) for words in Te_word_list]

# Keep only the tag of each pair. Plain iteration replaces np.shape(), which
# has to convert the ragged nested list to an array and fails on NumPy
# versions that reject inhomogeneous shapes.
Te_word_pos = [[tag for _word, tag in tagged] for tagged in Te_word_pos_t]


In [0]:
# Vocabulary over train + test tokens ('<pad>' and '<unkown>' come first).
# The same call is repeated in the "Build Dictionary" section.
word_dict = makeWordList([Tr_word_list, Te_word_list])
print ("word dictonary length", len(word_dict))

## SDP Features

In [0]:
# Binary indicator per token: 1 if the token text occurs on the sentence's
# shortest dependency path, else 0. Sentences and paths are paired by
# position. Plain iteration replaces np.shape(), which has to convert the
# ragged token lists to an array and fails on NumPy versions that reject
# inhomogeneous shapes.
Tr_sent_contents_SDP_expand = [
    [1 if word in sdp else 0 for word in words]
    for words, sdp in zip(Tr_word_list, Tr_sent_contents_SDP)
]

Te_sent_contents_SDP_expand = [
    [1 if word in sdp else 0 for word in words]
    for words, sdp in zip(Te_word_list, Te_sent_contents_SDP)
]
  


## Build Dictionary

In [0]:
# Token count of every train / test sentence.
train_sent_lengths,test_sent_lengths = findSentLengths([Tr_word_list,Te_word_list])
# Longest sentence over both splits (the two length lists are concatenated);
# used as the padded sequence length and the model's input length.
sentMax = max(train_sent_lengths  + test_sent_lengths)
print ("max sent length", sentMax)

In [0]:
# Display the longest test sentence length for inspection.
max(test_sent_lengths)

In [0]:
# sentMax=105

In [0]:
# Convert the per-sentence length lists to int32 arrays (rebinds the names).
train_sent_lengths = np.array(train_sent_lengths, dtype='int32')
test_sent_lengths = np.array(test_sent_lengths, dtype='int32')

In [0]:
# Relation label -> class id. With five entries mapLabelToId takes its
# dictionary-lookup branch (len(label_dict) > 2).
label_dict = {'false':0, 'advise': 1, 'mechanism': 2, 'effect': 3, 'int': 4}

In [0]:
# Vocabularies built over train + test together.
word_dict = makeWordList([Tr_word_list, Te_word_list])
# NOTE(review): makeDistanceList reserves no padding entry, so id 0 in the
# three dictionaries below is a real symbol, while paddData pads with 0 —
# confirm this collision is intended.
d1_dict = makeDistanceList([Tr_d1_list,  Te_d1_list,])
d2_dict = makeDistanceList([Tr_d2_list,  Te_d2_list])
pos_dict=makeDistanceList([Tr_word_pos,Te_word_pos])

print ("word dictonary length", len(word_dict))

## Build Word Vector

In [0]:
!pip install gensim
from gensim.models import KeyedVectors
# Pretrained word2vec binary stored on the mounted Google Drive.
# NOTE(review): the path is relative — presumably resolved against /content,
# where the drive is mounted; confirm the working directory.
filename = r'drive/My Drive/Colab Notebooks/embeddings/wikipedia-pubmed-and-PMC-w2v.bin'
WV_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [0]:
# # # Word Embedding
# One row per word_dict entry: pretrained vector when available, random
# vector otherwise, zeros for '<pad>' (see readWordEmb).
wv = readWordEmb(word_dict, WV_model, wv_embSize)

## Mapping

In [0]:
# Mapping Train
# Replace tokens, distance strings and POS tags by their dictionary indices.
W_train =   mapWordToId(Tr_word_list, word_dict)
d1_train = mapWordToId(Tr_d1_list, d1_dict)
d2_train = mapWordToId(Tr_d2_list, d2_dict)
pos_train=mapWordToId(Tr_word_pos, pos_dict)

#One Hot Encoding
# Y_t holds integer class ids; Y_train gets a 1.0 in the column of each id.
Y_t = mapLabelToId(Tr_sent_lables, label_dict)
Y_train = np.zeros((len(Y_t), len(label_dict)))
for i in range(len(Y_t)):
	Y_train[i][Y_t[i]] = 1.0

In [0]:
# Mapping Test
# (The leading '!' used to turn this line into a shell escape rather than a
# comment.)
# Replace tokens, distance strings and POS tags by their dictionary indices.
W_test =   mapWordToId(Te_word_list, word_dict)
d1_test = mapWordToId(Te_d1_list, d1_dict)
d2_test = mapWordToId(Te_d2_list, d2_dict)
pos_test=mapWordToId(Te_word_pos, pos_dict)


# One-hot encoding: Y_t holds integer class ids; Y_test gets a 1.0 in the
# column of each id.
Y_t = mapLabelToId(Te_sent_lables, label_dict)
Y_test = np.zeros((len(Y_t), len(label_dict)))
for i in range(len(Y_t)):
    Y_test[i][Y_t[i]] = 1.0

## Padding

In [0]:
# Two single-row id sequences of length sentMax, all zeros except position 0.
# The similarity attention layers feed them to embedding_model.predict and
# keep output [0][0], i.e. the embedding of the id stored at position 0.
# NOTE(review): presumably 4 and 8 are the word_dict indices of 'druga' and
# 'drugb' — TODO confirm against word_dict; the values are hard-coded.
da=np.zeros(sentMax)
db=np.zeros(sentMax)
da[0]=4
db[0]=8
da=np.reshape(da,(1,sentMax))
db=np.reshape(db,(1,sentMax))

In [0]:

#padding
# Pad/truncate every feature matrix to sentMax columns (post-padding with 0).
# T_train / T_test were dropped from this call: no earlier cell defines them,
# so the cell raised NameError on a fresh kernel.
(W_train, d1_train, d2_train, pos_train, Tr_sent_contents_SDP_expand,
 W_test, d1_test, d2_test, pos_test, Te_sent_contents_SDP_expand) = paddData(
    [W_train, d1_train, d2_train, pos_train, Tr_sent_contents_SDP_expand,
     W_test, d1_test, d2_test, pos_test, Te_sent_contents_SDP_expand],
    sentMax)

print ("train", len(W_train))
print ("test", len(W_test))

In [0]:
#vocabulary size
# Number of rows for each embedding table in the model.
word_dict_size = len(wv)
d1_dict_size = len(d1_dict)
d2_dict_size = len(d2_dict)
pos_dict_size = len(pos_dict)


# Model

## Functions

### F1

In [0]:
import tensorflow as tf
import keras.backend as K

def f1(y_true, y_pred):
    """Keras metric: mean of the per-column F1 scores of a batch.

    Predictions are rounded to 0/1, then true positives, false positives and
    false negatives are summed over axis 0. Precision, recall and F1 are
    computed per column with ``K.epsilon()`` added to each denominator, NaN
    entries are replaced by 0, and the mean over columns is returned.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted-score tensor of the same shape.

    Returns:
        Scalar tensor holding the averaged F1 score.
    """
    eps = K.epsilon()
    y_hat = K.round(y_pred)

    true_pos = K.sum(K.cast(y_true * y_hat, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * y_hat, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - y_hat), 'float'), axis=0)

    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)

    score = 2 * precision * recall / (precision + recall + eps)
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

### AttentionWitPositionAndSimilarity_v2

In [0]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer





class AttentionWitPositionAndSimilarity_v2(Layer):
    """Attention pooling driven by hidden states, positions and entity similarity.

    The layer is called on a list of three 3D tensors ``[h, p1, p2]``:
    ``h`` is the sequence to pool and ``p1`` / ``p2`` are per-timestep
    position embeddings. The attention energy for each timestep is
    ``tanh(h.W + softmax((p1.WP1 + p2.WP2 + sim.WS) / 3) + b) . v``, where
    ``sim`` is the average of two scaled dot products between the global
    word-embedding tensor and two entity vectors. The energies are
    exponentiated, optionally masked, normalised over time and used to take a
    weighted sum of ``h``.

    NOTE(review): ``call`` reads the module-level names ``embedding_model``,
    ``da``, ``db``, ``x1``, ``wv_embSize`` and ``keras``; all of them must
    exist before the layer is applied. The entity vectors come from
    ``embedding_model.predict`` at graph-construction time, so they are a
    snapshot of the embedding weights at that moment — confirm this is
    intended.

    Args:
        W_regularizer: regularizer for every weight matrix (default L2 1e-4).
        b_regularizer: regularizer for the bias.
        W_constraint: constraint for every weight matrix.
        b_constraint: constraint for the bias.
        bias: whether to add a ``(steps, features)`` bias to the energies.
        return_attention: if True, return ``[pooled, weights]`` instead of
            only ``pooled``.

    Output shape:
        ``(samples, features)``, plus ``(samples, steps)`` for the weights
        when ``return_attention`` is True.
    """

    def __init__(self,
                 W_regularizer=regularizers.l2(0.0001), b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=True,
                 **kwargs):

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWitPositionAndSimilarity_v2, self).__init__(**kwargs)

    def build(self, input_shape):
        # One shape per input tensor: [h, p1, p2].
        assert len(input_shape) == 3

        steps = input_shape[0][1]
        feat_dim = input_shape[0][-1]

        def new_weight(suffix, shape):
            # Keyword arguments: passing the shape positionally only works
            # through Keras' legacy add_weight compatibility shim.
            return self.add_weight(shape=shape,
                                   initializer=self.init,
                                   name='{}_{}'.format(self.name, suffix),
                                   regularizer=self.W_regularizer,
                                   constraint=self.W_constraint)

        self.W = new_weight('W', (feat_dim, feat_dim))
        # Position projections take their input width from the position
        # tensors instead of a hard-coded 10.
        self.WP1 = new_weight('WP1', (input_shape[1][-1], feat_dim))
        self.WP2 = new_weight('WP2', (input_shape[2][-1], feat_dim))
        self.WS = new_weight('WS', (1, feat_dim))
        self.v = new_weight('v', (feat_dim, 1))

        if self.bias:
            self.b = self.add_weight(shape=(steps, feat_dim),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        hidden, pos1, pos2 = x[0], x[1], x[2]

        # Entity vectors: embedding at position 0 of the probe inputs.
        d1_emb = embedding_model.predict([da])[0][0]
        d2_emb = embedding_model.predict([db])[0][0]

        d1 = K.variable(d1_emb, dtype='float32', name="input_d1")
        d1 = K.expand_dims(d1, axis=-1)
        d1_sco = K.dot(x1, d1) / wv_embSize

        d2 = K.variable(d2_emb, dtype='float32', name="input_d2")
        d2 = K.expand_dims(d2, axis=-1)
        d2_sco = K.dot(x1, d2) / wv_embSize

        # Mean similarity of every timestep to the two entities.
        avg = (d1_sco + d2_sco) / 2

        ew = K.dot(hidden, self.W)
        ewp1 = K.dot(pos1, self.WP1)
        ewp2 = K.dot(pos2, self.WP2)
        ews = K.dot(avg, self.WS)

        avg_all = (ewp1 + ewp2 + ews) / 3
        avg_all = keras.activations.softmax(avg_all, axis=1)
        eij = ew + avg_all
        if self.bias:
            # self.b is None when bias=False; adding it would raise.
            eij += self.b

        eij = K.tanh(eij)
        eij = K.dot(eij, self.v)
        eij = K.squeeze(eij, axis=-1)

        a = K.exp(eij)

        # With several inputs Keras hands over one mask per input; only the
        # mask of the pooled sequence applies to the attention weights.
        if isinstance(mask, (list, tuple)):
            mask = mask[0]

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # epsilon guards against a zero sum
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = hidden * K.expand_dims(a)
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0][0], input_shape[0][-1]),
                    (input_shape[0][0], input_shape[0][1])]
        else:
            return input_shape[0][0], input_shape[0][-1]

### similarityAttention

In [0]:
##no softmax
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer





class similarityAttention(Layer):
    """Attention pooling driven by hidden states and entity similarity.

    The layer is called on a single 3D tensor ``h`` of shape
    ``(samples, steps, features)``. The attention energy for each timestep is
    ``tanh(h.W + softmax(sim.WS) + b) . v``, where ``sim`` is the average of
    two scaled dot products between the global word-embedding tensor and two
    entity vectors. The energies are exponentiated, optionally masked,
    normalised over time and used to take a weighted sum of ``h``.

    NOTE(review): ``call`` reads the module-level names ``embedding_model``,
    ``da``, ``db``, ``x1``, ``wv_embSize`` and ``keras``; all of them must
    exist before the layer is applied. The entity vectors come from
    ``embedding_model.predict`` at graph-construction time, so they are a
    snapshot of the embedding weights at that moment — confirm this is
    intended.

    Args:
        W_regularizer: regularizer for every weight matrix (default L2 1e-4).
        b_regularizer: regularizer for the bias.
        W_constraint: constraint for every weight matrix.
        b_constraint: constraint for the bias.
        bias: whether to add a ``(steps, features)`` bias to the energies.
        return_attention: if True, return ``[pooled, weights]`` instead of
            only ``pooled``.

    Output shape:
        ``(samples, features)``, plus ``(samples, steps)`` for the weights
        when ``return_attention`` is True.
    """

    def __init__(self,
                 W_regularizer=regularizers.l2(0.0001), b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=True,
                 **kwargs):

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(similarityAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # A single 3D input: (samples, steps, features).
        assert len(input_shape) == 3

        steps = input_shape[1]
        feat_dim = input_shape[-1]

        def new_weight(suffix, shape):
            # Keyword arguments: passing the shape positionally only works
            # through Keras' legacy add_weight compatibility shim.
            return self.add_weight(shape=shape,
                                   initializer=self.init,
                                   name='{}_{}'.format(self.name, suffix),
                                   regularizer=self.W_regularizer,
                                   constraint=self.W_constraint)

        self.W = new_weight('W', (feat_dim, feat_dim))
        self.WS = new_weight('WS', (1, feat_dim))
        self.v = new_weight('v', (feat_dim, 1))

        if self.bias:
            self.b = self.add_weight(shape=(steps, feat_dim),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # Entity vectors: embedding at position 0 of the probe inputs.
        d1_emb = embedding_model.predict([da])[0][0]
        d2_emb = embedding_model.predict([db])[0][0]

        d1 = K.variable(d1_emb, dtype='float32', name="input_d1")
        d1 = K.expand_dims(d1, axis=-1)
        d1_sco = K.dot(x1, d1) / wv_embSize

        d2 = K.variable(d2_emb, dtype='float32', name="input_d2")
        d2 = K.expand_dims(d2, axis=-1)
        d2_sco = K.dot(x1, d2) / wv_embSize

        # Mean similarity of every timestep to the two entities.
        avg = (d1_sco + d2_sco) / 2

        ew = K.dot(x, self.W)
        ews = K.dot(avg, self.WS)

        avg_all = keras.activations.softmax(ews, axis=1)
        eij = ew + avg_all
        if self.bias:
            # self.b is None when bias=False; adding it would raise.
            eij += self.b

        eij = K.tanh(eij)
        eij = K.dot(eij, self.v)
        eij = K.squeeze(eij, axis=-1)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # epsilon guards against a zero sum
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

### TemporalMaxPooling

In [0]:
## TemporalMaxPooling
from keras import backend as K
from keras.engine import InputSpec
from keras.engine.topology import Layer
import numpy as np


class TemporalMaxPooling(Layer):
    """
    This pooling layer accepts the temporal sequence output by a recurrent layer
    and performs temporal pooling, looking at only the non-masked portion of the sequence.
    The pooling layer converts the entire variable-length hidden vector sequence
    into a single hidden vector.
    Modified from https://github.com/fchollet/keras/issues/2151 so code also
    works on tensorflow backend. Updated syntax to match Keras 2.0 spec.
    Args:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        3D tensor with shape: `(samples, steps, features)`.
        input shape: (nb_samples, nb_timesteps, nb_features)
        output shape: (nb_samples, nb_features)
    Examples:
        > x = Bidirectional(GRU(128, return_sequences=True))(x)
        > x = TemporalMaxPooling()(x)
    """
    def __init__(self, **kwargs):
        super(TemporalMaxPooling, self).__init__(**kwargs)
        self.supports_masking = True
        self.input_spec = InputSpec(ndim=3)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        """Return the max over the time axis, ignoring masked timesteps."""
        if mask is None:
            # No mask supplied: build an all-non-zero (samples, steps) mask
            # so every timestep takes part in the max.
            mask = K.sum(K.ones_like(x), axis=-1)

        # if masked, set to large negative value so we ignore it when taking max of the sequence
        # K.switch with tensorflow backend is less useful than Theano's
        # K.backend() is the public accessor (also used by dot_product in
        # this notebook); the private K._BACKEND attribute is not guaranteed.
        if K.backend() == 'tensorflow':
            mask = K.expand_dims(mask, axis=-1)
            mask = K.tile(mask, (1, 1, K.int_shape(x)[2]))
            masked_data = K.tf.where(K.equal(mask, K.zeros_like(mask)),
                K.ones_like(x)*-np.inf, x)  # if masked assume value is -inf
            return K.max(masked_data, axis=1)
        else:  # theano backend
            mask = mask.dimshuffle(0, 1, "x")
            # The Keras backend exposes K.equal; there is no K.eq.
            masked_data = K.switch(K.equal(mask, 0), -np.inf, x)
            return masked_data.max(axis=1)

    def compute_mask(self, input, mask):
        # do not pass the mask to the next layers
        return None

### Attentive Pooling

In [0]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer
def dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input tensor with a weight tensor.

    On TensorFlow the kernel is expanded by one trailing axis before the dot
    product and that axis is squeezed out of the result; on any other backend
    ``K.dot`` is applied directly.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The dot product of ``x`` and ``kernel``.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo: check that this is correct
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)


class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`. When
        `return_attention=True` (the default) a list is returned instead:
        that tensor plus the attention weights of shape `(samples, steps)`.
    # Arguments
        W_regularizer, b_regularizer: regularizers for the weight vector
            and the bias.
        W_constraint, b_constraint: constraints for the weight vector and
            the bias.
        bias: whether to add a per-timestep bias to the energies.
        return_attention: whether to also return the attention weights.
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 1.x
    Example:

        # 1
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(return_attention=False))
        # next add a Dense layer (for classification/regression) or whatever...
        # 2 - Get the attention scores
        hidden = LSTM(64, return_sequences=True)(words)
        sentence, word_scores = Attention(return_attention=True)(hidden)
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=True,
                 **kwargs):
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        # Keyword arguments: passing the shape positionally only works
        # through Keras' legacy add_weight compatibility shim.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

### AttentionWitPosition

In [0]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer


def dot_product(x, kernel):
    """
    Backend-agnostic dot product of an input tensor with a weight tensor.

    NOTE: the "Attentive Pooling" cell of this notebook defines a function
    with the same name and the same logic; this definition rebinds the name.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The dot product of ``x`` and ``kernel``.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo: check that this is correct
    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)


class AttentionWitPosition(Layer):
    """Attention pooling driven by hidden states and two position embeddings.

    The layer is called on a list of three 3D tensors ``[h, p1, p2]``:
    ``h`` is the sequence to pool and ``p1`` / ``p2`` are per-timestep
    position embeddings. The attention energy for each timestep is
    ``tanh(h.W + p1.WP1 + p2.WP2 + b) . v``. The energies are exponentiated,
    optionally masked, normalised over time and used to take a weighted sum
    of ``h``.

    Args:
        W_regularizer: regularizer for every weight matrix (default L2 1e-4).
        b_regularizer: regularizer for the bias.
        W_constraint: constraint for every weight matrix.
        b_constraint: constraint for the bias.
        bias: whether to add a ``(steps, features)`` bias to the energies.
        return_attention: if True, return ``[pooled, weights]`` instead of
            only ``pooled`` (default False).

    Output shape:
        ``(samples, features)``, plus ``(samples, steps)`` for the weights
        when ``return_attention`` is True.
    """

    def __init__(self,
                 W_regularizer=regularizers.l2(0.0001), b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWitPosition, self).__init__(**kwargs)

    def build(self, input_shape):
        # One shape per input tensor: [h, p1, p2].
        assert len(input_shape) == 3

        steps = input_shape[0][1]
        feat_dim = input_shape[0][-1]

        def new_weight(suffix, shape):
            # Keyword arguments: passing the shape positionally only works
            # through Keras' legacy add_weight compatibility shim.
            return self.add_weight(shape=shape,
                                   initializer=self.init,
                                   name='{}_{}'.format(self.name, suffix),
                                   regularizer=self.W_regularizer,
                                   constraint=self.W_constraint)

        self.W = new_weight('W', (feat_dim, feat_dim))
        # Position projections take their input width from the position
        # tensors instead of a hard-coded 10.
        self.WP1 = new_weight('WP1', (input_shape[1][-1], feat_dim))
        self.WP2 = new_weight('WP2', (input_shape[2][-1], feat_dim))
        self.v = new_weight('v', (feat_dim, 1))

        if self.bias:
            self.b = self.add_weight(shape=(steps, feat_dim),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        hidden, pos1, pos2 = x[0], x[1], x[2]

        ew = K.dot(hidden, self.W)
        ewp1 = K.dot(pos1, self.WP1)
        ewp2 = K.dot(pos2, self.WP2)
        eij = ew + ewp1 + ewp2
        if self.bias:
            # self.b is None when bias=False; adding it would raise.
            eij += self.b

        eij = K.tanh(eij)
        eij = K.dot(eij, self.v)
        eij = K.squeeze(eij, axis=-1)

        a = K.exp(eij)

        # With several inputs Keras hands over one mask per input; only the
        # mask of the pooled sequence applies to the attention weights.
        if isinstance(mask, (list, tuple)):
            mask = mask[0]

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = hidden * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0][0], input_shape[0][-1]),
                    (input_shape[0][0], input_shape[0][1])]
        else:
            return input_shape[0][0], input_shape[0][-1]

## Model

In [0]:
from keras.models import Sequential 
from keras.layers import *
from keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.models import Model
from keras import regularizers
import keras
from keras.callbacks import ModelCheckpoint
!pip install tensorboardcolab
from tensorboardcolab import *
import tensorflow_hub as hub
import tensorflow as tf

from keras import initializers, regularizers, constraints



#### Multi

##### Define Model

In [0]:
# Main model: five sequence inputs -> embeddings -> BiLSTM -> attention -> dense classifier.
# Statement order matters here (layers are created in this order), so keep it as is.
K.clear_session()

# Five inputs, each a vector of length sentMax per sample.
# NOTE(review): judging by the names, these are word ids, the two distance
# features, POS tags and a shortest-dependency-path feature -- confirm.
input1=Input(shape=(sentMax,),name='text')
input2=Input(shape=(sentMax,),name='d1')
input3=Input(shape=(sentMax,),name='d2')
input4=Input(shape=(sentMax,),name='pos')
input5=Input(shape=(sentMax,),name='SDP_main')

# Give input5 a trailing axis, (sentMax,) -> (sentMax, 1), so it can be
# concatenated with the embeddings along the last axis below.
input5_r=Reshape((sentMax,1))(input5)

# Word embedding: one row per row of `wv`, 200 dims, initialised from `wv` and trainable.
x1=Embedding(np.shape(wv)[0], 200, weights=[wv], input_length=sentMax,trainable=True)(input1)
# Side model that maps input1 to its embedding output only.
embedding_model=Model(inputs=[input1], outputs=x1)
# Trainable embeddings for d1 / d2 (10 dims each) and pos (4 dims).
x2=Embedding(d1_dict_size, 10,trainable=True,input_length=sentMax)(input2)
x3=Embedding(d2_dict_size, 10,trainable=True,input_length=sentMax)(input3)
x4=Embedding(pos_dict_size, 4,trainable=True,input_length=sentMax)(input4)
# x5=Dense((sentMax,1))(input5)
# Stack all per-position features on the last axis, then normalise and apply
# spatial dropout (rate 0.4).
inputs=concatenate([x1,x2,x3,x4,input5_r],axis=-1,name="concat")
inputs=BatchNormalization()(inputs)
inputs=SpatialDropout1D(0.4)(inputs)
# Bidirectional CuDNNLSTM, 200 units per direction, returning the full sequence
# with both directions concatenated.
model_h1=Bidirectional(CuDNNLSTM(200,return_sequences=True),merge_mode='concat')(inputs)

# Attention over the BiLSTM output, also fed the d1 and d2 embeddings. The layer
# is built with its default arguments and unpacked into two outputs; `a` is not
# used again in this cell.
att,a=AttentionWitPositionAndSimilarity_v2()([model_h1,x2,x3])

att=BatchNormalization()(att)
main_output=Dropout(0.5, noise_shape=None, seed=None)(att)   

# Dense head: 300 -> 100 units, each with L2(0.001), BatchNorm and LeakyReLU;
# dropout (0.5) is applied before each of the two Dense layers.
main_output=Dense(300,kernel_regularizer=regularizers.l2(0.001))(main_output)            
main_output=BatchNormalization()(main_output)
main_output=LeakyReLU()(main_output)

main_output=Dropout(0.5, noise_shape=None, seed=None)(main_output)  
main_output = Dense(100,kernel_regularizer=regularizers.l2(0.001))(main_output)
main_output=BatchNormalization()(main_output)
main_output=LeakyReLU()(main_output)

# Final 5-way softmax layer with L2(0.0001).
main_output = Dense(5,activation='softmax',kernel_regularizer=regularizers.l2(0.0001))(main_output)

# ,input1_h1,input2_h1,input3_h1,input4_h1,input_sdp
output_model = Model(inputs=[input1,input2,input3,input4,input5], outputs=main_output)

# SGD with Nesterov momentum 0.9 and learning rate 0.01; the Adam line below is
# a disabled alternative.
sgd = keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=True)
# adam=Adam(lr=0.001,  beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, amsgrad=False)
# `f1` is not defined in this cell; it must already exist in the notebook namespace.
output_model.compile(loss = 'categorical_crossentropy', optimizer=sgd,metrics=[f1])
##78.3
# NOTE(review): the 78.3 above is presumably a score reached with this setup -- confirm.

In [0]:
# Print the layer-by-layer summary of output_model.
output_model.summary()

In [0]:
from IPython.display import Image
from keras.utils.vis_utils import plot_model as plot

# Write the model graph (with shapes) to ./model.png, then show that file as
# the cell output.
plot(output_model, to_file='./model.png', show_shapes=True)
Image('./model.png')

##### Prepare to fit

In [0]:
from sklearn.utils import class_weight

# Class ids of the training samples: argmax over the last axis of Y_train
# (assumes Y_train is one-hot encoded -- confirm). The former Y_train.copy()
# was overwritten immediately and has been dropped.
yt = np.argmax(Y_train, axis=-1)

# 'balanced' weights, one per class found in the training labels. `classes`
# and `y` are passed by keyword because recent scikit-learn releases reject
# them as positional arguments, while older releases accept the keywords too.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(yt),
                                                  y=yt)
print(class_weights)

In [0]:
# Convert the train and test `*_key_word_vec` containers to NumPy arrays.
tr_key_word_vec = np.asarray(tr_key_word_vec)
te_key_word_vec = np.asarray(te_key_word_vec)


In [0]:
# Two checkpoint callbacks that share every option and differ only in the file
# name pattern. With save_best_only=False a file is written for every epoch.
checkpoint_options = dict(save_best_only=False, monitor='val_loss', mode='min')
mcp_save = ModelCheckpoint('mdl_wts_m_{epoch:02d}.hdf5', **checkpoint_options)
mcp_save2 = ModelCheckpoint('mdl_wts_m2_{epoch:02d}.hdf5', **checkpoint_options)

##### Fit

In [0]:
 # Keras documents `class_weight` as a dict mapping class index -> weight; a
 # bare array is not a supported form (some Keras versions ignore it without
 # any warning). Pair each weight with the label it was computed for, using
 # the same label set that compute_class_weight received.
 train_labels = np.unique(np.argmax(Y_train, axis=-1))
 class_weight_by_label = {int(label): float(weight)
                          for label, weight in zip(train_labels, class_weights)}
 # NOTE(review): validation_data is the test split, and the per-epoch
 # checkpoints are later scored on that same split -- confirm this is intended.
 output_model.fit([W_train,d1_train,d2_train,pos_train,Tr_sent_contents_SDP_expand], Y_train,
                  epochs = 300, shuffle=True, batch_size=200, verbose = 1,
                  validation_data=([W_test, d1_test,d2_test,pos_test,Te_sent_contents_SDP_expand],Y_test),
                  callbacks=[mcp_save2],class_weight=class_weight_by_label)


##### Test

In [0]:
from sklearn import metrics

# True class ids are identical for every checkpoint, so compute them once.
yt = np.argmax(Y_test, axis=-1)

# Score every per-epoch checkpoint written through mcp_save2 (epochs 1..300).
# '{:02d}' reproduces the original file names: zero-padded below 10 and the
# plain number from 10 upwards, so one loop replaces the two former copies.
for i in range(1, 301):
  output_model.load_weights(filepath='mdl_wts_m2_{:02d}.hdf5'.format(i))
  y_prob_m = output_model.predict([W_test, d1_test, d2_test, pos_test, Te_sent_contents_SDP_expand])
  y_prob_m = np.argmax(y_prob_m, axis=-1)
  # Not printed here; kept so `matrix` is still bound after the loop as before.
  matrix = metrics.confusion_matrix(yt, y_prob_m)
  # Micro-averaged F1 restricted to labels 1-4 (label 0 is left out). `labels`
  # is passed by keyword, assuming f1_score is scikit-learn's, whose recent
  # releases no longer accept it positionally.
  print(str(i) + ': ' + str(f1_score(yt, y_prob_m, labels=[1, 2, 3, 4], average='micro')))


In [0]:
#load the best model
# Imported here so this cell does not depend on an earlier cell's import.
from sklearn import metrics

# Epoch whose checkpoint is treated as the best one; set it after inspecting
# the per-epoch scores.
best_epoch = 1
# Checkpoints are saved as 'mdl_wts_m2_{epoch:02d}.hdf5', so single-digit epochs
# are zero-padded ('..._01.hdf5'); the unpadded '..._1.hdf5' is never written.
output_model.load_weights(filepath='mdl_wts_m2_{:02d}.hdf5'.format(best_epoch))

# Predicted and true class ids on the test inputs.
y_prob_m = output_model.predict([W_test, d1_test, d2_test, pos_test, Te_sent_contents_SDP_expand])
y_prob_m = np.argmax(y_prob_m, axis=-1)
yt = np.argmax(Y_test, axis=-1)
matrix = metrics.confusion_matrix(yt, y_prob_m)


In [0]:
# Show the confusion matrix most recently stored in `matrix`.
matrix

In [0]:
# Micro-averaged F1 over labels 1-4, followed by micro precision and recall.
# `labels` is passed by keyword, assuming f1_score is scikit-learn's, whose
# recent releases no longer accept it positionally.
# NOTE(review): precision and recall are not restricted to labels 1-4, so they
# cover a different label set than the F1 value -- confirm this is intended.
(
    f1_score(yt, y_prob_m, labels=[1, 2, 3, 4], average='micro'),
    precision_score(yt, y_prob_m, average='micro'),
    recall_score(yt, y_prob_m, average='micro'),
)

In [0]:
# Per-class (average=None) F1, precision and recall, shown as one tuple.
(
    f1_score(yt, y_prob_m, average=None),
    precision_score(yt, y_prob_m, average=None),
    recall_score(yt, y_prob_m, average=None),
)