In [None]:
import os
# if you want to only reserve memory on a single GPU
#os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import tensorflow as tf
import time
import numpy as np
import sys
import json

from tensorflow.keras.layers import Layer, Conv1D, Input, GlobalMaxPooling1D, Multiply, Lambda, Embedding, Dense, Dropout, Activation
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import plot_model

from sklearn.model_selection import train_test_split

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
############################################################################
# Data source: http://people.csail.mit.edu/taolei/beer/
# The files are expected in the subfolder "data".
# `aspect` selects which aspect's review files (and score column) are used.
aspect = 1
_review_file_pattern = "data/reviews.aspect{}.{}.txt"
input_path_train = _review_file_pattern.format(aspect, "train")
input_path_validation = _review_file_pattern.format(aspect, "heldout")
############################################################################

In [None]:
def _read_reviews(path, aspect_index):
    """Parse a review file into token lists and target scores.

    Each line is split at the first tab: the left part holds whitespace-
    separated float scores, the right part the review text.

    Args:
        path: path of the review file to read.
        aspect_index: index into the score list selecting the target value.

    Returns:
        (token_lists, targets): one list of string tokens and one float
        target per line of the file.
    """
    token_lists = []
    targets = []
    with open(path) as fin:
        for line in fin:
            scores, _, text = line.partition("\t")
            # Tokens are split on single spaces exactly as before; note that
            # the last token of a line therefore keeps its trailing newline.
            token_lists.append(text.split(" "))
            targets.append([float(v) for v in scores.split()][aspect_index])
    return token_lists, targets


train_token_lists, Y_train_list = _read_reviews(input_path_train, aspect)

# the dictionary mapping words to their IDs (IDs 0-2 are reserved below)
word_to_id = dict()
token_id_counter = 3
for token_list in train_token_lists:
    for token in token_list:
        if token not in word_to_id:
            word_to_id[token] = token_id_counter
            token_id_counter = token_id_counter + 1

word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# every training token is in the vocabulary by construction
X_train_list = [[word_to_id[token] for token in token_list]
                for token_list in train_token_lists]
Y_train = np.asarray(Y_train_list)

print('Pad sequences (samples x time)')
# The ragged list of ID lists is handed to pad_sequences directly: wrapping it
# in np.asarray first raises ValueError on NumPy >= 1.24 (inhomogeneous shape).
X_train = sequence.pad_sequences(X_train_list, maxlen=350)

print("Loading heldout data...")
val_token_lists, Y_val_list = _read_reviews(input_path_validation, aspect)
# tokens unseen during training are mapped to the <UNK> id (2)
X_val_list = [[word_to_id.get(token, 2) for token in token_list]
              for token_list in val_token_lists]
Y_val_both = np.asarray(Y_val_list)

print('Pad sequences (samples x time)')
X_val_both = sequence.pad_sequences(X_val_list, maxlen=350)

print(X_train.shape)

In [None]:
# Hyper-parameters used by the cells below.

# --- input / vocabulary ---
max_features = token_id_counter + 1  # passed as vocabulary size to the Embedding layers
maxlen = 350                         # sequence length of the model input
select_k = 10                        # Number of selected words by the methods

# --- network sizes ---
embedding_dims = 200
filters = 250
kernel_size = 3
hidden_dims = 250

# --- optimisation ---
batch_size = 40
epochs = 20

In [None]:
# This cell loads the pre-trained word embeddings from the external data file.
# Each non-empty line is "<word> <v1> <v2> ...".
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open("data/review+wiki.filtered.200.txt") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# Row i holds the vector of the word with ID i; rows of words that are not
# found in the embedding index stay all-zeros.
embedding_matrix = np.zeros((len(word_to_id) + 1, 200))
for word, i in word_to_id.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Smallest positive normal float32; used as a lower bound so that log() of the
# bounded quantities below stays finite.
EPSILON = np.finfo(tf.float32.as_numpy_dtype).tiny


def gumbel_keys(w):
    """Return ``w`` perturbed with random noise of the same shape.

    The noise is ``log(-log(u))`` with ``u`` uniform on [EPSILON, 1).
    NOTE(review): standard Gumbel(0, 1) noise is ``-log(-log(u))``; the sign
    used here is kept unchanged -- confirm it is intended.
    """
    u = tf.random.uniform(tf.shape(w), minval=EPSILON, maxval=1.0)
    noise = tf.math.log(-tf.math.log(u))
    return w + noise

In [None]:
def continuous_topk(w, k, t, separate=False):
    """Build ``k`` successive softmax distributions over the last axis of ``w``.

    Before each step the keys are lowered by ``log(max(1 - previous, EPSILON))``
    where ``previous`` is the softmax produced by the preceding step (zeros
    for the first step).

    Args:
        w: tensor of keys.
        k: number of softmax steps.
        t: softmax temperature.
        separate: if True return the list of the ``k`` softmax tensors,
            otherwise their sum.
    """
    relaxed_steps = []
    previous = tf.zeros_like(w, dtype=tf.float32)
    for _ in range(k):
        remaining = tf.maximum(1.0 - previous, EPSILON)
        w += tf.math.log(remaining)
        previous = tf.nn.softmax(w / t, axis=-1)
        relaxed_steps.append(previous)
    if not separate:
        return tf.reduce_sum(relaxed_steps, 0)
    return relaxed_steps

In [None]:
def sample_subset(w, k, t=0.1):
    '''
    Perturb the weights with `gumbel_keys` and relax the top-k selection
    with `continuous_topk`.

    Args:
        w (Tensor): Float Tensor of weights for each element. In gumbel mode
            these are interpreted as log probabilities
        k (int): number of elements in the subset sample
        t (float): temperature of the softmax
    Returns:
        The summed output of `continuous_topk` on the perturbed weights.
    '''
    return continuous_topk(gumbel_keys(w), k, t)

In [None]:
class SampleSubset(Layer):
    """
    Layer for continuous approx of subset sampling.

    In the training phase the layer outputs the relaxed sample produced by
    `sample_subset`; otherwise it outputs a hard 0/1 mask marking the entries
    whose logit is at least the k-th largest logit of the example.

    Args:
        tau0: softmax temperature passed to `sample_subset`.
        k: number of elements to select.
    """
    def __init__(self, tau0, k, **kwargs):
        self.tau0 = tau0
        self.k = k
        super(SampleSubset, self).__init__(**kwargs)

    def call(self, logits):
        # logits: [BATCH_SIZE, d, 1]
        logits = tf.squeeze(logits, 2)
        samples = sample_subset(logits, self.k, self.tau0)

        # Explanation Stage output: threshold at the k-th largest logit.
        threshold = tf.expand_dims(tf.nn.top_k(logits, self.k, sorted = True)[0][:,-1], -1)
        discrete_logits = tf.cast(tf.greater_equal(logits,threshold),tf.float32)
        output = K.in_train_phase(samples, discrete_logits)
        return tf.expand_dims(output,-1)

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        # tau0 and k are required constructor arguments, so they have to be
        # part of the config for the layer to be re-creatable from it.
        cfg = super().get_config()
        cfg.update({"tau0": self.tau0, "k": self.k})
        return cfg

In [None]:
def subset_precision(modelTestInput, annotations_path="data/annotations.json", max_length=350):
    """Fraction of selected words that fall inside the annotated ranges.

    Args:
        modelTestInput: model whose ``predict`` maps padded token IDs of shape
            (n, max_length) to a selection mask of shape (n, max_length, 1).
        annotations_path: file with one JSON object per line; key ``'x'``
            holds the token list and key ``str(aspect)`` the list of
            ``[start, end)`` token ranges annotated for the aspect.
        max_length: padded sequence length fed to the model.

    Returns:
        correct selections / counted selections. Every review contributes at
        least ``select_k`` to the denominator.

    Raises:
        ValueError: if the annotation file contains no reviews.
    """
    with open(annotations_path) as fin:
        annotated = [json.loads(line) for line in fin if line.strip()]
    if not annotated:
        raise ValueError("no annotated reviews found in " + annotations_path)

    # NOTE(review): out-of-vocabulary tokens are mapped to the <PAD> id (0)
    # here, whereas the held-out loader maps them to <UNK> (2). Kept as-is
    # because it changes which selections are counted.
    id_lists = [[word_to_id.get(token, 0) for token in item['x']] for item in annotated]
    X_annotated = sequence.pad_sequences(id_lists, maxlen=max_length)

    # One batched forward pass instead of one predict() call per review.
    masks = np.squeeze(np.asarray(modelTestInput.predict(X_annotated)), -1)

    selected_word_counter = 0
    correct_selected_counter = 0

    for item, ids, mask in zip(annotated, X_annotated, masks):
        ranges = item[str(aspect)]  # the aspect id
        review_length = len(item['x'])

        # pad_sequences pads AND truncates at the front by default, so the
        # review occupies the last `kept` slots and, for long reviews, the
        # first `dropped` tokens are missing. Positions must be shifted by
        # `dropped` to line up with the annotation ranges.
        kept = min(review_length, max_length)
        dropped = review_length - kept
        selected_ids = (mask * ids)[max_length - kept:]

        selected_nonpadding_word_counter = 0
        for position, value in enumerate(selected_ids, start=dropped):
            # a zero means "not selected" or "<PAD>"; we are nice to the L2X
            # approach by only considering selected non-pad tokens
            if value != 0:
                selected_nonpadding_word_counter = selected_nonpadding_word_counter + 1
                for r in ranges:
                    if r[0] <= position < r[1]:
                        correct_selected_counter = correct_selected_counter + 1
        # we make sure that we count at least select_k words per review;
        # if more than select_k non-padding words are selected, all are counted
        selected_word_counter = selected_word_counter + max(selected_nonpadding_word_counter, select_k)

    return correct_selected_counter / selected_word_counter

In [None]:
###########################################
####################L2X####################
###########################################
# Define various Keras layers.
def _sum_over_axis1_div_k(x):
    """Sum ``x`` over axis 1 and divide by ``select_k``."""
    return K.sum(x, axis = 1) / float(select_k)


def _mean_output_shape(shape):
    """Output shape of ``Mean``: entries 0 and 2 of the input shape."""
    return [shape[0], shape[2]]


Mean = Lambda(_sum_over_axis1_div_k, output_shape=_mean_output_shape)

class Concatenate(Layer):
    """
    Layer for concatenation.

    Takes ``[vector, sequence]``: the vector is repeated along axis 1 of the
    sequence tensor and concatenated to it on the last axis (vector first).
    """
    def __init__(self, **kwargs):
        super(Concatenate, self).__init__(**kwargs)

    def call(self, inputs):
        per_example, per_position = inputs
        n_positions = int(per_position.get_shape()[1])
        # [batchsize, input1_dim] -> [batchsize, 1, input1_dim] -> tiled over positions
        tiled = tf.tile(tf.expand_dims(per_example, axis = -2), [1, n_positions, 1])
        return tf.concat([tiled, per_position], axis = -1)

    def compute_output_shape(self, input_shapes):
        vector_shape, sequence_shape = input_shapes
        out_shape = list(sequence_shape)
        out_shape[-1] = int(out_shape[-1]) + int(vector_shape[-1])
        out_shape[-2] = int(out_shape[-2])
        return tuple(out_shape)

class Sample_Concrete(Layer):
    """
    Layer for sample Concrete / Gumbel-Softmax variables.

    Training phase: draws ``k`` independent Gumbel-perturbed copies of the
    logits, applies a temperature softmax to each and takes the element-wise
    maximum over the ``k`` relaxed samples.
    Otherwise: a hard 0/1 mask of the entries whose logit is at least the
    k-th largest logit of the example.

    Args:
        tau0: softmax temperature.
        k: number of elements to select.
    """
    def __init__(self, tau0, k, **kwargs):
        self.tau0 = tau0
        self.k = k
        super(Sample_Concrete, self).__init__(**kwargs)

    def call(self, logits):
        # logits: [batch_size, d, 1]
        logits_ = K.permute_dimensions(logits, (0,2,1))# [batch_size, 1, d]

        d = int(logits_.get_shape()[2])

        # Draw noise of shape [batch_size, k, d]. Previously the noise had the
        # shape of logits_ ([batch_size, 1, d]), which made the max over
        # axis 1 below a no-op and left k unused in the relaxed branch.
        noise_shape = tf.stack([tf.shape(logits_)[0], self.k, d])
        # A strictly positive lower bound keeps log(uniform) finite.
        uniform = tf.random.uniform(shape=noise_shape,
                                    minval=np.finfo(np.float32).tiny,
                                    maxval = 1.0)
        gumbel = - K.log(-K.log(uniform))
        noisy_logits = (gumbel + logits_)/self.tau0  # broadcasts to [batch_size, k, d]
        samples = K.softmax(noisy_logits)
        samples = K.max(samples, axis = 1)  # [batch_size, d]

        logits = tf.reshape(logits,[-1, d])
        threshold = tf.expand_dims(tf.nn.top_k(logits, self.k, sorted = True)[0][:,-1], -1)
        discrete_logits = tf.cast(tf.greater_equal(logits,threshold),tf.float32)

        output = K.in_train_phase(samples, discrete_logits)
        return tf.expand_dims(output,-1)

    def get_config(self):
        # tau0 and k are required constructor arguments, so they have to be
        # part of the config for the layer to be re-creatable from it.
        cfg = super().get_config()
        cfg.update({"tau0": self.tau0, "k": self.k})
        return cfg

    def compute_output_shape(self, input_shape):
        return input_shape


class IMLESubsetkLayer(tf.keras.layers.Layer):
    """
    Discrete top-k selection layer with I-MLE gradients.

    Forward (training phase): perturbs the logits with noise from
    `sample_gumbel_k` and outputs a hard 0/1 mask of the entries that reach
    the k-th largest perturbed logit. Outside the training phase the
    unperturbed logits are thresholded instead.
    Backward: the gradient w.r.t. the logits is the difference between the
    forward mask and the mask obtained from ``logits - _lambda * dy`` with the
    same noise.

    Args:
        k: number of elements to select.
        _tau: scale applied to the perturbation noise.
        _lambda: magnitude of the gradient-based perturbation of the logits.
        **kwargs: standard Keras ``Layer`` arguments (e.g. ``name``).
    """

    def __init__(self, k, _tau=1.0, _lambda=10.0, **kwargs):
        # **kwargs lets callers (and from_config) pass name/dtype/trainable.
        super(IMLESubsetkLayer, self).__init__(**kwargs)

        self.k = k
        self._tau = _tau
        self._lambda = _lambda
        # noise of the latest forward pass, reused in the backward pass
        self.samples = None

    def _khot_mask(self, scores):
        """0/1 float mask of entries >= the k-th largest score of each row."""
        threshold = tf.expand_dims(tf.nn.top_k(scores, self.k, sorted=True)[0][:,-1], -1)
        return tf.cast(tf.greater_equal(scores, threshold), tf.float32)

    def sample_gumbel(self, shape, eps=1e-20):
        """Noise ``-log(-log(U + eps) + eps)`` with U uniform on [0, 1)."""
        U = tf.random.uniform(shape, minval=0, maxval=1)
        return -tf.math.log(-tf.math.log(U + eps) + eps)

    def sample_discrete(self, logits):
        """Top-k mask of the logits perturbed with `sample_gumbel` noise."""
        gumbel_softmax_sample = logits + self.sample_gumbel(tf.shape(logits))
        return self._khot_mask(gumbel_softmax_sample)

    @tf.function
    def sample_gumbel_k(self, shape):
        """Perturbation noise built from a sum of 10 gamma samples.

        For t = 1..10 a Gamma(alpha=1/k, beta=t/k) sample of the given shape
        is drawn; the samples are summed, log(10) is subtracted and the
        result is scaled by ``_tau / k``.
        """
        s = tf.map_fn(fn=lambda t: tf.random.gamma(shape, 1.0/self.k, t/self.k),
                  elems=tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]))
        # now add the samples
        s = tf.reduce_sum(s, 0)
        # the log(m) term (m = 10 terms in the sum)
        s = s - tf.math.log(10.0)
        # divide by k and scale by the temperature
        s = self._tau * (s / self.k)

        return s

    def sample_discrete_2(self, logits):
        """Draw fresh noise, store it in ``self.samples`` and return the top-k mask."""
        self.samples = self.sample_gumbel_k(tf.shape(logits))
        return self._khot_mask(logits + self.samples)

    def sample_discrete_2_reuse(self, logits):
        """Top-k mask using the noise stored by the last `sample_discrete_2` call."""
        return self._khot_mask(logits + self.samples)

    @tf.custom_gradient
    def gumbel_topk_new(self, logits):
        """Forward top-k mask plus the custom (I-MLE) gradient function."""
        # we compute a map state for the distribution
        # we also store the sample for later
        z_train = self.sample_discrete_2(logits)
        z_test = self._khot_mask(logits)
        z_output = K.in_train_phase(z_train, z_test)

        def custom_grad(dy):
            # we perturb (implicit diff) and then reuse the stored sample for
            # the perturbed MAP state
            map_dy = self.sample_discrete_2_reuse(logits - (self._lambda*dy))
            # we now compute the gradients as the difference (I-MLE gradients)
            grad = tf.math.subtract(z_train, map_dy)

            # for the straight-through estimator, return dy instead of grad
            return grad

        return z_output, custom_grad

    def call(self, logits):
        logits = tf.squeeze(logits, -1) # [batchsize, d]
        y = self.gumbel_topk_new(logits)
        y = tf.expand_dims(y, -1) #[batchsize, d, 1]
        return y

    def get_config(self):
        # include the constructor arguments so the layer can be re-created
        # from its config
        cfg = super().get_config()
        cfg.update({"k": self.k, "_tau": self._tau, "_lambda": self._lambda})
        return cfg


def construct_gumbel_selector(X_ph, num_words, embedding_dims, maxlen):
    """
    Build the L2X model for selecting words.

    Args:
        X_ph: Keras input tensor of token IDs.
        num_words: vocabulary size of the embedding layer.
        embedding_dims: embedding dimension.
        maxlen: input sequence length.

    Returns:
        Output of the final 1-filter, kernel-size-1 convolution: one logit
        per input position.

    Uses the module-level `embedding_matrix` (frozen embedding weights) and
    `kernel_size` (first convolution only).
    """
    relu_conv = dict(padding='same', activation='relu', strides=1)

    embedded = Embedding(num_words, embedding_dims, weights=[embedding_matrix],
                         input_length=maxlen, trainable=False, name='emb_gumbel')(X_ph)
    shared = Conv1D(100, kernel_size, name = 'conv1_gumbel', **relu_conv)(embedded)

    # global info: one vector per example
    pooled = GlobalMaxPooling1D(name = 'new_global_max_pooling1d_1')(shared)
    global_info = Dense(100, name = 'new_dense_1', activation='relu')(pooled)

    # local info: one vector per position
    local_info = Conv1D(100, 3, name = 'conv2_gumbel', **relu_conv)(shared)
    local_info = Conv1D(100, 3, name = 'conv3_gumbel', **relu_conv)(local_info)

    # attach the global vector to every position, then mix
    merged = Concatenate()([global_info, local_info])
    merged = Dropout(0.2, name = 'new_dropout_2')(merged)
    merged = Conv1D(100, 1, name = 'conv_last_gumbel', **relu_conv)(merged)

    return Conv1D(1, 1, padding='same', activation=None, strides=1, name = 'conv4_gumbel')(merged)

In [None]:
print('Creating model...')

list_test_loss = []
list_subset_precision = []

# Checkpoint file shared by all runs (overwritten by each run's best epoch).
filepath = "models/l2x.hdf5"
# make sure the checkpoint directory exists before the first save
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# here we can now iterate a few times to compute statistics
for iterc in range(10):

    tf.random.set_seed(iterc)
    np.random.seed(iterc)

    # create a new validation/test split
    X_val, X_test, Y_val, Y_test = train_test_split(X_val_both, Y_val_both, test_size=0.5, random_state=iterc)

    # P(S|X): the selector producing one logit per word
    X_ph = Input(shape=(maxlen,), dtype='int32')

    logits_T = construct_gumbel_selector(X_ph, max_features, embedding_dims, maxlen)

    ###############################################################################
    #### here we switch between the different methods #############################
    # the standard L2X approach
    #tau = 0.1
    #T = Sample_Concrete(tau, select_k)(logits_T)

    # the I-MLE approach (ours)
    # tau = temperature
    # lambda = implicit differentiation perturbation magnitude:  q(z; theta') with theta' = theta - lambda dL/dz
    T = IMLESubsetkLayer(select_k, _tau=1.0, _lambda=1000.0)(logits_T)

    # the SoftSub relaxation
    #tau = 0.5
    #T = SampleSubset(tau, select_k)(logits_T)
    ###############################################################################
    ###############################################################################

    # q(X_S): predict the score from the selected words only
    emb2 = Embedding(max_features, embedding_dims, input_length=maxlen, weights=[embedding_matrix], trainable=False)(X_ph)
    net = Mean(Multiply()([emb2, T]))
    net = Dense(hidden_dims)(net)
    net = Activation('relu')(net)
    preds = Dense(1, activation='sigmoid', name = 'new_dense')(net)

    model = Model(inputs=X_ph, outputs=preds)

    model.compile(loss=['mse'], optimizer='adam', metrics=['mse'])

    # keep only the weights of the epoch with the lowest validation MSE
    checkpoint = ModelCheckpoint(filepath, monitor='val_mse', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    st = time.time()

    # use the configured number of epochs instead of a hard-coded literal
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), callbacks=callbacks_list, epochs=epochs, batch_size=batch_size)
    duration = time.time() - st
    print('Training time is {}'.format(duration))

    # NOTE(review): this evaluation runs before the checkpoint is reloaded
    # below, so it scores the final-epoch weights while the subset precision
    # uses the best-validation checkpoint -- confirm this is intended.
    results = model.evaluate(X_test, Y_test, batch_size=100)
    # both reported values are MSE (loss and metric), not an accuracy
    print("test loss, test mse:", results)
    list_test_loss.append(results[0])

    # selector-only model, restored from the best checkpoint of this run
    modelTest = Model(inputs=X_ph, outputs=T)
    modelTest.load_weights(filepath, by_name=True)

    subset_prec = subset_precision(modelTest)
    list_subset_precision.append(subset_prec)
    print("Subset precision: " + str(subset_prec))

In [None]:
# Mean and standard deviation of the subset precision over all runs.
subset_precision_runs = np.asarray(list_subset_precision)
print(subset_precision_runs.mean())
print(subset_precision_runs.std())

In [None]:
# Mean and standard deviation of the test loss over all runs.
test_loss_runs = np.asarray(list_test_loss)
print(test_loss_runs.mean())
print(test_loss_runs.std())