In this notebook, we exploit document knowledge for aspect-level sentiment classification. More specifically, we build an attention-based aspect-level sentiment classification model with a biLSTM. The biLSTM network learns sentence representations from input sequences. Additionally, an attention network assigns an attention score to each biLSTM hidden state in the sequence, based on the aspect term representation. Then, a fully connected network predicts the sentiment label.

The model is trained based on transfer learning, i.e., we first train the parameters of a model on document-level examples, and we use the learned parameters to initialize and fine-tune the parameters of the aspect-level model.

This notebook is based on the work proposed by He et al. in [1].

[1] R. He, WS. Lee & D. Dahlmeier. Exploiting document knowledge for aspect-level sentiment classification. 2018. https://arxiv.org/abs/1806.04346.

# Imports

In [None]:
# Colab-only setup: mount Google Drive at /drive; the data and weight paths
# used later in this notebook all start with '/drive/My Drive'.
from google.colab import drive
drive.mount('/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /drive


In [None]:
import os
import sys
import codecs
import operator
import numpy as np
import re
import _pickle as cPickle

from time import time

# Load Data

In [None]:
def read_pickle(data_path, file_name):
    """Load and return the object pickled in ``data_path/file_name``.

    Args:
        data_path: Directory that contains the pickle file.
        file_name: Name of the pickle file inside ``data_path``.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(os.path.join(data_path, file_name), 'rb') as f:
        return cPickle.load(f)


def save_pickle(data_path, file_name, data):
    """Pickle ``data`` to ``data_path/file_name`` and print the destination.

    Args:
        data_path: Directory to write into (must already exist).
        file_name: Name of the pickle file to create or overwrite.
        data: Any picklable object.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    target = os.path.join(data_path, file_name)
    # The context manager closes (and flushes) the handle even if dump() raises.
    with open(target, 'wb') as f:
        cPickle.dump(data, f)
    # Report success only once the file has been written and closed.
    print(" file saved to: %s"%(target))

In [None]:
# Directory holding the pickled aspect-level data
aspect_path = '/drive/My Drive/projects/aspect-level-data'

# Vocabulary (its length is used later as the embedding vocabulary size)
vocab = read_pickle(aspect_path, 'all_vocab.pkl')

# Inputs and labels of the train / dev / test splits
train_x, train_y = [read_pickle(aspect_path, name)
                    for name in ('train_x.pkl', 'train_y.pkl')]
dev_x, dev_y = [read_pickle(aspect_path, name)
                for name in ('dev_x.pkl', 'dev_y.pkl')]
test_x, test_y = [read_pickle(aspect_path, name)
                  for name in ('test_x.pkl', 'test_y.pkl')]

# Aspect terms belonging to each split
train_aspect, dev_aspect, test_aspect = [
    read_pickle(aspect_path, name)
    for name in ('train_aspect.pkl', 'dev_aspect.pkl', 'test_aspect.pkl')
]

# Document-level examples used for pre-training
pretrain_data, pretrain_label = [
    read_pickle(aspect_path, name)
    for name in ('pretrain_data.pkl', 'pretrain_label.pkl')
]

In [None]:
class Dataiterator_doc():
    '''
      Mini-batch iterator over document-level examples (X, y).

      1) Iteration over minibatches using next(). Every example is returned
         exactly once per epoch; the final batch of an epoch may be smaller
         than batch_size. When an epoch is exhausted the data is reshuffled
         and StopIteration is raised, so the object can be iterated again.
      2) Access to the entire dataset using all()

      X and y must support indexing with an integer numpy array.
    '''
    def __init__(self, X, y, seq_length=32, decoder_dim=300, batch_size=32):
        # seq_length and decoder_dim are unused; they stay in the signature so
        # existing positional callers keep working. Pass batch_size by keyword.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.X = X
        self.y = y
        self.num_data = len(X) # total number of examples
        self.batch_size = batch_size # batch size
        self.reset() # initial: shuffling examples and set index to 0


    def __iter__(self): # the object is its own iterator
        return self


    def reset(self): # start a new epoch
        self.idx = 0
        self.order = np.random.permutation(self.num_data) # shuffling examples by providing randomized ids


    def __next__(self): # return model inputs - outputs per batch
        # Test for exhaustion *before* drawing ids, so the last batch of an
        # epoch (full or partial) is returned instead of being discarded.
        if self.idx >= self.num_data:
            self.reset()
            raise StopIteration()

        batch_ids = self.order[self.idx:self.idx + self.batch_size]
        self.idx += len(batch_ids)

        batch_X = self.X[batch_ids] # inputs of this batch
        batch_y = self.y[batch_ids] # labels of this batch
        return batch_X, batch_y


    def all(self): # return all data examples
        return self.X, self.y


class Dataiterator_aspect():
    '''
      Mini-batch iterator over aspect-level examples.

      aspect_data is a sequence [X, y, aspect_terms] of equally long
      containers that support indexing with an integer numpy array.

      1) Iteration over minibatches using next(). Every example is returned
         exactly once per epoch; the final batch of an epoch may be smaller
         than batch_size. When an epoch is exhausted the data is reshuffled
         and StopIteration is raised, so the object can be iterated again.
      2) Access to the entire dataset using all()
    '''
    def __init__(self, aspect_data, seq_length=32, decoder_dim=300, batch_size=32):
        # seq_length and decoder_dim are unused; they stay in the signature so
        # existing positional callers keep working. Pass batch_size by keyword.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        self.X_aspect = aspect_data[0]
        self.y_aspect = aspect_data[1]
        self.aspect_terms = aspect_data[2]
        self.num_data = len(aspect_data[0]) # total number of examples
        self.batch_size = batch_size # batch size
        self.reset() # initial: shuffling examples and set index to 0

    def __iter__(self): # the object is its own iterator
        return self


    def reset(self): # start a new epoch
        self.idx = 0
        self.order = np.random.permutation(self.num_data) # shuffling examples by providing randomized ids


    def __next__(self): # return model inputs - outputs per batch
        # Test for exhaustion *before* drawing ids, so the last batch of an
        # epoch (full or partial) is returned instead of being discarded.
        if self.idx >= self.num_data:
            self.reset()
            raise StopIteration()

        batch_ids = self.order[self.idx:self.idx + self.batch_size]
        self.idx += len(batch_ids)

        batch_X_aspect = self.X_aspect[batch_ids] # sentences of this batch
        batch_y_aspect = self.y_aspect[batch_ids] # labels of this batch
        batch_aspect_terms = self.aspect_terms[batch_ids] # aspect terms of this batch

        return batch_X_aspect, batch_y_aspect, batch_aspect_terms


    def all(self): # return all data examples
        return self.X_aspect, self.y_aspect, self.aspect_terms

In [None]:
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Lambda, Dropout, LSTM,Bidirectional
from keras.layers import Reshape, Activation, RepeatVector, concatenate, Concatenate, Dot, Multiply
from keras.engine.topology import Layer
from keras import initializers
from keras import regularizers
from keras import constraints

import keras.backend as K
import keras.optimizers as opt

In [None]:
# Fixed sequence lengths used as the shapes of the model's Input layers.
# Presumably the pickled data is already padded to these lengths - TODO confirm.
overal_maxlen = 82  # length of `sentence_input`
overal_maxlen_aspect = 7  # length of `aspect_input`


# Defining the Attention Network Layer

In [None]:
class Attention(Layer):
    """
    Keras Layer that implements a content attention mechanism conditioned on
    an aspect vector. Supports masking.

    Called on a list ``[h, t]``:
      * ``h``: sequence of hidden states, (batch_size, steps, hidden_dim)
      * ``t``: aspect representation, (batch_size, aspect_dim)

    Returns attention weights of shape (batch_size, steps). Masked steps get
    weight 0 and the remaining weights sum to 1 for every example.
    """
    def __init__(self,  **kwargs):
        # Initialise the base Layer first, then set our own attributes.
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')


    def build(self, input_shape):
        assert isinstance(input_shape, list)

        self.steps = input_shape[0][1]

        # W maps the aspect vector into the hidden-state space:
        # shape (hidden_dim, aspect_dim).
        self.W = self.add_weight(shape=(input_shape[0][-1], input_shape[1][-1]),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),)

        self.built = True


    def compute_mask(self, input_tensor, mask=None):
        # The output is one weight vector per example, so no mask is
        # propagated to downstream layers.
        assert isinstance(input_tensor, list)
        assert isinstance(mask, list)
        return None


    def call(self, input_tensor, mask=None):
        h = input_tensor[0] # hidden states, (batch_size, steps, hidden_dim)
        t = input_tensor[1] # aspect vector, (batch_size, aspect_dim)
        # Only the mask of the hidden-state sequence is relevant here.
        h_mask = mask[0] if mask is not None else None

        # W has no batch axis, so W.t is computed for the whole batch at once:
        # (hidden_dim, aspect_dim) . (aspect_dim, batch_size)
        #   -> (hidden_dim, batch_size), transposed to (batch_size, hidden_dim).
        Wat = K.transpose(K.dot(self.W, K.transpose(t)))

        # Batch transpose of h (batch axis stays at 0):
        # (batch_size, steps, hidden_dim) -> (batch_size, hidden_dim, steps).
        h_tran = K.permute_dimensions(h, (0, 2, 1))

        # Per-example product (batch_size, hidden_dim) x (batch_size, hidden_dim, steps)
        # -> one score per time step, (batch_size, steps).
        hTWat = K.batch_dot(Wat, h_tran)

        # Softmax over the *unmasked* steps only: exponentiate, zero out the
        # padded steps, then normalise. (Masking after a plain softmax would
        # leave weights that no longer sum to 1.) tanh bounds the scores to
        # [-1, 1], so exp() cannot overflow.
        a = K.exp(K.tanh(hTWat))

        if h_mask is not None:
            a *= K.cast(h_mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        return a


    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1])

In [None]:
class Average(Layer):
    """
    Averages a sequence over its time axis (axis 1).

    With ``mask_zero=True`` and an incoming mask, only the unmasked steps
    contribute to the average. Without a mask (or with ``mask_zero=False``)
    the plain mean over all steps is returned.

    Input shape:  (batch_size, steps, dim)
    Output shape: (batch_size, dim)
    """
    def __init__(self, mask_zero=True, **kwargs):
        # Initialise the base Layer first, then set our own attributes.
        super(Average, self).__init__(**kwargs)
        self.mask_zero = mask_zero
        self.supports_masking = True

    def call(self, x, mask=None):
        # Fall back to the plain mean when no mask reaches this layer;
        # casting a None mask would raise.
        if self.mask_zero and mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask)
            x = x * mask
            # epsilon guards against division by zero for fully masked rows
            return K.sum(x, axis=1) / (K.sum(mask, axis=1) + K.epsilon())
        else:
            return K.mean(x, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def compute_mask(self, x, mask=None):
        # The time axis is reduced away, so there is nothing left to mask.
        return None

# Establishing the computation graph for the model



In [None]:
# Hyperparameters of the shared network
dropout = 0.1  # `dropout` argument of the LSTM
recurrent_dropout = 0.4  # `recurrent_dropout` argument of the LSTM
vocab_size = len(vocab)  # number of rows of the embedding matrix
num_outputs = 3  # labels

## Input tensors

In [None]:
# Inputs
# Aspect-level model inputs: a fixed-length sentence and a fixed-length aspect term (word ids)
sentence_input = Input(shape=(overal_maxlen,), dtype='int32', name='sentence_input')
aspect_input = Input(shape=(overal_maxlen_aspect,), dtype='int32', name='aspect_input')
# Document-level model input: variable-length sequence of word ids
pretrain_input = Input(shape=(None,), dtype='int32', name='pretrain_input')

## Shared WordEmbedding layer

In [None]:
# Represent aspect as averaged word embedding
# One embedding layer shared by every input; mask_zero=True makes index 0 a masked (padding) position
word_emb = Embedding(vocab_size, 300, mask_zero=True, name='word_emb')
aspect_term_embs = word_emb(aspect_input)
# Masked average over the aspect's word embeddings -> one vector per aspect
aspect_embs = Average(mask_zero=True, name='aspect_emb')(aspect_term_embs)

In [None]:
# Sentence representation from embedding
# Both sentence types go through the same (shared) embedding layer
sentence_embs = word_emb(sentence_input)
pretrain_embs = word_emb(pretrain_input)

## Shared BiLSTM layer

In [None]:
# Sentence representation from embedding
# One bidirectional LSTM shared by both models; return_sequences=True keeps
# the hidden state of every time step (needed by the attention layer).
bilstm = Bidirectional(LSTM(300, return_sequences=True, dropout=dropout, recurrent_dropout=recurrent_dropout, name='lstm'), name='bilstm')
sentence_bilstm = bilstm(sentence_embs)
pretrain_bilstm = bilstm(pretrain_embs)

## Attention Layer

In [None]:
# One attention weight per sentence time step, conditioned on the averaged aspect embedding
att_weights = Attention(name='att_weights')([sentence_bilstm, aspect_embs])

## Prediction Layer

In [None]:
# Document representation: masked average of the BiLSTM hidden states (no attention)
pretrain_output = Average(mask_zero=True)(pretrain_bilstm)

# Function for computing the weighted sum
def weighted_sum(tensor):
  """Sum tensor[0] over axis 1, weighting every step by tensor[1].

  tensor[1] receives a trailing axis so that it broadcasts against tensor[0].
  """
  states, weights = tensor[0], tensor[1]
  return K.sum(states * K.expand_dims(weights, axis=2), axis=1)

# Sentence representation: attention-weighted sum of the sentence_bilstm hidden states
weighted_sum_layer = Lambda(weighted_sum)([sentence_bilstm, att_weights])

# Played around with dropout; the first few test runs did not seem to improve the accuracies
# weighted_sum_layer = Dropout(0.4)(weighted_sum_layer)
# pretrain_output = Dropout(0.4)(pretrain_output)

# Separate dense layers for the aspect-level (dense_1) and document-level (dense_2) outputs
sentence_output = Dense(num_outputs, name='dense_1')(weighted_sum_layer)
pretrain_output = Dense(num_outputs, name='dense_2')(pretrain_output)

# Softmax turns the dense outputs into class probabilities
aspect_probs = Activation('softmax', name='aspect_model')(sentence_output)
doc_probs = Activation('softmax', name='pretrain_model')(pretrain_output)

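# Build Models for document-level and aspect-level data
- The two models share the embedding and BiLSTM layers. Each model has its own dense prediction layer (`dense_2` for documents, `dense_1` for aspects); `dense_1` is later initialised from the pre-trained `dense_2` weights.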

In [None]:
from tensorflow.keras.utils import plot_model
# model1: document-level classifier (used for pre-training)
model1 = Model(inputs=pretrain_input, outputs=doc_probs)
# model2: aspect-level classifier; built from the same word_emb / bilstm layer objects as model1
model2 = Model(inputs=[sentence_input, aspect_input], outputs=[aspect_probs])

# plot_model(model1, show_shapes=True)
# plot_model(model2, show_shapes=True)
model1.summary()
model2.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
pretrain_input (InputLayer)  (None, None)              0         
_________________________________________________________________
word_emb (Embedding)         multiple                  3000900   
_________________________________________________________________
bilstm (Bidirectional)       multiple                  1442400   
_________________________________________________________________
average_2 (Average)          (None, 600)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 1803      
_________________________________________________________________
pretrain_model (Activation)  (None, 3)                 0         
Total params: 4,445,103
Trainable params: 4,445,103
Non-trainable params: 0
_________________________________________________


# Training the model

## Train on document-level data

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator_doc(model, batch_train_iter, batch_val_iter):
    """Train the document-level model from mini-batch iterators.

    Args:
        model: Compiled Keras model taking a single input.
        batch_train_iter: Iterable yielding (X, y) training batches; it must
            be re-iterable (one pass per epoch).
        batch_val_iter: Same, for the validation data.

    Returns:
        The Keras ``History`` object produced by training.

    Note:
        The number of steps per epoch is read from the notebook-level
        variables ``train_steps_epoch`` and ``val_steps_epoch``.
    """
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]

    # Keras expects generators that never terminate, so each generator simply
    # restarts the iterator after every pass. Batches are yielded lazily (one
    # at a time) as (inputs, targets) tuples instead of first materialising a
    # whole epoch in memory.
    def train_gen():
        while True:
            for X, y in batch_train_iter:
                yield X, y

    def val_gen():
        while True:
            for X, y in batch_val_iter:
                yield X, y

    history = model.fit_generator(train_gen(), validation_data=val_gen(), \
                                  validation_steps=val_steps_epoch, steps_per_epoch=train_steps_epoch, \
                                  epochs=20, callbacks=earlystop_callbacks)
    # Hand the history back so that `history = train_generator_doc(...)` is meaningful.
    return history
      

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator_aspect(model, batch_train_iter, batch_val_iter):
    """Train the aspect-level model from mini-batch iterators.

    Args:
        model: Compiled Keras model taking [sentence, aspect] inputs.
        batch_train_iter: Iterable yielding (X, y, aspect) training batches;
            it must be re-iterable (one pass per epoch).
        batch_val_iter: Same, for the validation data.

    Returns:
        The Keras ``History`` object produced by training. The validation
        loss and accuracy per epoch are also printed.

    Note:
        The number of steps per epoch is read from the notebook-level
        variables ``train_steps_epoch`` and ``val_steps_epoch``.
    """
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]

    # Keras expects generators that never terminate, so each generator simply
    # restarts the iterator after every pass. Batches are yielded lazily (one
    # at a time) as (inputs, targets) tuples instead of first materialising a
    # whole epoch in memory.
    def train_gen():
        while True:
            for X, y, aspect in batch_train_iter:
                yield [X, aspect], [y]

    def val_gen():
        while True:
            for X, y, aspect in batch_val_iter:
                yield [X, aspect], [y]

    history = model.fit_generator(train_gen(), validation_data=val_gen(), \
                                  validation_steps=val_steps_epoch, steps_per_epoch=train_steps_epoch, \
                                  epochs=20, callbacks=earlystop_callbacks)
    print(history.history['val_loss'], history.history['val_categorical_accuracy'])
    return history

In [None]:
import keras.optimizers as opt

# Gradient-norm clipping only. The former `clipvalue=0` meant "no value
# clipping" and is dropped, because some Keras versions would treat it as
# clipping every gradient to zero or reject it alongside `clipnorm`.
optimizer = opt.RMSprop(learning_rate=0.001, rho=0.9, epsilon=1e-06, clipnorm=10)
model1.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

batch_size = 128

# batch_size must be passed by keyword: the third positional parameter of
# Dataiterator_doc is `seq_length`, so a positional value would be ignored
# and the iterator would silently fall back to its default batch size of 32.
batch_train_iter_doc = Dataiterator_doc(pretrain_data, pretrain_label, batch_size=batch_size)
batch_val_iter_doc = Dataiterator_doc(dev_x, dev_y, batch_size=batch_size)

# Steps per epoch must be whole numbers. The training generators loop
# indefinitely, so rounding up never runs out of batches.
train_steps_epoch = int(np.ceil(len(pretrain_data) / batch_size))
val_steps_epoch = int(np.ceil(len(dev_x) / batch_size))

In [None]:
# Pre-train the document-level model (model1); Keras prints the progress per epoch.
history = train_generator_doc(model1, batch_train_iter_doc, batch_val_iter_doc)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
save_path = '/drive/My Drive/projects/saved-weights'
save_weights = False

if save_weights:
  # Get weights of trained model1
  # The shared weights are the weights of the embedding layer,
  # the bidirectional LSTM layer, and the prediction layer
  embedding_weights = model1.get_layer('word_emb').get_weights()[0]
  bilstm_weights = model1.get_layer('bilstm').get_weights()
  dense_2_weights = model1.get_layer('dense_2').get_weights()

  # Function to save weights to a name.
  # It deliberately does not reuse the name `save_weights`, which would
  # replace the boolean flag above with a function object.
  # NOTE(review): save_path has no trailing separator, so the files end up
  # next to the 'saved-weights' directory as 'saved-weights<name>.pkl'. The
  # loading cell builds its paths the same way; change both together.
  def save_layer_weights(weights, save_name):
      with open(save_path + save_name + '.pkl', 'wb') as f:
          cPickle.dump(weights, f)

  save_layer_weights(embedding_weights, 'embedding_weights')
  save_layer_weights(bilstm_weights, 'bilstm_weights')
  save_layer_weights(dense_2_weights, 'dense_2_weights')

For convenience, we save the weights of the shared layers for transfer learning, so that model1 does not have to be retrained every time we need the initial weights for model2.
Below, we initialise the corresponding layers of model2 with these pre-trained weights.

In [None]:
load_path = '/drive/My Drive/projects/saved-weights'
load_pretrained_weights = True

if load_pretrained_weights:
  # Load the pre-trained weights from their pickle files. The `with` blocks
  # close every file again once it has been read.
  # NOTE(review): only unpickle files you trust; pickle can run arbitrary code.
  # NOTE(review): load_path has no trailing separator; the paths match the
  # ones written by the saving cell, so change both together.
  with open(load_path + 'embedding_weights.pkl', 'rb') as f:
      loaded_embedding_weights = cPickle.load(f)
  with open(load_path + 'bilstm_weights.pkl', 'rb') as f:
      loaded_bilstm_weights = cPickle.load(f)
  with open(load_path + 'dense_2_weights.pkl', 'rb') as f:
      loaded_dense_2_weights = cPickle.load(f)

  # Initialise weights of model2 with these weights.
  # set_weights expects a *list* of numpy arrays. The embedding matrix was
  # saved as a single array (get_weights()[0]), so it is wrapped in a list
  # rather than turned into a backend tensor.
  model2.get_layer('word_emb').set_weights([loaded_embedding_weights])
  model2.get_layer('bilstm').set_weights(loaded_bilstm_weights)
  # The aspect-level prediction layer starts from the document-level one.
  model2.get_layer('dense_1').set_weights(loaded_dense_2_weights)

## Train on aspect-level data

In [None]:
# Re-create model2 from the existing layer objects; the layers keep their current weights.
model2 = Model(inputs=[sentence_input, aspect_input], outputs=[aspect_probs])

batch_size = 32

# batch_size must be passed by keyword: the second positional parameter of
# Dataiterator_aspect is `seq_length`, so a positional value would be ignored.
batch_train_iter_aspect = Dataiterator_aspect([train_x, train_y, train_aspect], batch_size=batch_size)
batch_val_iter_aspect = Dataiterator_aspect([dev_x, dev_y, dev_aspect], batch_size=batch_size)

# Steps per epoch must be whole numbers. The training generators loop
# indefinitely, so rounding up never runs out of batches.
train_steps_epoch = int(np.ceil(len(train_x) / batch_size))
val_steps_epoch = int(np.ceil(len(dev_x) / batch_size))

optimizer = opt.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=True)
model2.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Fine-tune the aspect-level model (model2); Keras prints the progress per epoch.
train_generator_aspect(model2, batch_train_iter_aspect, batch_val_iter_aspect)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


## Evaluating on test set

In [None]:
# evaluate() returns the loss followed by the metrics given to compile(),
# here the categorical accuracy.
test_loss, test_categorical_accuracy = model2.evaluate([test_x, test_aspect], test_y, batch_size=batch_size)
print(f"test loss is {test_loss}, categorical test accuracy is {test_categorical_accuracy}")

test loss is 1.0172974099186147, categorical test accuracy is 0.6708333492279053
