Paper source: https://www.sciencedirect.com/science/article/abs/pii/S0950705118302144

# Loading Libraries

In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

import numpy as np
from nltk import word_tokenize
from collections import Counter

import tensorflow as tf
from keras.models import Model
from keras import layers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

import re

from numpy.linalg import norm

from keras import layers
from keras.callbacks import Callback
from keras import backend as K

# for reproducibility: seed NumPy and TensorFlow with the same fixed value
# before any data shuffling or weight initialisation happens
np.random.seed(2333)
tf.set_random_seed(2333)


Using TensorFlow backend.


# Defining HyperParameters

In [2]:
# Directory holding the GloVe .txt files, relative to the working directory
# (expanduser() only changes the value if the path starts with '~').
glove_path = os.path.expanduser('glove.6B')

# domains: the position of a name in this tuple is used as its integer domain id
DOMAINS = ('books', 'dvd', 'electronics', 'kitchen')


"""global data"""
coef = 0.04  # domain loss coefficient in total loss (weight of 'd_pred' in get_model)

# model params
embed_dim = 300  # word embedding dimension
rnn_dim = 300  # rnn state dimension (per direction; the RNNs are wrapped in Bidirectional)
hidden_dim = 100  # width of the fully connected layers that follow each RNN
embed_dropout = 0.2  # dropout rate for word embedding layer
fc_dropout = 0.2  # dropout rate for fully connected layer
batch_size = 64  # batch size used for both training and evaluation
epochs = 1  # maximal training epochs
activation = 'relu'  # activation function of fully connected layer
optimizer = 'adam'  # optimizer method for training
RNN = layers.LSTM  # RNN type, can be SimpleRNN/GRU/LSTM/...

# train params
lr_factor = 0.1  # reduce factor of learning rate for training plateau
lr_patience = 3  # reduce learning rate when val loss has stopped improving
stop_patience = 5  # stop training when val loss has stopped improving

# data params
glove_corpus = 6  # glove corpus size, can be 6B/42B/840B
min_count = 1  # minimal word frequency cutoff
max_words = None  # maximal words kept (None means all kept)
# Placeholders only: the module-level make_data() call further down rebinds
# these four names with the values computed from the dataset.
n_words = None
maxlen = None
word2index = None
wv_weights = None  # word vectors weights



## Data Loader for the Multi-Domain Sentiment Dataset (MDSD)

In [3]:

def load_mdsd(domains, n_labeled=None):
    """Read the labeled MDSD review files for the given domains.

    For every domain, `datasets/<domain>_positive.review` and
    `datasets/<domain>_negative.review` are scanned line by line and the text
    found between `<review_text>` and `</review_text>` is collected,
    lower-cased, stripped of `&name;` entities and whitespace-normalised.

    Args:
        domains: iterable of domain names; the position of a name is its id.
        n_labeled: optional cap on the number of reviews read per file.

    Returns:
        (texts, s_labels, d_labels): list of cleaned review strings, int array
        of sentiment labels (1 positive, 0 negative) and int array of domain ids.
    """
    def _clean(raw):
        # same normalisation order as before: case, newlines, entities, spaces
        flat = raw.lower().replace('\n', ' ').strip()
        flat = re.sub(r'&[a-z]+;', '', flat)
        return re.sub(r'\s+', ' ', flat)

    sentiment_pairs = list(zip((1, 0, -1), ('positive', 'negative')))
    texts, s_labels, d_labels = [], [], []

    for d_id, d_name in enumerate(domains):
        for s_id, s_name in sentiment_pairs:
            fpath = os.path.join('', 'datasets', d_name + '_' + s_name + '.review')
            print(' - loading', d_name, s_name, end='')
            n_read = 0
            text = ''
            inside = False
            with open(fpath, encoding='utf8', errors='ignore') as fr:
                for line in fr:
                    if '<review_text>' in line:
                        # opening tag: start a fresh buffer, skip the tag line
                        text, inside = '', True
                        continue
                    if '</review_text>' in line:
                        # closing tag: finalise the buffered review
                        inside = False
                        text = _clean(text)
                        texts.append(text)
                        s_labels.append(s_id)
                        d_labels.append(d_id)
                        n_read += 1
                    elif inside:
                        text += line
                    # labeled cutoff
                    reached_cap = bool(n_labeled) and n_read == n_labeled
                    if s_id >= 0 and reached_cap:
                        break
            print(': %d texts' % n_read)

    print('data loaded')
    s_labels = np.asarray(s_labels, dtype='int')
    d_labels = np.asarray(d_labels, dtype='int')
    for label, items in ((' - texts:', texts),
                         (' - s_labels:', s_labels),
                         (' - d_labels:', d_labels)):
        print(label, len(items))

    return texts, s_labels, d_labels


## Utils 

In [4]:
# Fail fast if the GloVe directory is missing, rather than after the dataset has been processed
assert os.path.exists(glove_path)


def load_glove(path=glove_path, embedding_dim=300, corpus_size=6, desired=None, verbose=False):
    """Load GloVe embeddings from the original txt file.

    Args:
        path: directory containing the GloVe txt files.
        embedding_dim: vector size; 300 selects `glove.<corpus_size>B.300d.txt`,
            50/100/200 select `glove.6B.<dim>d.txt` (corpus_size is then ignored).
        corpus_size: 6, 42 or 840; only used when embedding_dim is 300.
        desired: optional collection of words to keep; when empty or None every
            word in the file is loaded.
        verbose: accepted for backward compatibility; currently has no effect.

    Returns:
        dict mapping each kept word to a float32 vector of length embedding_dim.
    """
    if embedding_dim != 300:
        assert embedding_dim in (50, 100, 200), 'embedding dim must be one of 50/100/200 if not 300'
        fpath = os.path.join(path, 'glove.6B.{}d.txt'.format(embedding_dim))
    else:
        assert corpus_size in (6, 42, 840), 'corpus type must be one of 6B/42B/840B'
        fpath = os.path.join(path, 'glove.{}B.300d.txt'.format(corpus_size))

    # One-off conversion so membership tests stay O(1) even when a list is passed.
    wanted = set(desired) if desired else None

    word2vec = {}
    print('loading glove from', fpath)
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(fpath, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            values = line.split()
            if len(values) <= embedding_dim:
                # blank or truncated line: no complete vector to read
                continue
            # The vector is always the last `embedding_dim` fields; whatever
            # precedes it is the token, which may itself contain spaces.
            word = ' '.join(values[:-embedding_dim])
            if wanted is None or word in wanted:
                word2vec[word] = np.asarray(values[-embedding_dim:], dtype="float32")
    print('glove info: {} words, {} dims'.format(len(word2vec), embedding_dim))
    return word2vec


def get_embedding_mat(embeddings, word2index, embedding_dim, random_uniform_level=0.01, idx_from=2):
    """Build the embedding matrix for the input layer from word vectors.

    Args:
        embeddings: mapping word -> vector of length embedding_dim.
        word2index: mapping word -> row index in the matrix.
        embedding_dim: number of columns of the matrix.
        random_uniform_level: rows without a pretrained vector are drawn from
            U(-level, +level).
        idx_from: first index that holds a real word; lower indices are
            reserved (usually 0 for <PAD>, 1 for <OOV>) and never overwritten.

    Returns:
        float array of shape (n_rows, embedding_dim). Row 0 is all zeros.
        n_rows is large enough to hold every index in word2index, so a
        vocabulary with gaps in its indices no longer raises IndexError.
    """
    index_values = set(word2index.values())
    # Reserved slots that are already present in word2index must not be counted twice.
    reserved_taken = sum(1 for idx in range(idx_from) if idx in index_values)
    n_words = len(word2index) - reserved_taken + idx_from
    # Never shrink below the historical size, but always cover the largest index.
    highest = max(index_values) + 1 if index_values else 0
    n_rows = max(n_words, highest)

    embedding_mat = np.random.uniform(
        low=-random_uniform_level, high=random_uniform_level, size=(n_rows, embedding_dim))
    embedding_mat[0] = np.zeros(embedding_dim)
    for word, idx in word2index.items():
        if idx < idx_from:
            continue
        embedding_vec = embeddings.get(word)
        if embedding_vec is not None:  # means we have this word's embedding
            embedding_mat[idx] = embedding_vec
    return embedding_mat


# ========== keras utils ==========
def att_process(candidates, att, activation='tanh'):
    """Attention pooling built with the functional API.

    Each time step of `candidates` is projected to the size of `att`, scored
    against `att` with a normalised dot product, and the softmax of those
    scores is used to take a weighted sum of the original candidates.

    Args:
        candidates: tensor of shape (*, maxlen, features).
        att: attention query tensor of shape (*, att_dim).
        activation: activation of the per-step projection layer.

    Returns:
        (weighted, weights): the pooled tensor and the attention weights of
        shape (*, maxlen), which sum up to 1.
    """
    query_dim = K.int_shape(att)[-1]
    step_projection = layers.TimeDistributed(
        layers.Dense(query_dim, activation=activation))
    projected = step_projection(candidates)
    # normalize=True L2-normalises both operands along the contracted axis
    scores = layers.dot([projected, att], axes=(2, 1), normalize=True)
    attention = layers.Activation('softmax')(scores)
    pooled = layers.dot([candidates, attention], axes=(1, 1))
    return pooled, attention


class UpdateMonitor(Callback):
    """Monitor a model's training process.

    After every epoch, prints the relative update rate of the first weight
    tensor of each layer, i.e. ||w_new - w_old|| / ||w_new|| (~1e-3 is a good
    rate). A snapshot taken when training starts serves as the baseline, so
    the first epoch is reported too.
    """
    def __init__(self):
        super(UpdateMonitor, self).__init__()
        # weights snapshot from the previous measurement point (None until training starts)
        self.weights = None

    def _get_updates(self, old_weights, new_weights):
        """Calculate update rates for layers' weights.

        Note: only the first parameter of a layer is examined.

        Args:
            old_weights: previous snapshot (list of per-layer weight lists);
                when empty or None the new snapshot is compared with itself.
            new_weights: current snapshot in the same layout.

        Returns:
            One entry per layer: a float rate, or None when the layer has no
            weights or the new weight has zero norm (rate undefined).
        """
        if not old_weights:
            old_weights = new_weights
        updates = []
        for old_layerwise_weights, new_layerwise_weights in zip(old_weights, new_weights):
            if len(old_layerwise_weights) == 0 or len(new_layerwise_weights) == 0:
                updates.append(None)
                continue
            w1, w2 = old_layerwise_weights[0], new_layerwise_weights[0]  # only check the first weight of a layer
            scale = norm(w2)
            # an all-zero weight has no meaningful relative rate; avoid dividing by zero
            updates.append(norm(w2 - w1) / scale if scale > 0 else None)
        return updates

    def on_train_begin(self, logs=None):
        # Baseline for the first epoch; without it the first report is always zero.
        self.weights = _get_weights(self.model)

    def on_epoch_end(self, epoch, logs=None):
        # monitor update rates
        new_weights = _get_weights(self.model)
        updates = self._get_updates(old_weights=self.weights, new_weights=new_weights)
        self.weights = new_weights  # update
        # `is not None` keeps a genuine 0.0 (e.g. a frozen layer) visible instead of printing '-'
        updates_info = ', '.join(
            '{:.4f}'.format(1e3 * update) if update is not None else '-' for update in updates)
        print('- updates: 1e-3 * [{}]'.format(updates_info))


def _get_weights(model):
    """Get all layers' weights as a list of list:
    [[l1_w1, l1_w2, ...], ... , [ln_w1, ln_w2, ...]]"""
    weights = []
    for layer in model.layers:
        # if no weights, return value is []
        weights.append(layer.get_weights())
    return weights


## train/val/test split for one single domain

In [5]:
def _tvt_split(_seqs, _slabels, splits=(7, 2, 1)):
    assert len(_seqs) == len(_slabels)
    splits = np.asarray(splits)
    splits = np.cumsum(splits / splits.sum())
    # shuffle
    indices = [range(len(_seqs))]
    np.random.shuffle(indices)
    _seqs = _seqs[indices]
    _slabels = _slabels[indices]
    # prepare data (balance data from all labels)
    X_train, y_train, X_val, y_val, X_test, y_test = [], [], [], [], [], []
    for slabel in sorted(np.unique(_slabels)):
        seqs_ofs = _seqs[_slabels == slabel]
        slabels_ofs = _slabels[_slabels == slabel]
        # split
        split_ats = np.asarray(splits * len(seqs_ofs), dtype=int)
        X_train.extend(seqs_ofs[:split_ats[0]])
        X_val.extend(seqs_ofs[split_ats[0]:split_ats[1]])
        X_test.extend(seqs_ofs[split_ats[1]:])
        y_train.extend(slabels_ofs[:split_ats[0]])
        y_val.extend(slabels_ofs[split_ats[0]:split_ats[1]])
        y_test.extend(slabels_ofs[split_ats[1]:])
    X_train = np.asarray(X_train, dtype='int')
    X_val = np.asarray(X_val, dtype='int')
    X_test = np.asarray(X_test, dtype='int')
    y_train = np.asarray(y_train, dtype='int')
    y_val = np.asarray(y_val, dtype='int')
    y_test = np.asarray(y_test, dtype='int')
    print(' * X:', X_train.shape, X_val.shape, X_test.shape)
    print(' * y:', y_train.shape, y_val.shape, y_test.shape)
    return (X_train[:100], X_val[:100], X_test[:100]), (y_train[:100], y_val[:100], y_test[:100])



## data pre-processing

In [6]:
def make_data():
    """Run the whole data pipeline: load, tokenise, encode, split, embed.

    Steps, in order: load the MDSD reviews for DOMAINS; tokenise them and
    replace digit-only tokens with '<NUM>'; build the vocabulary (0 is <PAD>,
    1 is <UNK>); encode and post-pad/truncate to the 95th percentile length;
    split every domain with `_tvt_split`; one-hot encode the labels; shuffle
    the combined training set; build the GloVe embedding matrix.

    Returns:
        The 16-tuple (wv_weights, maxlen, n_words, word2index, X_train, X_val,
        ys_val, yd_val, X_test, ys_test, yd_test, X_test_byd, ys_test_byd,
        yd_test_byd, ys_train, yd_train). The *_byd values are dicts keyed by
        domain id.
    """
    n_domains = len(DOMAINS)

    # load data
    print('loading data: Multi-Domain Sentiment Dataset v2')
    texts, s_labels, d_labels = load_mdsd(domains=DOMAINS)

    # build vocabulary for words
    print('building vocabulary')
    texts_tokens = [
        # replace number token with <NUM>
        ['<NUM>' if token.isdigit() else token for token in word_tokenize(text)]
        for text in texts
    ]
    token_counts = [len(tokens) for tokens in texts_tokens]
    maxlen = int(np.percentile(token_counts, 95))
    print('maxlen:', maxlen)
    counter = Counter()
    for tokens in texts_tokens:
        counter.update(tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for rank, (word, freq) in enumerate(counter.most_common(max_words)):
        if freq >= min_count:
            vocab[word] = rank + 2  # starting from 2, 0 used as <PAD>, 1 used as <OOV>
    n_words = len(vocab)
    print('n_words:', n_words)

    # data encode
    print('data encoding')
    encoded = [[vocab.get(token, 1) for token in tokens] for tokens in texts_tokens]
    seqs_padded = pad_sequences(encoded, maxlen=maxlen, padding='post', truncating='post')
    s_labels = np.asarray(s_labels, dtype=int)
    d_labels = np.asarray(d_labels, dtype=int)

    # domain & train/val/test split
    print('labeled data: domain & train/val/test splitting')
    X_train, ys_train, yd_train = [], [], []
    X_val, ys_val, yd_val = [], [], []
    X_test_byd, ys_test_byd, yd_test_byd = {}, {}, {}
    for d_id, d_name in enumerate(DOMAINS):
        print(d_name, 'splitting')
        labeled_ofd = (d_labels == d_id) & (s_labels != -1)
        seqs_ofd = seqs_padded[labeled_ofd]
        slabels_ofd = s_labels[labeled_ofd]
        print(' * all:', seqs_ofd.shape, slabels_ofd.shape)
        X_parts, y_parts = _tvt_split(seqs_ofd, slabels_ofd)
        X_tr, X_va, X_te = X_parts
        y_tr, y_va, y_te = y_parts
        # train data (add this domain)
        X_train.extend(X_tr)
        ys_train.extend(y_tr)
        yd_train.extend([d_id] * len(X_tr))
        # val data
        X_val.extend(X_va)
        ys_val.extend(y_va)
        yd_val.extend([d_id] * len(X_va))
        # test data stays separated per domain
        X_test_byd[d_id] = X_te
        ys_test_byd[d_id] = to_categorical(y_te, num_classes=2)
        yd_test_byd[d_id] = to_categorical([d_id] * len(X_te), num_classes=n_domains)

    X_train = np.asarray(X_train, dtype='int')
    ys_train = to_categorical(ys_train, num_classes=2)
    yd_train = to_categorical(yd_train, num_classes=n_domains)
    X_val = np.asarray(X_val, dtype='int')
    ys_val = to_categorical(ys_val, num_classes=2)
    yd_val = to_categorical(yd_val, num_classes=n_domains)

    # combine test data from different domains
    domain_ids = range(n_domains)
    X_test = np.concatenate([X_test_byd[d_id] for d_id in domain_ids])
    ys_test = np.concatenate([ys_test_byd[d_id] for d_id in domain_ids])
    yd_test = np.concatenate([yd_test_byd[d_id] for d_id in domain_ids])

    # shuffle train data (one permutation applied to inputs and both label sets)
    indices = list(range(len(X_train)))
    np.random.shuffle(indices)
    X_train, ys_train, yd_train = X_train[indices], ys_train[indices], yd_train[indices]

    print('combined labeled data:')
    print('  - train:', X_train.shape, ys_train.shape, yd_train.shape)
    print('  - val:', X_val.shape, ys_val.shape, yd_val.shape)
    print('  - test:', X_test.shape, ys_test.shape, yd_test.shape)
    for d_id, d_name in enumerate(DOMAINS):
        print('  - test for {}:'.format(d_name[:3]),
              X_test_byd[d_id].shape, ys_test_byd[d_id].shape, yd_test_byd[d_id].shape)

    # embeddings
    print('loading word embeddings from glove')
    embeddings = load_glove(embedding_dim=embed_dim, desired=vocab.keys(), corpus_size=glove_corpus)
    print('processing embedding matrix')
    wv_weights = [get_embedding_mat(embeddings, vocab, embed_dim, idx_from=2)]

    return (wv_weights, maxlen, n_words, vocab,
            X_train, X_val, ys_val, yd_val,
            X_test, ys_test, yd_test,
            X_test_byd, ys_test_byd, yd_test_byd,
            ys_train, yd_train)

# Run the pipeline once and bind its results at module level: get_model() and
# train_and_test() read these names as globals. This also rebinds the
# maxlen / n_words / word2index / wv_weights placeholders from the config cell.
wv_weights, maxlen, n_words, word2index, X_train, X_val, ys_val, yd_val, X_test, ys_test, yd_test, X_test_byd, ys_test_byd, yd_test_byd, ys_train, yd_train = make_data()

loading data: Multi-Domain Sentiment Dataset v2
 - loading books positive: 1000 texts
 - loading books negative: 1000 texts
 - loading dvd positive: 1000 texts
 - loading dvd negative: 1000 texts
 - loading electronics positive: 1000 texts
 - loading electronics negative: 1000 texts
 - loading kitchen positive: 1000 texts
 - loading kitchen negative: 1000 texts
data loaded
 - texts: 8000
 - s_labels: 8000
 - d_labels: 8000
building vocabulary
maxlen: 461
n_words: 45577
data encoding
labeled data: domain & train/val/test splitting
books splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
dvd splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
electronics splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
kitchen splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
combined l

  
  if __name__ == '__main__':


glove info: 35110 words, 300 dims
processing embedding matrix


### get model

In [7]:
def get_model():
    """Build and compile the two-output network.

    A shared embedding layer (initialised from the module-level `wv_weights`)
    feeds two bidirectional RNN branches: a domain branch whose hidden
    representation predicts the domain ('d_pred'), and a sentiment branch
    whose per-step outputs are pooled by attention, using that domain
    representation as the query, to predict the sentiment ('s_pred').
    Both heads use categorical cross-entropy; the domain loss is weighted by
    `coef`. Reads maxlen, n_words and the hyper-parameters from module scope.

    Returns:
        The compiled Keras Model with outputs [s_pred, d_pred].
    """
    print('\nbuilding the model')

    # shared input + pretrained embeddings
    inputs = layers.Input(shape=(maxlen,))
    embedding_layer = layers.Embedding(
        input_dim=n_words,
        output_dim=embed_dim,
        input_length=maxlen,
        weights=wv_weights)
    embedded = layers.SpatialDropout1D(rate=embed_dropout)(embedding_layer(inputs))

    # domain part: only the final RNN state is kept
    domain_encoder = layers.Bidirectional(RNN(units=rnn_dim, return_sequences=False))
    d_repr = domain_encoder(embedded)
    d_repr = layers.Dense(hidden_dim, activation=activation)(d_repr)
    d_repr = layers.Dropout(fc_dropout)(d_repr)
    d_pred = layers.Dense(len(DOMAINS), activation='softmax', name='d_pred')(d_repr)

    # senti part: keep every step and let the domain representation attend over them
    senti_encoder = layers.Bidirectional(RNN(units=rnn_dim, return_sequences=True))
    episodes = senti_encoder(embedded)
    selected, _ = att_process(candidates=episodes, att=d_repr)
    s_repr = layers.Dense(hidden_dim, activation=activation)(selected)
    s_repr = layers.Dropout(fc_dropout)(s_repr)
    s_pred = layers.Dense(2, activation='softmax', name='s_pred')(s_repr)

    # model
    losses = {
        's_pred': 'categorical_crossentropy',
        'd_pred': 'categorical_crossentropy',
    }
    weights_by_output = {'s_pred': 1, 'd_pred': coef}
    model = Model(inputs=inputs, outputs=[s_pred, d_pred])
    model.compile(optimizer=optimizer, metrics=['acc'], loss=losses, loss_weights=weights_by_output)
    model.summary()
    return model


## Train Model and run

In [8]:
def train_and_test(model):
    """Fit `model` on the module-level training data, then report per-domain test accuracy.

    Training uses the update monitor, learning-rate reduction on plateau and
    early stopping. Evaluation runs once per entry of DOMAINS on that domain's
    held-out test set and prints one line per domain.
    """
    # training
    cbks = [
        UpdateMonitor(),
        callbacks.ReduceLROnPlateau(factor=lr_factor, patience=lr_patience, verbose=1),
        callbacks.EarlyStopping(patience=stop_patience, verbose=1),
    ]
    print('\ntraining model')
    train_targets = [ys_train, yd_train]
    val_targets = [ys_val, yd_val]
    model.fit(
        X_train,
        train_targets,
        validation_data=(X_val, val_targets),
        shuffle=True, batch_size=batch_size, epochs=epochs, verbose=2,
        callbacks=cbks)

    # evaluation
    print('\nTest evaluation:')
    for d_id, d_name in enumerate(DOMAINS):
        domain_inputs = X_test_byd[d_id]
        domain_targets = [ys_test_byd[d_id], yd_test_byd[d_id]]
        scores = model.evaluate(
            domain_inputs,
            domain_targets,
            batch_size=batch_size, verbose=0)
        # second-to-last score; presumably the s_pred accuracy given two
        # outputs and the single 'acc' metric -- TODO confirm for the Keras version in use
        senti_acc = scores[-2]
        print('{} acc: {:.4f}'.format(d_name[:3], senti_acc))


if __name__ == '__main__':
    # data processing is not repeated here: it already ran at module level
    # through the make_data() call in the pre-processing cell


    # build & compile model (also prints the model summary)
    model = get_model()

    # train, then print the per-domain test accuracy
    train_and_test(model)

    print('\nprocess finished ~~~')


building the model
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 461)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 461, 300)     13673100    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 461, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
b