# I. Little Tutorial:

In [2]:
import tensorflow as tf
import numpy as np

### Define the Graph:

In [3]:
# Placeholders through which the data is fed into the graph.
X = tf.placeholder(tf.float32, shape=[10, 10], name="X")  # the input is 10x10
Y = tf.placeholder(tf.float32, shape=[10, 1], name="Y")   # the output is 10x1

# Trainable variable "W": the weights of the linear transformation X @ W,
# initialised to zeros.
initial_W = np.zeros(shape=(10, 1))
W = tf.Variable(initial_W, name="W", dtype=tf.float32)

# Loss function: element-wise squared difference between Y and X @ W.
residual = Y - tf.matmul(X, W)
Loss = tf.pow(residual, 2, name="Loss")

In [8]:
# Random data for the placeholders (X is drawn first, then Y).
feed = {
    X: np.random.rand(10, 10),
    Y: np.random.rand(10, 1),
}

with tf.Session() as sess:  # set up the session
    sess.run(tf.global_variables_initializer())
    # First argument: the TensorFlow tensor you want evaluated and returned.
    # Second argument: the data for the placeholders.
    Model_Loss = sess.run(Loss, feed_dict=feed)
    print(Model_Loss)

Losses
[[ 0.57452118]
 [ 0.04220407]
 [ 0.11925153]
 [ 0.16426601]
 [ 0.02727749]
 [ 0.03581695]
 [ 0.25064379]
 [ 0.07625095]
 [ 0.98142004]
 [ 0.79552728]]


### Multi-Task Case:

In [26]:
# Placeholders: one input shared by both tasks and one target per task.
X = tf.placeholder(tf.float32, shape=[10, 10], name="X")
Y1 = tf.placeholder(tf.float32, shape=[10, 20], name="Y1")
Y2 = tf.placeholder(tf.float32, shape=[10, 20], name="Y2")

# Random initial weights for the shared layer and for the two task-specific
# layers (drawn in the order shared -> Y1 -> Y2).
initial_shared_layer_weights = np.random.rand(10, 20)
initial_Y1_layer_weights = np.random.rand(20, 20)
initial_Y2_layer_weights = np.random.rand(20, 20)

shared_layer_weights = tf.Variable(
    initial_shared_layer_weights, name="share_W", dtype=tf.float32)
Y1_layer_weights = tf.Variable(
    initial_Y1_layer_weights, name="share_Y1", dtype=tf.float32)
Y2_layer_weights = tf.Variable(
    initial_Y2_layer_weights, name="share_Y2", dtype=tf.float32)

# Layers with ReLU activations: both task layers read the same shared layer.
shared_pre_activation = tf.matmul(X, shared_layer_weights)
shared_layer = tf.nn.relu(shared_pre_activation)
Y1_layer = tf.nn.relu(tf.matmul(shared_layer, Y1_layer_weights))
Y2_layer = tf.nn.relu(tf.matmul(shared_layer, Y2_layer_weights))

# One L2 loss per task, and their sum for joint training.
Y1_Loss = tf.nn.l2_loss(Y1 - Y1_layer)
Y2_Loss = tf.nn.l2_loss(Y2 - Y2_layer)

Joint_Loss = tf.add(Y1_Loss, Y2_Loss)

## Alternate Training:

In [24]:
# Calculation (Session) Code
# ==========================
# Alternate training: each iteration updates the network on ONE task only,
# picked at random, using that task's own optimiser and loss.

# One optimiser per task. They must be created before the initializer op so
# that Adam's internal variables are initialised as well.
Y1_op = tf.train.AdamOptimizer().minimize(Y1_Loss)
Y2_op = tf.train.AdamOptimizer().minimize(Y2_Loss)


def make_random_feed():
    """Return a feed dict with fresh random data (uniform in [0, 10)) for X, Y1 and Y2."""
    return {
        X: np.random.rand(10, 10) * 10,
        Y1: np.random.rand(10, 20) * 10,
        Y2: np.random.rand(10, 20) * 10,
    }


with tf.Session() as session:
    # tf.initialize_all_variables() is deprecated; use the same initializer
    # as the first tutorial cell.
    session.run(tf.global_variables_initializer())
    for iters in range(10):
        # Each task is chosen with probability 1/2.
        if np.random.rand() < 0.5:
            _, Y1_loss = session.run([Y1_op, Y1_Loss], make_random_feed())
            print(Y1_loss)
        else:
            _, Y2_loss = session.run([Y2_op, Y2_Loss], make_random_feed())
            print(Y2_loss)


5.89154e+06
5.85761e+06
6.87897e+06
7.56048e+06
8.18204e+06
6.5788e+06
7.81574e+06
7.29146e+06
8.12310e+06
6.57724e+06


## Training at the Same Time: Joint Training

In [27]:
# Joint training: a single optimiser minimises the sum of both task losses,
# so every step updates the shared layer and both task-specific layers.
Optimiser = tf.train.AdamOptimizer().minimize(Joint_Loss)
with tf.Session() as session:
    # tf.initialize_all_variables() is deprecated in favour of this call.
    session.run(tf.global_variables_initializer())
    # Store the fetched value under its own name: rebinding `Joint_Loss` would
    # replace the graph tensor with a number and break any re-run of this cell.
    _, joint_loss_value = session.run(
        [Optimiser, Joint_Loss],
        {
            X: np.random.rand(10, 10) * 10,
            Y1: np.random.rand(10, 20) * 10,
            Y2: np.random.rand(10, 20) * 10,
        })
    print(joint_loss_value)

1.11279e+07


# II. Task ONE:
Train multilingual embeddings (optionally initialized with embeddings that were aligned offline) on concatenated parallel sentences (fr-en, it-en, de-en). The weights of the LSTM-CNN sentence representation are shared with Task TWO.

# III. Task TWO:
Train a hierarchical representation of documents using CNN+LSTM (document <- sentence <- word):

In [7]:
import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np
import data_util
from model_components import task_specific_attention, bidirectional_rnn

tf.reset_default_graph()
class MultiTaskEmbeddingsHANClassifier():
    """Hierarchical attention document classifier.

    Intended as the classification half of a multi-task setup (alignment of
    multilingual embeddings + cross-lingual document classification). Only the
    classification graph is built by this class. The architecture follows
    `Hierarchical Attention Networks for Document Classification
    (Yang et al., 2016)` (https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).

    Attributes created by the constructor that callers typically use:
    `inputs`, `word_lengths`, `sentence_lengths`, `labels`, `sample_weights`,
    `is_training` (feeds); `logits`, `prediction`, `loss`, `accuracy`,
    `summary_op` (fetches); `train_class_opt` / `train_op` (training step).
    """

    def __init__(self,
                 vocab_size,
                 embedding_size,
                 classes,
                 word_cell,
                 sentence_cell,
                 word_output_size,
                 sentence_output_size,
                 max_grad_norm,
                 dropout_keep_proba,
                 is_training=None,
                 learning_rate=1e-4,
                 device='/cpu:0',
                 scope=None):
        """Build the whole graph: placeholders, embedding, encoder and training ops.

        Args:
            vocab_size: number of rows of the embedding matrix.
            embedding_size: number of columns of the embedding matrix.
            classes: number of output units of the final fully connected layer.
            word_cell: RNN cell passed as both the forward and the backward
                cell of the word-level bidirectional RNN.
            sentence_cell: same, for the sentence-level bidirectional RNN.
            word_output_size: output size of the word-level attention.
            sentence_output_size: output size of the sentence-level attention.
            max_grad_norm: global norm the gradients are clipped to.
            dropout_keep_proba: keep probability of both dropout layers.
            is_training: optional training flag; when None a boolean
                placeholder named 'is_training' is created instead.
            learning_rate: learning rate of the Adam optimiser.
            device: device on which the body of the network is placed.
            scope: name of the root variable scope; defaults to 'tcm'.
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.classes = classes
        self.word_cell = word_cell
        self.word_output_size = word_output_size
        self.sentence_cell = sentence_cell
        self.sentence_output_size = sentence_output_size
        self.max_grad_norm = max_grad_norm
        self.dropout_keep_proba = dropout_keep_proba

        with tf.variable_scope(scope or 'tcm') as root_scope:
            self.global_step = tf.Variable(0, name='global_step', trainable=False)

        if is_training is not None:
            self.is_training = is_training
        else:
            self.is_training = tf.placeholder(dtype=tf.bool, name='is_training')

        self.sample_weights = tf.placeholder(shape=(None,), dtype=tf.float32, name='sample_weights')

        # [document x sentence x word]
        self.inputs = tf.placeholder(shape=(None, None, None), dtype=tf.int32, name='inputs')

        # [document x sentence]
        self.word_lengths = tf.placeholder(shape=(None, None), dtype=tf.int32, name='word_lengths')

        # [document]
        self.sentence_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='sentence_lengths')

        # [document]
        self.labels = tf.placeholder(shape=(None,), dtype=tf.int32, name='labels')

        # Dynamic sizes of the three input axes, used for the reshapes below.
        (self.document_size,
         self.sentence_size,
         self.word_size) = tf.unstack(tf.shape(self.inputs))

        self._init_embedding(root_scope)

        # embeddings cannot be placed on GPU
        with tf.device(device):
            self._init_body(root_scope)

        with tf.variable_scope('train'):
            self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=self.logits)

            # Per-document losses are scaled by their sample weight before averaging.
            self.loss = tf.reduce_mean(tf.multiply(self.cross_entropy, self.sample_weights))
            tf.summary.scalar('loss', self.loss)

            self.accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(self.logits, self.labels, 1), tf.float32))
            tf.summary.scalar('accuracy', self.accuracy)

            tvars = tf.trainable_variables()

            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(self.loss, tvars),
                self.max_grad_norm)
            tf.summary.scalar('global_grad_norm', global_norm)

            # Keep the optimiser object itself: `minimize()` returns an
            # Operation, which has no `apply_gradients`, and would also add a
            # second update that bypasses the gradient clipping above.
            optimizer = tf.train.AdamOptimizer(learning_rate)

            self.train_class_opt = optimizer.apply_gradients(
                zip(grads, tvars), name='train_op',
                global_step=self.global_step)
            # Alias under the name used by the driver code.
            self.train_op = self.train_class_opt

            self.summary_op = tf.summary.merge_all()

    def _init_embedding(self, scope):
        """Create the embedding matrix and look up `self.inputs` in it.

        Sets `self.embedding_matrix` ([vocab_size, embedding_size]) and
        `self.inputs_embedded`.
        """
        with tf.variable_scope(scope):
            with tf.variable_scope("embedding") as scope:
                self.embedding_matrix = tf.get_variable(
                    name="embedding_matrix",
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=layers.xavier_initializer(),
                    dtype=tf.float32)
                self.inputs_embedded = tf.nn.embedding_lookup(
                    self.embedding_matrix, self.inputs)

    def _init_body(self, scope):
        """Build word encoder -> sentence encoder -> classifier.

        Sets `self.logits` and `self.prediction`.
        """
        with tf.variable_scope(scope):
            # Flatten documents and sentences into one batch axis so that every
            # sentence is encoded independently at the word level.
            word_level_inputs = tf.reshape(self.inputs_embedded, [
                self.document_size * self.sentence_size,
                self.word_size,
                self.embedding_size
            ])
            word_level_lengths = tf.reshape(
                self.word_lengths, [self.document_size * self.sentence_size])

        with tf.variable_scope('word') as word_scope:
            word_encoder_output, _ = bidirectional_rnn(
                self.word_cell, self.word_cell,
                word_level_inputs, word_level_lengths,
                scope=word_scope)

            # The attention scope is nested under 'word' so that its variables
            # do not collide with the sentence-level attention below.
            with tf.variable_scope('attention') as attention_scope:
                word_level_output = task_specific_attention(
                    word_encoder_output,
                    self.word_output_size,
                    scope=attention_scope)

            with tf.variable_scope('dropout'):
                word_level_output = layers.dropout(
                    word_level_output, keep_prob=self.dropout_keep_proba,
                    is_training=self.is_training,
                )

        # sentence_level: restore the [document x sentence] axes.
        sentence_inputs = tf.reshape(
            word_level_output,
            [self.document_size, self.sentence_size, self.word_output_size])

        with tf.variable_scope('sentence') as sentence_scope:
            sentence_encoder_output, _ = bidirectional_rnn(
                self.sentence_cell, self.sentence_cell,
                sentence_inputs, self.sentence_lengths,
                scope=sentence_scope)

            # Own scope, no `reuse=True`: sharing the word-level attention
            # variables here would tie the two levels together and fail with a
            # shape mismatch whenever their sizes differ.
            with tf.variable_scope('attention') as attention_scope:
                sentence_level_output = task_specific_attention(
                    sentence_encoder_output,
                    self.sentence_output_size,
                    scope=attention_scope)

            with tf.variable_scope('dropout'):
                sentence_level_output = layers.dropout(
                    sentence_level_output, keep_prob=self.dropout_keep_proba,
                    is_training=self.is_training,
                )

        with tf.variable_scope('classifier'):
            self.logits = layers.fully_connected(
                sentence_level_output, self.classes, activation_fn=None)

            self.prediction = tf.argmax(self.logits, axis=-1)

    def get_feed_data(self, x, y=None, class_weights=None, is_training=True):
        """Build a feed dict for one batch.

        Args:
            x: batch of documents, passed to `data_util.batch`, which returns
                the values fed to `inputs`, `sentence_lengths` and
                `word_lengths` (presumably a padded id array plus the
                per-document and per-sentence lengths - confirm in data_util).
            y: optional labels; when given, `labels` and `sample_weights` are fed.
            class_weights: optional mapping/sequence indexed by label giving the
                weight of each sample; all weights are 1 when omitted.
            is_training: value fed to the `is_training` placeholder.

        Returns:
            dict mapping placeholders to values.
        """
        x_m, doc_sizes, sent_sizes = data_util.batch(x)
        fd = {
            self.inputs: x_m,
            self.sentence_lengths: doc_sizes,
            self.word_lengths: sent_sizes,
        }
        if y is not None:
            fd[self.labels] = y
            if class_weights is not None:
                fd[self.sample_weights] = [class_weights[yy] for yy in y]
            else:
                fd[self.sample_weights] = np.ones(shape=[len(x_m)], dtype=np.float32)
        # A flag fixed at construction time (plain Python bool) is not a graph
        # tensor and cannot be used as a feed key; only feed real tensors.
        if isinstance(self.is_training, tf.Tensor):
            fd[self.is_training] = is_training
        return fd


[[ 0.01003566 -0.00049934]
 [ 0.00282546  0.00468037]]


## Multi-Tasking the two tasks:

In [None]:
if __name__ == '__main__':
    # Smoke test: build the model on a toy batch, run a few training steps and
    # print the resulting logits.
    try:
        from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell
    except ImportError:
        LSTMCell = tf.nn.rnn_cell.LSTMCell
        LSTMStateTuple = tf.nn.rnn_cell.LSTMStateTuple
        GRUCell = tf.nn.rnn_cell.GRUCell

    # Toy batch: 2 documents x 3 sentences x 4 word ids. The trailing zeros
    # look like padding (they match the lengths fed below).
    doc_inputs = [
        [
            [5, 4, 1, 0],
            [3, 3, 6, 7],
            [6, 7, 0, 0]
        ],
        [
            [2, 2, 1, 0],
            [3, 3, 6, 7],
            [0, 0, 0, 0]
        ]
    ]

    # Parallel sentences for Task ONE (embedding alignment).
    source_sentences = [
        [5, 4, 1, 0],
        [3, 3, 6, 7],
        [6, 7, 0, 0],
        [2, 2, 1, 0],
        [3, 3, 6, 7],
        [0, 0, 0, 0]
    ]
    target_sentences = [
        [5, 4, 1, 0],
        [3, 3, 6, 7],
        [6, 7, 0, 0],
        [2, 2, 1, 0],
        [3, 3, 6, 7],
        [0, 0, 0, 0]
    ]

    tf.reset_default_graph()
    # Everything that touches `session` must stay inside this block: the
    # session is closed as soon as the `with` statement exits.
    with tf.Session() as session:
        model = MultiTaskEmbeddingsHANClassifier(
            vocab_size=10,
            embedding_size=5,
            classes=2,
            word_cell=GRUCell(10),
            sentence_cell=GRUCell(10),
            word_output_size=10,
            sentence_output_size=10,
            max_grad_norm=5.0,
            dropout_keep_proba=0.5,
        )
        session.run(tf.global_variables_initializer())

        fd = {
            model.is_training: False,
            model.inputs: doc_inputs,
            model.word_lengths: [
                [3, 4, 2],
                [3, 4, 0],
            ],
            model.sentence_lengths: [3, 2],
            model.labels: [0, 1],
            model.sample_weights: [1, 1],
        }

        # The classifier does not define Task ONE placeholders yet; feed the
        # parallel sentences only once the model exposes them.
        # TODO: add `source_sentences` / `target_sentences` to the model.
        for attr_name, value in (('source_sentences', source_sentences),
                                 ('target_sentences', target_sentences)):
            if hasattr(model, attr_name):
                fd[getattr(model, attr_name)] = value

        # Same data for training, but with dropout switched on.
        train_fd = dict(fd)
        train_fd[model.is_training] = True

        for iters in range(100):  # number of minibatches
            # Only the classification task has a training op so far.
            # TODO: alternate at random with the Task ONE step once it exists.
            _, loss_value = session.run([model.train_class_opt, model.loss], train_fd)

        print(session.run(model.logits, fd))