# [Assignment 5](https://ovgu-ailab.github.io/idl2023/assignment5.html)

Collaborative Work from Adrian Bremer and Philipp Reinig

## Setup

In [2]:
import tensorflow as tf
import numpy as np

## Preparing IMDB

In [3]:
num_words = 20000
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=num_words)

In [4]:
train_labels

array([1, 0, 0, ..., 0, 1, 0])

In [5]:
sequence_lengths = [len(sequence) for sequence in train_sequences]

max_len = max(sequence_lengths)
print(max_len)
mean_len = int(np.mean(sequence_lengths))
print(mean_len)

2494
238


**Ideas for using not the full-length padding scheme**
- use the mean length and every other word is _UNKNOWN_
  - _truncating_ instead of throwing away since the long sequences are important too because when they are longer the way it is written is different & truncating _the back_ (post) because mostly the first few words are like "Ehh, this is bad"

In [18]:
train_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences,
    maxlen=mean_len, # max_len
    padding="pre",
    truncating="post"
)

In [7]:
train_sequences_padded.shape

(25000, 238)

In [8]:
# one_hot_sequences_padded = tf.one_hot(indices=train_sequences_padded, depth=num_words)

**Problem**
- the one-hot-vectors are too large so RAM isn't enough

**Ideas to fix this**
- for storage use integers as vectors because we only need 1 or 0
  - here the indices could be used right away
- or just use the indices and **construct the one-hot-vectors for each batch**
- or encode words of a sequence in one vector but that is not so nice since the word ordering is lost
- _Target encoding_ - but that is probably not so easy to adapt to this problem since we need to classify multiple sentences and not single words & we would give away which sentences are important for the classification

In [19]:
train_data = tf.data.Dataset.from_tensor_slices((train_sequences_padded, train_labels)).shuffle(60000).repeat().batch(64)

In [10]:
for x,y in train_data:
  print(y)
  break

tf.Tensor(
[0 1 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 0 1 0 1
 1 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 1 1], shape=(64,), dtype=int64)


In [20]:
sequence_lengths = [len(sequence) for sequence in test_sequences]
max_len = max(sequence_lengths)

test_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
    padding="pre",
    truncating="post"
)
test_data = tf.data.Dataset.from_tensor_slices((test_sequences_padded, test_labels)).shuffle(60000)

## Building the RNN

In [36]:
class RNN:
  def __init__(self, num_features, num_outputs, num_hidden_units, rand_init_low_high=0.1, activation=tf.nn.sigmoid):
    self.nhidden = num_hidden_units
    self.activation = activation
    self.num_words = num_features
    self.Wh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_hidden_units)).astype(np.float32))
    self.bh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_features, num_hidden_units)).astype(np.float32))
    self.bx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wy = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_outputs)).astype(np.float32))
    self.by = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_outputs)).astype(np.float32))

  def cell(self, xt, ht_1):
    """
    xt: input x at timestep t
    ht_1: output from hidden unit in previous timestep t-1

    returns: yt, ht
    """
    xo = tf.matmul(xt, self.Wx) + self.bx
    ho = tf.matmul(ht_1, self.Wh) + self.bh
    logits = xo + ho
    ht = self.activation(logits)
    #yt = tf.matmul(self.Wy, ht) + self.by
    return ht

  def compute_y(self, ht):
    return tf.matmul(ht, self.Wy) + self.by

  def __call__(self, x):
    """
    batch is of shape (batch_size, timesteps, features)
    returns: the logits in the last timestep
    """
    # first hidden activation is 0
    ht = tf.Variable(np.zeros(shape=(x.shape[0], self.nhidden)).astype(np.float32))
    one_hot_vectors = tf.one_hot(indices=np.swapaxes(x,0,1), depth=self.num_words) # swap batch & timestep to get (timesteps, batch_size, features)
    for xt in one_hot_vectors:
      # only if not padded
      if tf.reduce_max(xt[:,0]) != 0:
        ht = self.cell(xt, ht)

    return self.compute_y(ht)

  def train(self, train_data, num_epochs, steps_per_epoch, optimizer=tf.optimizers.Adam(), loss_fn=tf.losses.CategoricalCrossentropy(from_logits=True)):
    variables = [self.Wh, self.Wx, self.Wy, self.bh, self.bx, self.by]
    optimizer.build(variables)
    for epoch in range(num_epochs):
      losses = []
      for i,(x,y) in enumerate(train_data):
        if i >= steps_per_epoch:
          break
        with tf.GradientTape() as tape:
          logits = self(x)
          loss = loss_fn(y, tf.reshape(logits, (x.shape[0],)))

        losses.append(loss)

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

      print("Epoch {} done: loss = {}".format(epoch, np.mean(losses)))


In [37]:
rnn = RNN(num_words, 1, 100)
rnn.train(train_data, 10, 2)

Epoch 0 done: loss = 133.09063720703125
Epoch 1 done: loss = 139.35427856445312
Epoch 2 done: loss = 135.17076110839844
Epoch 3 done: loss = 143.5218505859375
Epoch 4 done: loss = 126.86592864990234
Epoch 5 done: loss = 149.6897735595703
Epoch 6 done: loss = 145.5753173828125
Epoch 7 done: loss = 131.01925659179688
Epoch 8 done: loss = 103.99049377441406
Epoch 9 done: loss = 137.21192932128906


In [38]:
accuracy = 0
for n, (x,y) in enumerate(test_data.batch(1)):
  guess = rnn(x)
  if (guess > 0.5 and y == 1) or (guess <= 0.5 and y == 0):
    accuracy = (n)/(n+1) * accuracy + 1/(n+1)
  else:
    accuracy = (n)/(n+1) * accuracy

  print("\r" + str(accuracy), end='', flush=True)


# E_n = 1/n * sum(1,n,x_i)
#     = 1/n (sum(1,n-1,x_i)+x_n)
#     = (n-1)/n * 1/(n-1) * sum(1,n-1,x_i) + x_n/n
#     = (n-1)/n * E_n-1 + x_n/n

0.55

KeyboardInterrupt: ignored

**Observations**
- the forward step is really slow
- therefore training this takes ages
  - _**How can you speed this up or is it correct because of the sequential manner of the RNN?**_
- _currently_: network is guessing (accuracy of around 50%)

**Thoughts about outputs**
- having one output means that e.g. 1 is hate speech and 0 is not
  - therefore, the relation between them is hate_speech = 1 - friendly
- having 2 output units on the other hand means that output unit 1 is for hate and output unit 2 is for friendly
  - here they don't necessarly need to add up to 1 since a text may criticise but also emphasises good parts of the movie
- HERE: it is easier with one unit because you can directly compare with the given binary labels -> _there is no such intermediate thing as mentioned above_


## Open problems

**initial state**
- in this case there is no need to learn an initial state because each review is independent of the other ones
- the initial state would give a tendence if a review is positiv or negative and this is not feasible here

**when to pad**
- pre-padding would probably be better, because when the sentence ends we generate the output instead of having to feed the current hidden activation through he network

**avoid computing padded sequences**
- Since we padded with 0, we could check if hot_vector[0] contains a 1
- if it contains it, we could skip

## Use outputs from every timestep