# [Assignment 5](https://ovgu-ailab.github.io/idl2023/assignment5.html)

Collaborative Work from Adrian Bremer & Philipp Reinig

## Setup

In [1]:
import tensorflow as tf
import numpy as np

## Preparing IMDB

In [2]:
num_words = 20000
(train_sequences, train_labels), (test_sequences, test_labels) = tf.keras.datasets.imdb.load_data(num_words=num_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
train_labels

array([1, 0, 0, ..., 0, 1, 0])

In [4]:
sequence_lengths = [len(sequence) for sequence in train_sequences]

max_len = max(sequence_lengths)
print(max_len)
mean_len = int(np.mean(sequence_lengths))
print(mean_len)

2494
238


**Ideas for using not the full-length padding scheme**
- use the mean length and every other word is _UNKNOWN_
  - _truncating_ instead of throwing away since the long sequences are important too because when they are longer the way it is written is different & truncating _the back_ (post) because mostly the first few words are like "Ehh, this is bad"

In [5]:
train_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences,
    maxlen=mean_len, # max_len
    padding="pre",
    truncating="post"
)

In [6]:
train_sequences_padded.shape

(25000, 238)

In [7]:
# one_hot_sequences_padded = tf.one_hot(indices=train_sequences_padded, depth=num_words)

**Problem**
- the one-hot-vectors are too large so RAM isn't enough

**Ideas to fix this**
- for storage use integers as vectors because we only need 1 or 0
  - here the indices could be used right away
- or just use the indices and **construct the one-hot-vectors for each batch**
- or encode words of a sequence in one vector but that is not so nice since the word ordering is lost
- _Target encoding_ - but that is probably not so easy to adapt to this problem since we need to classify multiple sentences and not single words & we would give away which sentences are important for the classification

In [20]:
BATCH_SIZE = 64

In [8]:
train_data = tf.data.Dataset.from_tensor_slices((train_sequences_padded, train_labels)).shuffle(60000).repeat().batch(BATCH_SIZE)

In [9]:
for x,y in train_data:
  print(y)
  break

tf.Tensor(
[1 1 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0
 0 0 1 1 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0], shape=(64,), dtype=int64)


In [10]:
sequence_lengths = [len(sequence) for sequence in test_sequences]
max_len = max(sequence_lengths)

test_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
    padding="pre",
    truncating="post"
)
test_data = tf.data.Dataset.from_tensor_slices((test_sequences_padded, test_labels)).shuffle(60000)

## Building the RNN

In [38]:
class RNN:
  def __init__(self, num_features, num_outputs, num_hidden_units, rand_init_low_high=0.1, activation=tf.nn.sigmoid):
    self.nhidden = num_hidden_units
    self.activation = activation
    self.num_words = num_features
    self.Wh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_hidden_units)).astype(np.float32))
    self.bh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_features, num_hidden_units)).astype(np.float32))
    self.bx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wy = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_outputs)).astype(np.float32))
    self.by = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_outputs)).astype(np.float32))

  def cell(self, xt, ht_1):
    """
    xt: input x at timestep t
    ht_1: output from hidden unit in previous timestep t-1

    returns: yt, ht
    """
    xo = tf.matmul(xt, self.Wx) + self.bx
    ho = tf.matmul(ht_1, self.Wh) + self.bh
    logits = xo + ho
    ht = self.activation(logits)
    #yt = tf.matmul(self.Wy, ht) + self.by
    return ht

  def compute_y(self, ht):
    return tf.matmul(ht, self.Wy) + self.by

  def __call__(self, x):
    """
    batch is of shape (batch_size, timesteps, features)
    returns: the logits in the last timestep
    """
    # first hidden activation is 0
    ht = tf.constant(np.zeros(shape=(x.shape[0], self.nhidden)).astype(np.float32))
    one_hot_vectors = tf.one_hot(indices=tf.experimental.numpy.swapaxes(x,0,1), depth=self.num_words) # swap batch & timestep to get (timesteps, batch_size, features)
    for xt in one_hot_vectors:
      # only if not padded
      if tf.reduce_max(xt[:,0]) != 0:
        ht = self.cell(xt, ht)

    return self.compute_y(ht)

  def train(self, train_data, num_epochs, steps_per_epoch, batch_size, optimizer=tf.optimizers.Adam(), loss_fn=tf.losses.CategoricalCrossentropy(from_logits=True)):
    variables = [self.Wh, self.Wx, self.Wy, self.bh, self.bx, self.by]
    optimizer.build(variables)
    for epoch in range(num_epochs):
      losses = []
      for i,(x,y) in enumerate(train_data):
        if i >= steps_per_epoch:
          break
        with tf.GradientTape() as tape:
          logits = self(x)
          # shape of logits is (BATCH_SIZE,1) -> needs reshape
          loss = loss_fn(y, tf.reshape(logits, (batch_size,)))

        losses.append(loss)

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

      print("Epoch {} done: loss = {}".format(epoch, np.mean(losses)))


In [39]:
rnn = RNN(num_words, 1, 100)
rnn.train(train_data, 10, 5, BATCH_SIZE)

Epoch 0 done: loss = 130.5621337890625
Epoch 1 done: loss = 124.7631607055664
Epoch 2 done: loss = 143.0655517578125
Epoch 3 done: loss = 141.4359893798828
Epoch 4 done: loss = 138.10482788085938
Epoch 5 done: loss = 127.27082824707031
Epoch 6 done: loss = 123.10667419433594
Epoch 7 done: loss = 126.40826416015625
Epoch 8 done: loss = 129.7408447265625
Epoch 9 done: loss = 129.75808715820312


In [40]:
TEST_SIZE = 100

accuracy = 0
for n, (x,y) in enumerate(test_data.batch(1)):
  if n >= TEST_SIZE:
    break

  guess = rnn(x)
  if (guess > 0.5 and y == 1) or (guess <= 0.5 and y == 0):
    accuracy = (n)/(n+1) * accuracy + 1/(n+1)
  else:
    accuracy = (n)/(n+1) * accuracy

  print("\r" + str(accuracy), end='', flush=True)


# E_n = 1/n * sum(1,n,x_i)
#     = 1/n (sum(1,n-1,x_i)+x_n)
#     = (n-1)/n * 1/(n-1) * sum(1,n-1,x_i) + x_n/n
#     = (n-1)/n * E_n-1 + x_n/n

0.47999999999999987

**Observations**
- the forward step is really slow
- therefore training this takes ages
  - _**How can you speed this up or is it correct because of the sequential manner of the RNN?**_
- _currently_: network is guessing (accuracy of around 50%)

**Thoughts about outputs**
- having one output means that e.g. 1 is hate speech and 0 is not
  - therefore, the relation between them is hate_speech = 1 - friendly
- having 2 output units on the other hand means that output unit 1 is for hate and output unit 2 is for friendly
  - here they don't necessarly need to add up to 1 since a text may criticise but also emphasises good parts of the movie
- HERE: it is easier with one unit because you can directly compare with the given binary labels -> _there is no such intermediate thing as mentioned above_


## Open problems

**initial state**
- in this case there is no need to learn an initial state because each review is independent of the other ones
- the initial state would give a tendence if a review is positiv or negative and this is not feasible here

**when to pad**
- pre-padding would probably be better, because when the sentence ends we generate the output instead of having to feed the current hidden activation through he network

**avoid computing padded sequences**
- Since we padded with 0, we could check if hot_vector[0] contains a 1
- if it contains it, we could skip

## Use outputs from every timestep

**incorporating all outputs**
- averaging logits
- averaging hidden states
  - if logits means the logits before the sigmoid activation to get the next hidden state, then _states_ and logits are the same
  - if logits means the logits before the activation when computing the output y, then it is different
    - here: the logits would have gone through one more matrix multiplication and there are less logits than in the hidden states
- averaging sigmoids
  - this means that we average how sure the network was during reading the text from beginning to end
  - this is similar to a human reading it and trying to understand if it is positive or negative
  - but averaging this - especially per word - can be bad because many words on their own aren't important for the meaning
  - is different because we average wth the non-linearities
  - especially with sigmoid is it bad, because really large logits map to nearly the same value as quite small logits -> many smaller logits can out weight the really large one where the network was really sure

**advantages of such techniques**
- when the model thinks really hard at the beginning that this is a negative / positive text, it needs to propagate this through the whole network
- but with these averaging techniques there are some kind of skip connections to the end
- this improves gradient flow and makes it easier to propagate information through the network

**disadvantages**
- makes it a little bit more complicated
- can be problematic because the network has more options and can behave worse in some cases

> **_use averaging logits (before y)_** because it keeps quite a lot of information and Wy is used more often and can be trained better instead of having just 1 operation with it

In [45]:
class RNN2:
  def __init__(self, num_features, num_outputs, num_hidden_units, rand_init_low_high=0.1, activation=tf.nn.sigmoid):
    self.nhidden = num_hidden_units
    self.activation = activation
    self.num_words = num_features
    self.ys = []
    self.Wh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_hidden_units)).astype(np.float32))
    self.bh = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_features, num_hidden_units)).astype(np.float32))
    self.bx = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units)).astype(np.float32))
    self.Wy = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_hidden_units, num_outputs)).astype(np.float32))
    self.by = tf.Variable(np.random.uniform(low=-rand_init_low_high, high=rand_init_low_high, size=(num_outputs)).astype(np.float32))

  def cell(self, xt, ht_1):
    """
    xt: input x at timestep t
    ht_1: output from hidden unit in previous timestep t-1

    returns: yt, ht
    """
    xo = tf.matmul(xt, self.Wx) + self.bx
    ho = tf.matmul(ht_1, self.Wh) + self.bh
    logits = xo + ho
    ht = self.activation(logits)
    # keep track of logits at each timestep
    self.ys.append(self.compute_y(ht))
    return ht

  def compute_y(self, ht):
    return tf.matmul(ht, self.Wy) + self.by

  def average_output_guess(self):
    if len(self.ys) <= 0:
      raise AssertionError("There was no output (logit) recorded.")
    # reduce along the timestep axis -> keeping batches
    return tf.nn.sigmoid(tf.reduce_mean(self.ys, axis=0))

  def __call__(self, x):
    """
    batch is of shape (batch_size, timesteps, features)
    returns: the logits in the last timestep
    """
    # initialize logits tracking
    self.ys = []
    # first hidden activation is 0
    ht = tf.Variable(np.zeros(shape=(x.shape[0], self.nhidden)).astype(np.float32))
    one_hot_vectors = tf.one_hot(indices=np.swapaxes(x,0,1), depth=self.num_words) # swap batch & timestep to get (timesteps, batch_size, features)
    for xt in one_hot_vectors:
      # only if not padded
      if tf.reduce_max(xt[:,0]) != 0:
        ht = self.cell(xt, ht)

    return self.average_output_guess()

  def train(self, train_data, num_epochs, steps_per_epoch, optimizer=tf.optimizers.Adam(), loss_fn=tf.losses.CategoricalCrossentropy(from_logits=True)):
    variables = [self.Wh, self.Wx, self.Wy, self.bh, self.bx, self.by]
    optimizer.build(variables)
    for epoch in range(num_epochs):
      losses = []
      for i,(x,y) in enumerate(train_data):
        if i >= steps_per_epoch:
          break
        with tf.GradientTape() as tape:
          logits = self(x)
          loss = loss_fn(y, tf.reshape(logits, (x.shape[0],)))

        losses.append(loss)

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

      print("Epoch {} done: loss = {}".format(epoch, np.mean(losses)))


In [46]:
rnn2 = RNN2(num_words, 1, 100)
rnn2.train(train_data, 10, 1)

Epoch 0 done: loss = 145.56103515625
Epoch 1 done: loss = 145.5611114501953
Epoch 2 done: loss = 112.28897857666016
Epoch 3 done: loss = 149.7200927734375
Epoch 4 done: loss = 124.76795959472656
Epoch 5 done: loss = 112.2895278930664
Epoch 6 done: loss = 158.0383758544922
Epoch 7 done: loss = 158.03810119628906
Epoch 8 done: loss = 124.76657104492188
Epoch 9 done: loss = 95.65647888183594


In [52]:
accuracy = 0
for n, (x,y) in enumerate(test_data.batch(1)):
  if n >= TEST_SIZE:
    break

  guess = rnn2(x)
  if (guess > 0.5 and y == 1) or (guess <= 0.5 and y == 0):
    accuracy = (n)/(n+1) * accuracy + 1/(n+1)
  else:
    accuracy = (n)/(n+1) * accuracy

  print("\rStep {}: accuracy = {}".format(n,accuracy), end='', flush=True)

Step 99: accuracy = 0.5299999999999998

**Conclusion**
- using all intermediate outputs and averaging them seems to work a little bit better than only taking the last one
- therefore, _**training now intensively**_

In [53]:
rnn2.train(train_data, 20, 100)

Epoch 0 done: loss = 134.2486572265625
Epoch 1 done: loss = 133.5832977294922
Epoch 2 done: loss = 134.74778747558594
Epoch 3 done: loss = 132.50201416015625
Epoch 4 done: loss = 133.91603088378906
Epoch 5 done: loss = 135.0805206298828
Epoch 6 done: loss = 134.58145141601562
Epoch 7 done: loss = 131.91976928710938
Epoch 8 done: loss = 134.49827575683594
Epoch 9 done: loss = 131.75341796875
Epoch 10 done: loss = 134.8309783935547
Epoch 11 done: loss = 136.32818603515625
Epoch 12 done: loss = 134.49827575683594
Epoch 13 done: loss = 135.1636962890625
Epoch 14 done: loss = 133.91604614257812
Epoch 15 done: loss = 132.3356475830078
Epoch 16 done: loss = 129.008544921875
Epoch 17 done: loss = 132.2524871826172
Epoch 18 done: loss = 140.15435791015625
Epoch 19 done: loss = 133.91603088378906


## Further possible improvements

We had the idea that adjectives and adverbs are most important for classifying a text as positive or negative.
Therefore, we tried some preprocessing which shrinks the vocabulary size and wouldspeed things up.

In [None]:
!pip install lemminflect

In [None]:
import statistics
import string

import nltk

from lemminflect import getAllLemmas

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

In [None]:
# remove infrequent words. you can play with this parameter as it will likely impact model quality
num_words = 5000

# Use the default parameters to keras.datasets.imdb.load_data
start_char = 1
oov_char = 2
index_from = 3

# Retrieve the dataset
(train_seqs, train_labels), (test_seqs, test_labels) = keras.datasets.imdb.load_data(start_char=start_char, oov_char=oov_char, index_from=index_from, num_words=num_words)

# Retrieve the word index file mapping words to indices
word_index = dict(sorted(keras.datasets.imdb.get_word_index().items()))
word_index["[START]"] = start_char
word_index["[OOV]"] = oov_char

print(f"First 100 elements of the word_index: {list(word_index.items())[:100]}")

# Reverse the word index to obtain a dict mapping indices to words
# And add `index_from` to indices to sync with `x_train`
inverted_word_index = dict((i + index_from, word) for (word, i) in word_index.items())
# Update `inverted_word_index` to include `start_char` and `oov_char`
inverted_word_index[start_char] = "[START]"
inverted_word_index[oov_char] = "[OOV]"

inverted_word_index = dict(sorted(inverted_word_index.items()))

In [None]:
def indx_to_word(i: int) -> str:
  return inverted_word_index.get(i, "UNKNOWN")

def word_to_indx(w: str) -> int:
  return word_index[w]+3

def seq_to_words(seq: list[str]) -> str:
  """
  Converts a sequence (a list of indices, decoding words) to decoded string (a list of strings the indices decode)
  """
  return [indx_to_word(indx) for indx in seq]

def seq_to_text(seq: list[str]) -> str:
  """
  Converts a sequence (a list of indices, decoding words) to decoded human readable text (one large string)
  """
  return " ".join(seq_to_words(seq))

In [1]:
# NLTK library has a really bad performance for detecting adjectives ~ 50%
def is_adjective_or_adverb_using_nltk(s: str) -> bool:
  nltk_type_of_word = nltk.pos_tag([s], tagset="universal")[0][1]
  is_adjective_or_adverb = nltk_type_of_word == "ADJ" or nltk_type_of_word == "ADV"
  #if is_adjective_or_adverb:
  print(f"{s} {'is an adjective or adverb' if is_adjective_or_adverb else f'is not an adjective or adverb, it is: {nltk_type_of_word}'}")
  return is_adjective_or_adverb

# Lemminflect's performance is > 90%, way better!
def is_adjective_or_adverb(s: str) -> bool:
  lemmas = getAllLemmas(s)
  #print(f"{s}'s lemmas: {lemmas}")
  return "ADJ" in lemmas or "ADV" in lemmas

In [None]:
# Pre-process a single string
def pre_process_str(s: str) -> str:

  # Remove punctuation
  def remove_punctuation(s: str) -> str:
    s = s.translate(str.maketrans('', '', string.punctuation))
    return s

  return remove_punctuation(s)


# Pre-process an entire sequence
def pre_process_sequence(seq: list[int]) -> list[int]:
  words = seq_to_words(seq)
  words_processed = [w for w in words if(w != "UNKNOWN" and  w != "[OOV]" and w != "[START]" and is_adjective_or_adverb(w))]
  return [word_to_indx(w) for w in words_processed]


In [52]:
print(f"Without processing: {seq_to_text(train_seqs[0])}")
print(f"After preprocessing: {seq_to_text(pre_process_sequence(train_seqs[0]))}")

Without processing: [START] this film was just brilliant casting location scenery story direction everyone's really suited [START] part they played [OOV] you could just imagine being there robert [OOV] is an amazing actor [OOV] now [START] same being director [OOV] father came from [START] same scottish island as myself so i loved [START] fact there was a real connection with this film [START] witty remarks throughout [START] film were great it was just brilliant so much that i bought [START] film as soon as it was released for [OOV] [OOV] would recommend it to everyone to watch [OOV] [START] fly [OOV] was amazing really cried at [START] end it was so sad [OOV] you know what they say if you cry at a film it must have been good [OOV] this definitely was also [OOV] to [START] two little [OOV] that played [START] [OOV] of norman [OOV] paul they were just brilliant children are often left out of [START] [OOV] list i think because [START] stars that play them all grown up are such a big [OO

Unfortunatly we had _**no time to try it out**_ with the model because doing research for this was really time intensive.