In [2]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence

https://pub.towardsai.net/whirlwind-tour-of-rnns-a11effb7808f

The number of hidden nodes (nb_nodes) is a hyperparameter:
- A high number might result in overfitting...
- Try out different numbers of hidden nodes, as always.
- The number of nodes should ideally be in line with the complexity and dimensionality of your input data. For high-dimensional data, you might need more nodes to effectively capture the data's structure.

 #### RNN layer

In [3]:
# Report the TensorFlow build this notebook is executed with.
print(f'tf-version {tf.__version__}')

# Every sample is a sequence of `nb_timesteps` steps with `nb_input_features`
# features per step; `nb_nodes` is the width of the first recurrent layer.
nb_input_features = 10000
nb_timesteps = 6
nb_nodes = 6

# Two stacked SimpleRNN layers. The first one returns its full output sequence
# so that the second recurrent layer still receives one vector per time step.
simple_rnn_model = tf.keras.models.Sequential()
simple_rnn_model.add(
    tf.keras.layers.SimpleRNN(
        units=nb_nodes,
        return_sequences=True,
        input_shape=(nb_timesteps, nb_input_features),
    )
)
# NOTE(review): the width of the last layer is taken from `nb_timesteps`, which
# only happens to equal `nb_nodes` here -- confirm that this is intended.
simple_rnn_model.add(
    tf.keras.layers.SimpleRNN(units=nb_timesteps, activation="softmax")
)

tf-version 2.10.1


#### LSTM layer

In [4]:
# Input: sequences of `nb_timesteps` steps with `nb_input_features` features each.
nb_input_features, nb_timesteps, nb_nodes = 10, 5, 4

# A single LSTM layer that returns only the output of the last time step.
lstm_layer = tf.keras.layers.LSTM(
    units=nb_nodes,
    input_shape=(nb_timesteps, nb_input_features),
)
lstm_model = tf.keras.models.Sequential([lstm_layer])
lstm_model.summary()

# Manual parameter count to compare with the summary: four weight sets, each
# with an input kernel, a recurrent kernel and one bias per node.
lstm_weights_per_node = nb_input_features + nb_nodes + 1
lstm_param_count = 4 * lstm_weights_per_node * nb_nodes
print(f'Number of trainable parameters = {lstm_param_count}')


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 4)                 240       
                                                                 
Total params: 240
Trainable params: 240
Non-trainable params: 0
_________________________________________________________________
Number of trainable parameters = 240


#### GRU layer

In [5]:
nb_input_features = 10
nb_timesteps = 5
nb_nodes = 4
batch = 32

# Random batch shaped (batch, timesteps, features), used only to inspect the
# output shapes of the layers below.
inputs = tf.random.normal([batch, nb_timesteps, nb_input_features])

# Default GRU: returns the output of the last time step only -> (batch, nb_nodes).
gru = tf.keras.layers.GRU(units=nb_nodes)
output = gru(inputs)
print(output.shape)

# return_sequences=True returns the output of every time step, and
# return_state=True additionally returns the final hidden state.
# The width comes from `nb_nodes` (previously a hard-coded 4) so that both
# layers follow the configuration at the top of the cell.
gru = tf.keras.layers.GRU(nb_nodes, return_sequences=True, return_state=True)
whole_sequence_output, final_state = gru(inputs)
print(whole_sequence_output.shape)

print(final_state.shape)

(32, 4)
(32, 5, 4)
(32, 4)


In [6]:
# Input: sequences of `nb_timesteps` steps with `nb_input_features` features each.
nb_input_features, nb_timesteps, nb_nodes = 10, 5, 4

# A single GRU layer that returns only the output of the last time step.
gru_layer = tf.keras.layers.GRU(
    units=nb_nodes,
    input_shape=(nb_timesteps, nb_input_features),
)
gru_model = tf.keras.models.Sequential([gru_layer])
gru_model.summary()

# Manual parameter count to compare with the summary. The result depends on
# whether `reset_after` is True or False: the formula below matches the layer
# as configured here; with reset_after=False, subtract 3 * nb_nodes parameters.
gru_weights_per_node = nb_input_features + nb_nodes + 2
gru_param_count = 3 * gru_weights_per_node * nb_nodes
print(f'Number of trainable parameters = {gru_param_count}')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_2 (GRU)                 (None, 4)                 192       
                                                                 
Total params: 192
Trainable params: 192
Non-trainable params: 0
_________________________________________________________________
Number of trainable parameters = 192


#### Bidirectional layer

In [7]:
# Input: sequences of `nb_timesteps` steps with `nb_input_features` features each.
nb_input_features, nb_timesteps, nb_nodes = 10, 5, 4

# Wrap one LSTM in a Bidirectional layer: the sequence is processed in both
# directions and the two results are merged (default merge mode).
forward_lstm = tf.keras.layers.LSTM(units=nb_nodes)
bidirectional_layer = tf.keras.layers.Bidirectional(
    layer=forward_lstm,
    input_shape=(nb_timesteps, nb_input_features),
)
bidirectional_concat_lstm_model = tf.keras.models.Sequential([bidirectional_layer])
bidirectional_concat_lstm_model.summary()

# Manual parameter count to compare with the summary: two LSTMs (one per
# direction), each with four weight sets of input kernel, recurrent kernel and bias.
lstm_weights_per_node = nb_input_features + nb_nodes + 1
bidirectional_param_count = 2 * 4 * lstm_weights_per_node * nb_nodes
print(f'Number of trainable parameters = {bidirectional_param_count}')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 8)                480       
 l)                                                              
                                                                 
Total params: 480
Trainable params: 480
Non-trainable params: 0
_________________________________________________________________
Number of trainable parameters = 480


##### One to One:

Description: This is the simplest form, where there is one input and one output. It's essentially a standard neural network structure applied to a single data point.
Example: A classic use case is a standard classification problem where you predict a single label from a single data point.

##### One to Many:

Description: In this architecture, a single input generates a sequence of outputs.
Example: An example is image captioning, where an image (single input) is used to generate a sequence of words forming a caption (many outputs).

##### Many to One:

Description: Here, a sequence of inputs leads to a single output. This is commonly used in tasks where the context or sequence is crucial for making a single prediction.
Example: Sentiment analysis is a typical application. A sequence of words (many inputs) is used to determine the sentiment of the sentence (one output).

##### Many to Many:

Description: This type involves a sequence of inputs and a sequence of outputs. There are two subtypes:

Synced Many to Many: The output is synchronized with the input at each timestep.

Async Many to Many: The input and output sequences are not synchronized.

Examples:

Synced: Part-of-speech tagging, where each word in a sentence (many inputs) is tagged with a part-of-speech label (many outputs), with each output corresponding to each input.

Async: Machine translation, where a sentence in one language (many inputs) is translated into another language (many outputs), but the lengths of the input and output sequences can be different.

#### Multiple recurrent layers

Stacking multiple recurrent layers is done by setting the argument **return_sequences** of a recurrent layer to **True**. This needs to be done for every recurrent layer except the last one (where we are only interested in the last time step, and therefore no longer need the entire sequence).

In [8]:
# Input dimensions and the width of each of the three recurrent layers.
nb_input_features = 10
nb_timesteps = 5
nb_nodes_1 = 4
nb_nodes_2 = 3
nb_nodes_3 = 2

# Every recurrent layer except the last must return its whole output sequence
# so the next recurrent layer still receives one vector per time step.
deep_rnn_model = tf.keras.models.Sequential()
deep_rnn_model.add(
    tf.keras.layers.LSTM(
        units=nb_nodes_1,
        return_sequences=True,
        input_shape=(nb_timesteps, nb_input_features),
    )
)
deep_rnn_model.add(tf.keras.layers.GRU(units=nb_nodes_2, return_sequences=True))
deep_rnn_model.add(
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(units=nb_nodes_3))
)
# A single linear output unit, e.g. as the final layer of a regression model.
deep_rnn_model.add(tf.keras.layers.Dense(units=1))

deep_rnn_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 5, 4)              240       
                                                                 
 gru_3 (GRU)                 (None, 5, 3)              81        
                                                                 
 bidirectional_1 (Bidirectio  (None, 4)                24        
 nal)                                                            
                                                                 
 dense (Dense)               (None, 1)                 5         
                                                                 
Total params: 350
Trainable params: 350
Non-trainable params: 0
_________________________________________________________________


#### n-grams of words (bag of words)

The main idea of using n-grams in a bag-of-words model for text prediction is to capture the frequency and context of word sequences in the text data. In this approach:

N-grams: These are sequences of 'n' consecutive words. For example, in a bigram (2-gram) model, you consider pairs of adjacent words. This helps in capturing some context and word order, unlike a single word (unigram) model.

Bag-of-Words: This model treats text as an unordered collection (or "bag") of words. It ignores grammar and word order, focusing only on the occurrence of words in the document.

When combined, n-grams in a bag-of-words framework provide a simple yet effective way of predicting text. The model can predict or classify new text based on the frequency and patterns of n-grams seen during training. It's a foundational technique in natural language processing used for tasks like language modeling, text classification, and sentiment analysis.

In [9]:
text = ["the cat sat on the mat"]

# Vectorizer that builds unigrams and bigrams (ngrams=2) and encodes a text as
# a multi-hot vector over the learned vocabulary.
encoder = tf.keras.layers.TextVectorization(
    ngrams=2,
    max_tokens=100,
    output_mode="multi_hot",
)
encoder.adapt(text)  # computes the vocabulary from the tokens in `text`

vocab = np.array(encoder.get_vocabulary())
print(f'length of vacabulary: {len(vocab)}')
print(f'vocabulary: {vocab}')

# Encode every vocabulary entry once and reuse the result for both the printout
# and the collected matrix (it was previously encoded twice per entry).
encoded_example = []
for ngram in vocab:
    multi_hot_row = list(encoder(ngram).numpy())
    print(ngram)
    print(multi_hot_row)
    encoded_example.append(multi_hot_row)

length of vacabulary: 11
vocabulary: ['[UNK]' 'the' 'the mat' 'the cat' 'sat on' 'sat' 'on the' 'on' 'mat'
 'cat sat' 'cat']
[UNK]
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
the
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
the mat
[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
the cat
[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
sat on
[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]
sat
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
on the
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]
on
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
mat
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cat sat
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]
cat
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


In [10]:
# Print the multi-hot rows collected in `encoded_example` one per line, after a
# heading; together they form the bag-of-words matrix.
matrix_rows = [str(row) for row in encoded_example]
print('\n'.join(['transforming the bag of words to a matrix:\n', *matrix_rows]))

transforming the bag of words to a matrix:

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]


#### Hashing

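Here we assign a numerical code to every word in our vocabulary. Unlike the n-gram example above, each code stands for a single word rather than for a pair of adjacent words. (For illustration, the codes in the next cell come from a hand-written lookup table; an actual hashing scheme computes them with a hash function, as `one_hot` does further below.)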

In [11]:
# Hand-written lookup table: every distinct word gets its own integer code,
# numbered in order of first appearance.
one_hot_dict = {
    word: code
    for code, word in enumerate(('the', 'cat', 'sat', 'on', 'mat'))
}

# Replace each word of the sentence by its integer code.
numerical_encoded_sentence = []
for word in 'the cat sat on the mat'.split(' '):
    numerical_encoded_sentence.append(one_hot_dict[word])
print(numerical_encoded_sentence)

[0, 1, 2, 3, 0, 4]


In [12]:
# One-hot encode the integer codes. `num_classes` is pinned to the size of the
# lookup table so the matrix always has one column per known word; without it
# the width is inferred from the largest code present in the sentence.
print(tf.keras.utils.to_categorical(numerical_encoded_sentence,
                                    num_classes=len(one_hot_dict)))

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]


#### Embeddings

##### Dimensionality Reduction:

Traditional one-hot encoding results in high-dimensional vectors (length equals the size of the vocabulary) which are mostly zeros, leading to sparsity and inefficiency.
Embeddings, on the other hand, represent words in a lower-dimensional continuous space, typically ranging from 50 to 300 dimensions. This significantly reduces the size of the representation.

##### Capturing Semantic Information:

Unlike one-hot vectors, embeddings are designed to capture semantic information, meaning that words with similar meanings are represented by similar vectors. This is a powerful feature that n-grams and hashing techniques do not inherently provide.
For example, in a well-trained embedding space, words like 'king' and 'queen' would have vectors that are close to each other.

##### Training:

Embeddings are learned from data, often using neural networks. Models like Word2Vec, GloVe, or FastText analyze the contexts in which words appear and use this information to construct the embedding space.
This learning process allows embeddings to capture subtle semantic and syntactic relationships between words.

##### Distinction from N-Grams:

N-grams capture local context and word order by treating sequences of words as single units. However, they don't inherently capture semantic similarity between words and result in high-dimensional, sparse representations.
Embeddings focus more on the semantic relationships and provide dense, lower-dimensional representations.

##### Distinction from Hashing:

Hashing techniques (like hash-trick in feature engineering) are used to handle large vocabularies efficiently but typically involve loss of information and do not account for semantics.
Word embeddings, in contrast, are a more nuanced way of representing text, preserving and leveraging semantic information.
In essence, embeddings represent a significant advancement in the field of NLP, offering a way to efficiently and effectively capture the nuances of language, far beyond the capabilities of older techniques like n-grams or hashing.

##### Hashing and N-Grams:

Frequency Focus: 

Both hashing and n-grams primarily capture the frequencies of word occurrences or sequences. N-grams consider the local order of words, offering a bit more context than individual word frequencies.
Sparsity and Dimensionality: These methods often lead to high-dimensional, sparse representations. In large vocabularies, this can result in memory inefficiency and computational challenges.

##### Potential Collisions:

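Collisions occur if different words are assigned the same integer. With a vocabulary size of 10 there are at most 10 distinct codes, so if your dataset has more than 10 unique words, collisions are unavoidable. Because the codes come from a hash function, collisions can also occur with fewer unique words.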

In [13]:
# Toy sentiment data set: five positive reviews followed by five negative ones.
positive_reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
]
negative_reviews = [
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]
reviews = positive_reviews + negative_reviews

# Labels aligned with `reviews`: 1 = positive, 0 = negative.
sentiment = np.array([1] * len(positive_reviews) + [0] * len(negative_reviews))

In [14]:
# Hash the same two phrases with a tiny and with a larger hashing space to
# show how the size affects collisions.
# NOTE(review): per the Keras documentation, `one_hot` uses Python's built-in
# `hash` by default, which is not stable across interpreter sessions, so the
# printed integers may differ between runs.
demo_phrases = ("nice food", "amazing restaurant")

vocabulary_size = 4
for phrase in demo_phrases:
    print(one_hot(phrase, vocabulary_size))
print("Is there a problem?...collisions?")

print("")
print("Trying with a larger vocabulary")
vocabulary_size = 10  # this value is reused when all reviews are encoded
for phrase in demo_phrases:
    print(one_hot(phrase, vocabulary_size))

[3, 2]
[3, 3]
Is there a problem?...collisions?

Trying with a larger vocabulary
[3, 8]
[9, 6]


##### Vector representation of the vocabulary

In [15]:
# Hash every review into a list of integers in the current hashing space.
encoded_reviews = []
for review in reviews:
    encoded_reviews.append(one_hot(review, vocabulary_size))
print(f'one hot encoding/hashing: {encoded_reviews}')
print("Any collisions?")

# Pad all sequences with trailing zeros to a common length so that they can be
# stacked into a single 2-D array.
max_length = 4
padded_reviews = pad_sequences(sequences=encoded_reviews,
                               maxlen=max_length,
                               padding='post')
print(f'one hot encoding/hashing: {padded_reviews}')

# Note that there can be a "collision": some words are encoded with the same
# integer! Increasing the vocabulary size reduces the likelihood of a
# collision... but what are the downstream effects of this? A larger embedding
# layer/matrix?

one hot encoding/hashing: [[3, 8], [9, 6], [8, 6], [9, 1, 5], [2, 1, 3], [5, 8], [1, 1, 5], [3, 4], [3, 5], [5, 2]]
Any collisions?
one hot encoding/hashing: [[3 8 0 0]
 [9 6 0 0]
 [8 6 0 0]
 [9 1 5 0]
 [2 1 3 0]
 [5 8 0 0]
 [1 1 5 0]
 [3 4 0 0]
 [3 5 0 0]
 [5 2 0 0]]


##### An approach that eliminates collisions without increasing the vocabulary

In [16]:
# Upper bound on the number of vocabulary entries the vectorizer may learn.
MAX_VOCAB_SIZE = 40

# Learn an explicit vocabulary from the reviews; every distinct token gets its
# own integer index, so distinct words cannot collide the way hashed words can.
encoder = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCAB_SIZE)
encoder.adapt(reviews)
vocab = np.array(encoder.get_vocabulary())
print(f'length of vacabulary: {len(vocab)}')

# Map every review to its sequence of vocabulary indices and pad with trailing
# zeros to a common length.
encoded_example = encoder(reviews).numpy()
max_length = 4
padded_reviews = pad_sequences(encoded_example, maxlen=max_length,
                               padding='post')

# The labels now describe what is printed: first the raw reviews, then their
# vocabulary-index encoding (both were previously labelled as hashing output).
print(f'reviews: {reviews}')
print(f'vocabulary-index encoding (padded):\n {padded_reviews}')

length of vacabulary: 22
one hot encoding/hashing: ['nice food', 'amazing restaurant', 'too good', 'just loved it!', 'will go again', 'horrible food', 'never go there', 'poor service', 'poor quality', 'needs improvement']
one hot encoding/hashing:
 [[11  4  0  0]
 [20  9  0  0]
 [ 6 19  0  0]
 [15 14 16  0]
 [ 5  3 21  0]
 [18  4  0  0]
 [12  3  7  0]
 [ 2  8  0  0]
 [ 2 10  0  0]
 [13 17  0  0]]


##### Building a simple model with an embedding layer

In [17]:
# Length of the dense vector learned for every vocabulary entry.
embedding_dimension = 5

# Embedding -> Flatten -> single sigmoid unit for binary sentiment prediction.
# The embedding table has one row per vocabulary entry (`len(vocab)`), and each
# input is a sequence of `max_length` token indices.
embedding_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embedding_dimension,
                              input_length=max_length, name="embedding"),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

embedding_model.compile(optimizer='adam', loss='binary_crossentropy',
                        metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
embedding_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 5)              110       
                                                                 
 flatten (Flatten)           (None, 20)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 21        
                                                                 
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________
None


Defining input and output and fitting/evaluating the model:

In [18]:
# Model inputs are the padded index sequences; targets are the sentiment labels.
X, y = padded_reviews, sentiment

In [19]:
# Train the embedding model on the ten reviews.
training_history = embedding_model.fit(x=X, y=y, epochs=50, verbose=1)

# Evaluate on the same data the model was trained on (there is no held-out set
# in this toy example) and display the resulting accuracy as the cell output.
evaluation_metrics = embedding_model.evaluate(x=X, y=y)
loss, accuracy = evaluation_metrics
accuracy

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


1.0

Comparing the estimated word vectors

In [20]:
# Learned embedding matrix: one row per vocabulary index.
weights = embedding_model.get_layer(name='embedding').get_weights()[0]

# (label, review row) pairs; the first token of each listed review is the word
# whose learned vector is printed.
first_word_examples = (
    ('Horrible', 5),
    ('Poor', 7),
    ('Nice', 0),
    ('Amazing', 1),
)
for label, review_row in first_word_examples:
    token_index = padded_reviews[review_row][0]
    print(f'{label}: {weights[token_index]}')

Horrible: [ 0.06484096  0.06660457  0.02772433 -0.06045873  0.01426301]
Poor: [ 0.04897718  0.05049273  0.09752455 -0.00280119  0.0686097 ]
Nice: [-0.02680342 -0.037039   -0.09945454  0.00261691 -0.06602365]
Amazing: [-0.09734542 -0.02814195 -0.06347525  0.018133   -0.08245863]


#### A recurrent model with an embedding layer

In [21]:
# Embedding -> GRU -> softmax classifier over 10 classes. No input length is
# given, so the model accepts index sequences of any length.
recurrent_stack = [
    tf.keras.layers.Embedding(input_dim=1000, output_dim=128),
    tf.keras.layers.GRU(units=64),
    tf.keras.layers.Dense(units=10, activation='softmax'),
]
model = tf.keras.models.Sequential(recurrent_stack)

model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         128000    
                                                                 
 gru_4 (GRU)                 (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 10)                650       
                                                                 
Total params: 165,898
Trainable params: 165,898
Non-trainable params: 0
_________________________________________________________________


#### A convolutional recurrent model for sequence 
- Not a part of Christian's lecture...

#### RNN for predicting - generating text
- Important apply for exercise 5?