In [1]:
import tensorflow as tf
import pandas as pd


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


## Natural Language Processing

Natural Language Processing (or NLP for short) is a discipline in computing that deals with the communications between natural (human) languages and computer languages. A common example of NLP is something like spell check or autocomplete. Essentially NLP is the field that focuses on how computers can understand and/or process natural/human languages. 

## Recurrent Neural Networks

RNNs are capable of processing sequential data such as text or characters 

- Sentiment Analysis
- Character Generation

## Sequence Data

In [2]:
vocab = {}
word_encoding = 1
def bag_of_words(text):
    global word_encoding


    words = text.lower().split(" ")
    bag = {}

    for word in words:
        if word in vocab:
            encoding - vocab[word]
        else:
            vocab[word] = word_encoding
            encoding = word_encoding
            word_encoding += 1
    
        if encoding in bag:
            bag[encoding] += 1
        else:
            bag[encoding] = 1
    return bag

text = "this is a test to see if this test will work is is test"
bag = bag_of_words(text)
print(bag)
print(vocab)     

{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 3, 8: 1, 9: 4}
{'this': 1, 'is': 2, 'a': 3, 'test': 4, 'to': 5, 'see': 6, 'if': 7, 'will': 8, 'work': 9}


## Sentiment Analysis

The definition is the process of computationally identifying and categorizing opinions expressed in a piece of text, especially to determine whether the writer's attitude towards a particular topic, or product is positive, negative, or neutral.

## Movie Review Dataset

In [3]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os 
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words= VOCAB_SIZE)


In [4]:
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

## Move Preprocessing

If we have a look at some of our loaded reviews we'll notice that they are different lengths. This is an issue. We cannot pass different lengths of data into our neural network. Therefore we must make each review the same length. To do this we will follow the procedure below.

- If the review is greater than 250 words then trim off each the extra words. 
- If the review is less than 250 words add the necessary amount of 0's to make it equal to 250

In [5]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

## Creating the Model

We will use a word embedding layer as the first layer in our model and add an LSTM layer afterward that feeds into a dense node to get our predicted sentiment.

32 stands for the output dimension of the vectors generated by the embedding layer. 

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2843041 (10.85 MB)
Trainable params: 2843041 (10.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Training

In [8]:
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.47918543219566345, 0.858959972858429]


## Making Predictions

In [10]:
import keras

word_index = imdb.get_word_index()

def encode_text(text):
    tokens = keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [11]:
reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + ""
    return text[:-1]
print(decode_integers(encoded))

thatmoviewasjustamazingsoamazin


In [12]:
# time to make a prediction

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    
positve_review = "That movie was so awesome! I really loved it and would watch it again because it was amazing"
predict(positve_review)

negative_review = "that movie sucked. I hated it and would not watch it again. Was one of the worst things I've ever watched" 
predict(negative_review)

[0.82422864]
[0.41191435]


## RNN Poem/ Play Generator

Now we are going to use an RNN to generate a play. We will simply show the RNN an example of something we want it to recreate and it will learn how to write a version of it on its own. We'll do this using a character predictive model that will take as input a variable-length sequence and predict the next character. We can use the model many times in a row with the output from the last predictions as the input for the next call to generate a sequence.

In [13]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os 
import numpy as np

## Dataset

In [14]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

## Loading Your own Data

In [15]:
# from google.colab import files
# path_to_file = list(files.upload().keys())[0]

## Read Contents of File

In [16]:
# Read, then decode for py2 compat.

text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it 
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [17]:
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



## Encoding

Since this text isn't encoded yet well need to do that ourself. We are going to encode each unique character as a different ineger

In [18]:
vocab = sorted(set(text))
# creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def text_to_int(text):
    return np.array([char2idx[c] for c in text])

text_as_int = text_to_int(text)

In [19]:
# lets look at how part of our text is encoded
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


And here we will make a function that can convert our numeric values to text.

In [20]:
def int_to_text(ints):
    try:
        ints = ints.numpy()
    except:
        pass 
    return ''.join(idx2char[ints])

print(int_to_text(text_as_int[:13]))

First Citizen


## Creating Training Examples

Remembering our task is to feed the model a sequence and have it return to us the next character. This means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples.

The training we will prepare will use a seq_length sequence as input and a seq_length sequence as the output where that sequence is the original sequence shifted one letter to the right. 

In [21]:
seq_length = 100 # length of sequence for a training example
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Next we can use the batch method to turn this stream of characters into batches of desired length

In [22]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

Now we need to use these sequences of length 101 and split them into input and output 

In [23]:
def split_input_target(chunk): # for the example:hello
    input_text = chunk[:-1] # hello
    target_text = chunk[1:] # ello
    return input_text, target_text # hell, ello

dataset = sequences.map(split_input_target) # we use map to apply the above function to every entry

In [24]:
for x, y in dataset.take(2):
    print("\n\nEXAMPLE\n")
    print("INPUT")
    print(int_to_text(x))
    print("\nOUTPUT")
    print(int_to_text(y))



EXAMPLE

INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE

INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT
re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Fianlly we need to make training batches

In [25]:
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab) # vocab is the number of unique characers
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequence,
# so it doesn
# t attempt to shuffle the entire sequence in memory. Instead, 
# it maintains a buffer in which it shuffles elements)

BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

## Building the Model

Now it is time to build the model. We will use an embedding layer LSTM, and one dense layer that contains a node for each unique character in our training data. The dense layer will give us a probability. 

In [26]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                         return_sequences=True,
                         stateful=True,
                         recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (64, None, 256)           16640     
                                                                 
 lstm_1 (LSTM)               (64, None, 1024)          5246976   
                                                                 
 dense_1 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 5330241 (20.33 MB)
Trainable params: 5330241 (20.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Creating a Loss Function

Now we are going to create our loss function for this problem. This is because our model will output a (64, sequence_length, 65) shaped tensor that represents the probability distribution of each character at each timestamp for every sequence in the batch.

However, before we do that let's have a look at a sample input and the output from our untrained model. This is so we can understand what the model is actually giving us.

In [27]:
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch) # ask our model for a prediciton on our first batch of training data
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)") # print out the output shape

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [28]:
# we can see that the prediction is an array of 64 array, one for each entry in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 7.52468314e-03  7.47854589e-04  3.03820800e-03 ...  4.69247065e-03
    2.25057499e-03 -1.71929516e-03]
  [ 3.83482873e-03  1.91756699e-03 -1.16433250e-03 ...  4.75279661e-03
   -6.09286479e-04  1.50424056e-03]
  [-2.46454380e-03 -1.70865585e-03  9.38076293e-04 ...  4.78767604e-03
   -7.91998580e-03  3.94922495e-03]
  ...
  [-4.95320652e-04 -2.10995926e-03  6.88036764e-03 ...  4.72197589e-03
    1.10435777e-03 -1.68420398e-03]
  [ 5.39307063e-03 -4.19754535e-03  3.63111077e-03 ...  5.97755192e-03
   -2.14624126e-03 -4.24108002e-03]
  [-1.26293593e-03 -7.78077543e-03  6.24626363e-03 ...  5.08348178e-03
   -7.26723671e-03 -5.92700962e-04]]

 [[ 2.14243727e-03 -2.79189786e-04  2.96757440e-03 ...  8.01330898e-05
    2.29353923e-03  8.90440890e-04]
  [ 2.27318960e-04  1.33099267e-04 -2.00211187e-04 ...  2.57162703e-03
    3.06119770e-03  1.68464391e-03]
  [-7.50330184e-03  1.02394726e-04  1.75802957e-03 ...  4.57954500e-03
    4.85904049e-03  1.19408139e-03]
  ...
  [-3.420

In [29]:
#  lets examine one prediction
pred = example_batch_predictions[0]
print(len(pred))
print(pred)
# notice this is a 2d array of length 100, where each interior is the prediction for the next

100
tf.Tensor(
[[ 0.00752468  0.00074785  0.00303821 ...  0.00469247  0.00225057
  -0.0017193 ]
 [ 0.00383483  0.00191757 -0.00116433 ...  0.0047528  -0.00060929
   0.00150424]
 [-0.00246454 -0.00170866  0.00093808 ...  0.00478768 -0.00791999
   0.00394922]
 ...
 [-0.00049532 -0.00210996  0.00688037 ...  0.00472198  0.00110436
  -0.0016842 ]
 [ 0.00539307 -0.00419755  0.00363111 ...  0.00597755 -0.00214624
  -0.00424108]
 [-0.00126294 -0.00778078  0.00624626 ...  0.00508348 -0.00726724
  -0.0005927 ]], shape=(100, 65), dtype=float32)


In [30]:
# and finally well look ata prediction at the first timestep
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
# and of course its 65 values representing the probabillity of each character occuring next

65
tf.Tensor(
[ 7.5246831e-03  7.4785459e-04  3.0382080e-03 -3.4172705e-03
 -5.7150521e-03 -2.4804785e-03 -5.2539934e-03  5.0792156e-04
 -8.5141417e-04 -1.1780384e-04 -6.6540819e-03  4.1186796e-03
  8.3103720e-03  2.1683308e-03 -3.5964467e-03  4.3471921e-03
  3.5418123e-03 -7.9033207e-03  1.8901996e-03  4.2487960e-03
 -8.4591418e-04 -1.8868948e-03 -1.0837242e-03 -2.7089042e-03
 -5.0206273e-03 -1.2774453e-03  8.9711370e-04 -9.2898821e-03
 -1.0478126e-03 -1.4666410e-03 -3.4905844e-03  4.8223003e-03
  1.5959326e-03 -1.9189032e-03  2.1222052e-03 -5.9363537e-04
 -5.4394943e-05  5.3489273e-03  4.7227899e-03 -1.8341374e-03
 -7.5931605e-03 -2.0364928e-03  7.1149152e-03 -9.2580519e-04
 -1.1237874e-04 -1.0232350e-03 -1.7354682e-03 -1.2888259e-03
  5.9911227e-03 -5.4257149e-03  5.7432945e-03  2.9295245e-03
 -2.1900458e-05  1.7815575e-03 -8.8213728e-04  6.0356851e-03
 -3.2999432e-03 -4.5606797e-03  4.4665062e-03 -1.2756911e-03
  2.8614053e-03  6.8711741e-03  4.6924707e-03  2.2505750e-03
 -1.719295

In [31]:
# If we want to determine the predicted character we need to sample the output distribution (pick a value based on probabilites)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# now we can reshpae that array and ocnvert all the integers to numbers to see the actual characters
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars # and this is what the model preidcted for training sequence 1

"A,g,pGhVyNijQ?RHYT.epfZPxku3xAeGf'R3fBLfNyQTn?'PD,-Fnipet,e\noA.HgoJ3mzlefKDCajo,$.oLOwEb?atB\nAKo\nHln"

So now we need to create our own loss function thaht can compare that output to the expected output and give us some numeric value representing how close the two were.

In [32]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

## Compiling the Model

At this point we can think of our problem as a classification problem where the model predicts the probability of each unique letter coming next.

In [33]:
model.compile(optimizer='adam', loss=loss)

## Creating Checkpoints

Now we are going to setup and configure our model to save checkpoints as it trains. This will allow us to load our model from a checkpoint and continue training it.

In [34]:
# Durectiry where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

## Training

Finally we will start training the model

In [35]:
history = model.fit(data, epochs=5, callbacks=[checkpoint_callback])

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Loading the Model

We'll rebuild the model from a checkpoint using a batch_size of 1 so that we can feed one piece of text to the model and have it make a prediction.

In [36]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

Once the model is finished training we can find the lastest checkpoint that stores the models weights using the follow lines.

In [37]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

We can load any checkpoint we want by specifying the exact file to load.

In [None]:
checkpoint_num = 10
model.load_weights(tf.train.load_checkpoint("./training_checkpoints/ckpt_" + str(checkpoint_num)))
model.build(tf.TensorShape([1, None]))

## Generating Text

In [39]:
def generate_text(model, start_string):
    # Evaluation steps (generating text using the learned model)
    
    # Number of characters to generate 
    num_generate = 800
    
    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    # Empty string to store our results 
    text_generated = []
    
    # Low temperatures results in more predictable text.
    # Higher temperatures results in moer suprising text.
    # Experiment to find the best setting.
    
    temperature = 1.0
    
    # Here batch size == 1
    model.reset_states()
    for i in range (num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0 )
        
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        
        # we pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        
        text_generated.append(idx2char[predicted_id])
    return (start_string + ''.join(text_generated))

In [40]:
inp = input("Type a starting string: ")
print(generate_text(model, inp))

gangue she had,
The nabling naturl: light in caniears
That if is lipht's kindless headerst hamph
And save the curmen shord unto the sturn and men.

WORWER:
The griWedest that you say it is.

LEONTES:
We should
gentle a thousands; my die eye wears
What I may need newled it, to do you;
If I absuge, and in myself consperitring
The deforus shall op's hourd after me,
And I have put wear so great thy much,
Encend five; but seem swe withal son cry accose
Pusit but the crows. Whose presently?

First Wellon:
What he is not thy heaven, death?

ROMEO:
O bear, granden some vows bet thee, comestur: though are galle's barigain
In show me did all mase.

CLARENCE:
With a villain!
Canst, I am some fime axaling-all break.

ROMEO:
Out formest than and false bearing;
To ston in requifuse night not come here,
bow 
