In [1]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only API)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf

In [3]:
import numpy as np
import os
import time

### Read the data

In [25]:
# Location of the text corpus on the mounted Google Drive
text_path = "/content/drive/MyDrive/Colab Notebooks/NLP/2/10-Text Generator(Attar's Poem)/Sample/naserkhosro.txt"

# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(text_path, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [26]:
# Report the corpus size: len() of the decoded string counts characters
num_chars = len(text)
print('Length of text: {} characters'.format(num_chars))

Length of text: 633429 characters


In [27]:
# Preview the beginning of the corpus (its first 250 characters)
preview = text[:250]
print(preview)

ای قبهٔ گردندهٔ بی روزن خضرا
با قامت فرتوتی و با قوت برنا
فرزند توایم ای فلک، ای مادر بدمهر
ای مادر ما چونکه همی کین کشی از ما؟
فرزند تو این تیره تن خامش خاکی است
پاکیزه خرد نیست نه این جوهر گویا
تن خانهٔ این گوهر والای شریف است
تو مادر این خانهٔ این


In [28]:
# Build the vocabulary: every distinct character of the text, in sorted order
unique_chars = set(text)
vocab = sorted(unique_chars)
print('{} unique characters'.format(len(vocab)))

50 unique characters


## Process the text

### Vectorize the text

Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [29]:
# Lookup tables: character -> index (dict) and index -> character (array)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole text as an array of character indices
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

Now we have an integer representation for each character. Notice that we mapped each character to an index from 0 to `len(vocab) - 1`.

In [30]:
# Pretty-print the first 20 entries of the character -> index table
print('{')
for char, idx in list(char2idx.items())[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), idx))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '(' :   3,
  ')' :   4,
  '.' :   5,
  ':' :   6,
  '«' :   7,
  '»' :   8,
  '،' :   9,
  '؛' :  10,
  '؟' :  11,
  'ء' :  12,
  'آ' :  13,
  'ؤ' :  14,
  'ئ' :  15,
  'ا' :  16,
  'ب' :  17,
  'ة' :  18,
  'ت' :  19,
  ...
}


In [31]:
# Show how the first `n_preview` characters of the text are mapped to integers.
# A single variable drives both slices so the description cannot drift from the code.
n_preview = 30
print('{} ---- characters mapped to int ---- > {}'.format(repr(text[:n_preview]), text_as_int[:n_preview]))

'ای قبهٔ گردندهٔ بی روزن خضرا\nب' ---- characters mapped to int ---- > [16 49  1 37 17 41 43  1 48 26 24 40 24 41 43  1 17 49  1 26 42 27 40  1
 23 31 26 16  0 17]


### Create training examples and targets

Next divide the text into example sequences. Each input sequence will contain `seq_length` characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So break the text into chunks of `seq_length+1`. For example, say `seq_length` is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".

To do this first use the `tf.data.Dataset.from_tensor_slices` function to convert the text vector into a stream of character indices.

In [32]:
# Number of characters in each input sequence. The text is cut into chunks of
# seq_length + 1 so that input and target can be offset by one character.
seq_length = 100

In [33]:
# Wrap the encoded text in a tf.data.Dataset that yields one character index per element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [34]:
# Decode and show the first five elements of the character stream
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

ا
ی
 
ق
ب


The `batch` method lets us easily convert these individual characters to sequences of the desired size.

In [35]:
# Group the character stream into chunks of seq_length + 1 indices;
# drop_remainder=True discards a final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [36]:
# Decode the first five chunks back to text to check the chunking
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))
    print("===")

'ای قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر م'
===
'ا چونکه همی کین کشی از ما؟\nفرزند تو این تیره تن خامش خاکی است\nپاکیزه خرد نیست نه این جوهر گویا\nتن خان'
===
'هٔ این گوهر والای شریف است\nتو مادر این خانهٔ این گوهر والا\nچون کار خود امروز در این خانه بسازم\nمفرد ب'
===
'روم، خانه سپارم به تو فردا\nزندان تو آمد پسرا این تن و، زندان\nزیبا نشود گرچه بپوشیش به دیبا\nدیبای سخن '
===
'پوش به جان بر، که تو را جان\nهرگز نشود ای پسر از دیبا زیبا\nاین بند نبینی که خداوند نهاده است\nبر ما که '
===


For each sequence, duplicate and shift it to form the input and target text by using the `map` method to apply a simple function to each batch:

In [37]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk, giving a dataset of (input, target) pairs
dataset = sequences.map(split_input_target)

Print the first example's input and target values:

In [38]:
# Decode the first (input, target) pair back to text
for input_example, target_example in dataset.take(1):
    input_str = ''.join(idx2char[input_example.numpy()])
    target_str = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_str))
    print('Target data:', repr(target_str))

Input data:  'ای قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر '
Target data: 'ی قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر م'


In [39]:
# Walk through the first five timesteps of the example, showing for each step
# the input character and the character the model is expected to predict.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    # Indexing idx2char returns a NumPy string scalar; under NumPy 2 its repr is
    # "np.str_('x')", so convert to a plain str first to display just 'x'.
    print("  input: {} ({:s})".format(input_idx, repr(str(idx2char[input_idx]))))
    print("  expected output: {} ({:s})".format(target_idx, repr(str(idx2char[target_idx]))))

Step    0
  input: 16 (np.str_('ا'))
  expected output: 49 (np.str_('ی'))
Step    1
  input: 49 (np.str_('ی'))
  expected output: 1 (np.str_(' '))
Step    2
  input: 1 (np.str_(' '))
  expected output: 37 (np.str_('ق'))
Step    3
  input: 37 (np.str_('ق'))
  expected output: 17 (np.str_('ب'))
Step    4
  input: 17 (np.str_('ب'))
  expected output: 41 (np.str_('ه'))


### Create training batches

We used `tf.data` to split the text into manageable sequences. But before feeding this data into the model, we need to pack it into batches. Note that the cell below only batches the sequences: they are kept in their original order and no shuffling is applied.

In [40]:
# Number of sequences per training batch
BATCH_SIZE = 32

# Build the batched dataset from `sequences` instead of re-batching `dataset`
# in place. Re-running this cell then always yields (BATCH_SIZE, seq_length)
# batches rather than nesting a second batch dimension on each run.
# drop_remainder=True discards a final batch with fewer than BATCH_SIZE sequences.
dataset = sequences.map(split_input_target).batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(32, 100), dtype=tf.int64, name=None), TensorSpec(shape=(32, 100), dtype=tf.int64, name=None))>

## Build The Model

Use `tf.keras.Sequential` to define the model. For this simple example three layers are used to define our model:

* `tf.keras.layers.Embedding`: The input layer. A trainable lookup table that will map the numbers of each character to a vector with `embedding_dim` dimensions;
* `tf.keras.layers.GRU`: A type of RNN with size `units=rnn_units` (You can also use an LSTM layer here.)
* `tf.keras.layers.Dense`: The output layer, with `vocab_size` outputs.



+ return_sequences: Whether to return the last output
    in the output sequence, or the full sequence.
    
+ return_state: Whether to return the last state
    in addition to the output.

In [41]:
# Length of the vocabulary in chars (number of distinct characters in the text)
vocab_size = len(vocab)

# The embedding dimension: size of the vector each character index is mapped to
embedding_dim = 256

# Number of RNN units (width of the GRU layer created in build_model)
rnn_units = 1024

In [42]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the Embedding
            input size and as the number of Dense output units.
        embedding_dim: Size of the vector each character index is embedded into.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape (the GRU is
            stateful); the sequence-length axis is left as None.

    Returns:
        A tf.keras.Model mapping (batch_size, seq_len) character indices to
        (batch_size, seq_len, vocab_size) outputs of the final Dense layer.
    """
    token_ids = tf.keras.Input(batch_shape=(batch_size, None))

    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    embedded = embedding(token_ids)

    # return_sequences=True keeps an output for every timestep, not just the last
    recurrent = tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True)
    hidden = recurrent(embedded)

    projection = tf.keras.layers.Dense(vocab_size)
    logits = projection(hidden)

    return tf.keras.Model(token_ids, logits)


In [43]:
# Instantiate the network from the vocabulary size and the chosen hyperparameters
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

## Try the model

Now run the model to see that it behaves as expected.

First check the shape of the output:

In [44]:
# Run a single batch through the untrained model and compare input/output shapes
for input_example_batch, target_example_batch in dataset.take(1):
    input_shape = input_example_batch.shape
    print(input_shape)
    example_batch_predictions = model.predict(input_example_batch)
    output_shape = example_batch_predictions.shape
    print(output_shape)

(32, 100)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
(32, 100, 50)


In the above example the sequence length of the input is `100` but the model can be run on inputs of any length:

In [45]:
# Print the layer-by-layer summary of the model
model.summary()

To get actual predictions from the model we need to sample from the output distribution to obtain character indices. This distribution is defined by the logits over the character vocabulary.

Note: It is important to _sample_ from this distribution as taking the _argmax_ of the distribution can easily get the model stuck in a loop.

Try it for the first example in the batch:

In [46]:
# Display the raw model output for the batch (one score per vocabulary entry at each timestep)
example_batch_predictions

array([[[ 7.8392401e-04, -1.4282959e-02, -5.0824303e-03, ...,
          2.0767143e-03, -1.3874118e-02, -3.2270160e-03],
        [ 6.9255042e-03, -2.4337061e-03,  2.6898317e-03, ...,
         -1.8207788e-03, -3.3074433e-03, -4.8244251e-03],
        [-5.5788299e-03,  2.0956341e-04, -3.7915658e-03, ...,
          3.4684665e-04, -3.6644940e-03, -5.7875793e-03],
        ...,
        [ 7.3813619e-03,  4.0533924e-03, -9.2794299e-03, ...,
          1.6204957e-02, -1.1179513e-02, -1.3463888e-03],
        [ 5.5944417e-03,  1.1395666e-02,  6.0529732e-03, ...,
          9.6706748e-03,  9.0573281e-03, -1.3590965e-02],
        [-6.4886170e-03,  1.0035698e-02, -2.0695808e-03, ...,
          8.4276004e-03,  1.5949975e-03, -1.2189129e-02]],

       [[ 7.8392401e-04, -1.4282959e-02, -5.0824303e-03, ...,
          2.0767143e-03, -1.3874118e-02, -3.2270160e-03],
        [-8.9413347e-03, -5.7290751e-03, -6.5885503e-03, ...,
          2.8229016e-03, -8.9362916e-03, -4.2078095e-03],
        [-1.3693246e-02, 

In [47]:
# Take the outputs for the first sequence in the batch and draw one sampled
# character index per timestep from them
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.random.categorical(first_example_logits, num_samples=1)
sampled_indices

<tf.Tensor: shape=(100, 1), dtype=int64, numpy=
array([[32],
       [43],
       [ 6],
       [14],
       [43],
       [ 8],
       [30],
       [13],
       [29],
       [45],
       [40],
       [18],
       [ 3],
       [34],
       [27],
       [46],
       [ 6],
       [45],
       [47],
       [23],
       [18],
       [34],
       [12],
       [23],
       [12],
       [28],
       [21],
       [13],
       [39],
       [ 5],
       [41],
       [ 8],
       [46],
       [ 9],
       [37],
       [21],
       [19],
       [45],
       [38],
       [ 2],
       [22],
       [36],
       [45],
       [25],
       [18],
       [48],
       [18],
       [17],
       [27],
       [31],
       [39],
       [ 0],
       [ 9],
       [38],
       [29],
       [ 6],
       [49],
       [27],
       [ 0],
       [11],
       [18],
       [35],
       [11],
       [37],
       [39],
       [29],
       [17],
       [44],
       [ 6],
       [13],
       [19],
       [46],
       [46],
   

This gives us, at each timestep, a prediction of the next character index:

In [48]:
# Flatten the (timesteps, 1) samples into a 1-D NumPy array of character indices.
# This cell rebinds `sampled_indices`, so a flat reshape is used instead of
# squeezing the last axis: it gives the same result on the first run and still
# works if the cell is re-run on the already-flattened array.
sampled_indices = tf.reshape(sampled_indices, [-1]).numpy()
sampled_indices

array([32, 43,  6, 14, 43,  8, 30, 13, 29, 45, 40, 18,  3, 34, 27, 46,  6,
       45, 47, 23, 18, 34, 12, 23, 12, 28, 21, 13, 39,  5, 41,  8, 46,  9,
       37, 21, 19, 45, 38,  2, 22, 36, 45, 25, 18, 48, 18, 17, 27, 31, 39,
        0,  9, 38, 29,  6, 49, 27,  0, 11, 18, 35, 11, 37, 39, 29, 17, 44,
        6, 13, 19, 46, 46, 31, 44, 13, 21,  0, 31,  6, 19, 34,  3,  8,  2,
        4, 30,  9, 31, 21, 24, 16, 16, 26, 46, 20, 28, 31, 28,  1])

Decode these to see the text predicted by this untrained model:

In [49]:
# Decode the first input sequence and the sampled next-character indices to text
input_text = "".join(idx2char[input_example_batch[0]])
predicted_text = "".join(idx2char[sampled_indices ])

print("Input: \n", repr(input_text))
print()
print("Next Char Predictions: \n", repr(predicted_text))

Input: 
 'ای قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر '

Next Char Predictions: 
 'طٔ:ؤٔ»صآشچنة(عزژ:چکخةعءخءسجآم.ه»ژ،قجتچل!حفچذةگةبزضم\n،لش:یز\n؟ةغ؟قمشبپ:آتژژضپآج\nض:تع(»!)ص،ضجداارژثسضس '
