In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf

In [3]:
import numpy as np
import os
import time

### Read the data

In [4]:
text_path = "/content/drive/MyDrive/Colab Notebooks/NLP/2/10-Text Generator(Attar's Poem)/Sample/naserkhosro.txt"
text = open(text_path, 'rb').read().decode(encoding='utf-8')

In [5]:
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 633429 characters


In [6]:
# Take a look at the first 250 characters in text
print(text[:250])

ای قبهٔ گردندهٔ بی روزن خضرا
با قامت فرتوتی و با قوت برنا
فرزند توایم ای فلک، ای مادر بدمهر
ای مادر ما چونکه همی کین کشی از ما؟
فرزند تو این تیره تن خامش خاکی است
پاکیزه خرد نیست نه این جوهر گویا
تن خانهٔ این گوهر والای شریف است
تو مادر این خانهٔ این


In [7]:
# The unique characters in the file
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

50 unique characters


## Process the text

### Vectorize the text

Before training, we need to map strings to a numerical representation. Create two lookup tables: one mapping characters to numbers, and another for numbers to characters.

In [8]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

Now we have an integer representation for each character. Notice that we mapped the character as indexes from 0 to `len(unique)`.

In [9]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '(' :   3,
  ')' :   4,
  '.' :   5,
  ':' :   6,
  '«' :   7,
  '»' :   8,
  '،' :   9,
  '؛' :  10,
  '؟' :  11,
  'ء' :  12,
  'آ' :  13,
  'ؤ' :  14,
  'ئ' :  15,
  'ا' :  16,
  'ب' :  17,
  'ة' :  18,
  'ت' :  19,
  ...
}


In [10]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:30]), text_as_int[:30]))

'ای قبهٔ گردندهٔ بی روزن خضرا\nب' ---- characters mapped to int ---- > [16 49  1 37 17 41 43  1 48 26 24 40 24 41 43  1 17 49  1 26 42 27 40  1
 23 31 26 16  0 17]


### Create training examples and targets

Next divide the text into example sequences. Each input sequence will contain `seq_length` characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one character to the right.

So break the text into chunks of `seq_length+1`. For example, say `seq_length` is 4 and our text is "Hello". The input sequence would be "Hell", and the target sequence "ello".

To do this first use the `tf.data.Dataset.from_tensor_slices` function to convert the text vector into a stream of character indices.

In [11]:
# The maximum length sentence we want for a single input in characters
seq_length = 100

In [12]:
# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [13]:
for i in char_dataset.take(10):
    print(i.numpy(), idx2char[i.numpy()])

16 ا
49 ی
1  
37 ق
17 ب
41 ه
43 ٔ
1  
48 گ
26 ر


### Sliding Window for Sequence Generation

Instead of using simple batching, we use the **sliding window** method to generate overlapping sequences of length `seq_length + 1`.  
This ensures that every character in the text contributes to multiple training samples, preserving the continuity of the text.  
Compared to fixed-size batching, this approach creates **richer and more context-aware training data**, helping the model better capture character-level dependencies in the text.


In [14]:
# Sliding window: ساخت پنجره‌های طول ثابت با گام 1
sequences = char_dataset.window(size=seq_length + 1, shift=1, drop_remainder=True)

In [15]:
# هر پنجره را به یک تانسور تبدیل می‌کنیم تا بتوان پردازشش کرد
sequences = sequences.flat_map(lambda window: window.batch(seq_length + 1))

For each sequence, duplicate and shift it to form the input and target text by using the `map` method to apply a simple function to each batch:

In [16]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

Print the first examples input and target values:

In [17]:
# بررسی ۳ نمونه اولیه
for input_example, target_example in dataset.take(3):
    print('Input: ', repr(''.join(idx2char[input_example.numpy()])))
    print('Target:', repr(''.join(idx2char[target_example.numpy()])))
    print('-' * 30)

Input:  'ای قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر '
Target: 'ی قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر م'
------------------------------
Input:  'ی قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر م'
Target: ' قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر ما'
------------------------------
Input:  ' قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر ما'
Target: 'قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر ما '
------------------------------


### Create Training Batches

After generating overlapping input-target pairs using the **sliding window**, we now prepare the data for efficient training.

Using `tf.data.Dataset.batch`, we group these pairs into batches of size `BATCH_SIZE`.  
This batching ensures optimized parallel processing on GPUs and uniform batch sizes by setting `drop_remainder=True`.  
The final `dataset` contains batches of sequences, each ready to be fed into the model during training.


In [18]:
BATCH_SIZE = 32
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(32, None), dtype=tf.int64, name=None), TensorSpec(shape=(32, None), dtype=tf.int64, name=None))>

## Build the Model

We define the model using `tf.keras.Sequential`, composed of three main layers:

- **Embedding Layer**: Maps character indices into dense vectors of size `embedding_dim`. This layer is trainable and learns character-level embeddings.
- **GRU Layer**: A type of recurrent neural network that captures sequence dependencies. We use `return_sequences=True` so the model outputs a prediction at each time step. The `stateful=True` option helps the model retain memory across batches for better long-term dependency learning.
- **Dense Layer**: Outputs logits for each character in the vocabulary, one per time step.

The model is built using the specified `vocab_size`, `embedding_dim`, and `rnn_units`.

### Flexible Model Builder with GRU or LSTM

We enhance the model-building function to support both GRU and LSTM layers, selectable via a `rnn_type` argument.

- `GRU`: Efficient and faster to train
- `LSTM`: Better for capturing long-term dependencies

Dropout is added to reduce overfitting. Layer normalization is included after the RNN layer for better training stability.
This setup allows easy experimentation and comparison between different RNN architectures.


In [19]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [43]:
def build_lstm_model(vocab_size, embedding_dim, rnn_units, batch_size, dropout_rate=0.2):
    inputs = tf.keras.Input(batch_shape=(batch_size, None), dtype=tf.int32)

    x = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)

    x = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        dropout=dropout_rate,
        recurrent_initializer='glorot_uniform'
    )(x)

    x = tf.keras.layers.LayerNormalization()(x)
    outputs = tf.keras.layers.Dense(vocab_size)(x)

    model = tf.keras.Model(inputs, outputs)
    return model


In [44]:
model_lstm = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    rnn_type='LSTM'
)

### Try the Model

Before training, we can run the model on a sample input batch to verify its behavior.

- The model outputs a tensor of shape `(batch_size, sequence_length, vocab_size)`.
- To simulate actual text generation, we sample from the output distribution at each time step using `tf.random.categorical`, instead of taking `argmax`, to avoid repetitive predictions.
- Finally, we decode the predicted character indices to see the raw output of the untrained model.


In [45]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model_gru.predict(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
(32, 100, 50) # (batch_size, sequence_length, vocab_size)


In [46]:
example_batch_predictions

array([[[ -0.5249215 ,   2.3918264 ,  -7.1202517 , ...,  -1.3167013 ,
          -1.5620731 ,   1.0766512 ],
        [  0.92533475,   4.5751505 ,  -2.1935277 , ...,  -2.408623  ,
          -1.4269058 ,  -1.8004861 ],
        [ -3.5202813 ,  -2.406284  ,  -3.0853682 , ...,   2.2221081 ,
           1.5361933 ,  -0.11064744],
        ...,
        [ -0.3903244 ,   1.8764384 ,  -7.3837485 , ...,  -1.6681508 ,
           1.0220612 ,  -0.39580935],
        [  0.8346012 ,   4.417794  ,  -7.175815  , ...,  -3.0666664 ,
          -1.3717612 ,   0.90533733],
        [ -3.5669644 ,  -2.6329465 ,  -5.0795445 , ...,   2.2521503 ,
           1.518972  ,   0.1873501 ]],

       [[  0.87647027,   3.5626388 ,  -5.010244  , ...,   3.0610664 ,
          -1.7341214 ,  -2.6249466 ],
        [ -4.1322823 ,  -2.9421353 ,  -4.549287  , ...,   2.796818  ,
           1.4297993 ,  -0.24430805],
        [ -5.850661  ,  -2.8289795 ,  -7.173169  , ...,  -1.6420285 ,
           0.01687189,   2.2343833 ],
        ...,


In [47]:
model_lstm.summary()

In [48]:
sampled_indices = tf.random.categorical(
    logits=example_batch_predictions[0],  # shape = (sequence_length, vocab_size)
    num_samples=1
)

In [49]:
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [50]:
sampled_indices

array([ 1, 39, 39, 16, 49, 43,  1, 26, 36, 36,  0,  1,  0,  0,  0, 19, 34,
        0, 38, 42, 40,  1,  1, 16, 41, 38,  1, 40, 16, 24, 32, 24, 30, 47,
       41,  1, 39, 34,  1, 42, 40,  1,  1, 17,  1, 24, 49, 29, 28, 36, 38,
       41, 30, 48,  1, 16,  0, 39, 33, 28, 40, 24,  0, 16, 41,  0, 40, 24,
       16, 16, 28, 40, 38, 26, 16,  0,  1, 40, 27, 40, 39, 40, 40, 26,  1,
       48, 30,  1,  0, 43,  9, 49, 48,  1, 45, 26, 24, 26,  1, 23])

In [51]:
input_text = ''.join([idx2char[i] for i in input_example_batch[0].numpy()])

In [52]:
predicted_text = ''.join([idx2char[i] for i in sampled_indices])

In [53]:
print("Input: \n", repr(input_text))
print()
print("Next Char Predictions: \n", repr(predicted_text))

Input: 
 'ای قبهٔ گردندهٔ بی روزن خضرا\nبا قامت فرتوتی و با قوت برنا\nفرزند توایم ای فلک، ای مادر بدمهر\nای مادر '

Next Char Predictions: 
 ' ممایٔ رفف\n \n\n\nتع\nلون  اهل نادطدصکه مع ون  ب دیشسفلهصگ ا\nمظسند\nاه\nندااسنلرا\n نزنمننر گص \nٔ،یگ چردر خ'


## Train the model

### Loss Function and Model Compilation

We define a custom loss function using `sparse_categorical_crossentropy`, suitable for multi-class classification tasks with integer labels.  
Since the model outputs raw logits (not softmax probabilities), we set `from_logits=True`.

We then compile the model using the Adam optimizer and our loss function to prepare it for training.


In [54]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [55]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)

In [56]:
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Example batch loss (mean):", tf.reduce_mean(example_batch_loss).numpy())

Prediction shape:  (32, 100, 50)  # (batch_size, sequence_length, vocab_size)
Example batch loss (mean): 2.1003053


In [57]:
# LSTM
model_lstm.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=loss,
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

### Configuring Checkpoints

Each model (GRU and LSTM) is trained separately and stores its weights in a dedicated checkpoint directory.  
We use `ModelCheckpoint` with `save_weights_only=True` to save only the model parameters after each epoch.  
This setup allows independent training, evaluation, and recovery for each architecture.


In [58]:
# مسیر ذخیره برای مدل LSTM
checkpoint_dir_lstm = r"/content/drive/MyDrive/Colab Notebooks/NLP/2/10-Text Generator(Attar's Poem)/Sample/checkpoint/training_checkpoints_lstm"
checkpoint_prefix_lstm = os.path.join(checkpoint_dir_lstm, "ckpt_{epoch}.weights.h5")

checkpoint_callback_lstm = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix_lstm,
    save_weights_only=True
)

### Smart Training Wrapper

We define a `train_model` function that automatically checks whether the dataset is repeated.  
If repeated, it computes and sets `steps_per_epoch` accordingly.  
This makes the training process more robust and avoids infinite training loops.


In [61]:
def train_model(model, dataset, checkpoint_callback,
                text_as_int_len, seq_length, batch_size,
                epochs=10):

    steps_per_epoch = (text_as_int_len - (seq_length + 1)) // batch_size
    print(f"✅ Computed steps_per_epoch = {steps_per_epoch}")

    history = model.fit(
        dataset,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[checkpoint_callback]
    )

    return history

In [63]:
EPOCHS = 10

history_lstm = train_model(
    model=model_lstm,
    dataset=dataset,
    checkpoint_callback=checkpoint_callback_lstm,
    text_as_int_len=len(text_as_int),
    seq_length=seq_length,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS

)

✅ Computed steps_per_epoch = 19791
Epoch 1/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m996s[0m 50ms/step - loss: 2.0306 - sparse_categorical_accuracy: 0.4199
Epoch 2/10




[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12us/step - loss: 0.0000e+00 - sparse_categorical_accuracy: 0.0000e+00
Epoch 3/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1002s[0m 51ms/step - loss: 1.7884 - sparse_categorical_accuracy: 0.4799
Epoch 4/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11us/step - loss: 0.0000e+00 - sparse_categorical_accuracy: 0.0000e+00
Epoch 5/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1040s[0m 51ms/step - loss: 1.7424 - sparse_categorical_accuracy: 0.4919
Epoch 6/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12us/step - loss: 0.0000e+00 - sparse_categorical_accuracy: 0.0000e+00
Epoch 7/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m998s[0m 50ms/step - loss: 1.7132 - sparse_categorical_accuracy: 0.4984
Epoch 8/10
[1m19791/19791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13us/step - loss: 0.0000e+00 - spa

In [67]:
def generate_text_from_checkpoint(checkpoint_path, start_string, build_fn, char2idx, idx2char,
                                  temperature=1.0, num_generate=1000):
    """
    Generate text using a trained model checkpoint.

    Parameters:
    - checkpoint_path: str
        Path to the saved `.weights.h5` file.
    - start_string: str
        Initial seed string to start text generation.
    - build_fn: function
        A function to build the model (e.g., build_lstm_model) with batch_size=1.
    - char2idx: dict
        Dictionary mapping characters to integer indices.
    - idx2char: np.array or list
        List or array mapping indices back to characters.
    - temperature: float
        Sampling randomness. Lower = more conservative, higher = more diverse.
    - num_generate: int
        Total number of characters to generate after the seed string.

    Returns:
    - str
        Full generated text string starting from `start_string`.
    """
    model = build_fn(
        vocab_size=len(idx2char),
        embedding_dim=256,
        rnn_units=1024,
        batch_size=1
    )
    model.load_weights(checkpoint_path)
    model.build(tf.TensorShape([1, None]))

    # Find the LSTM layer to reset its state
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.LSTM):
            layer.reset_states()
            break  # Only reset the first LSTM (common case)

    # Convert seed string to input tensor
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [68]:
import os

# لیست همه فایل‌ها در پوشه‌ی checkpoint
checkpoint_files = sorted([
    f for f in os.listdir(checkpoint_dir_lstm)
    if f.endswith(".weights.h5")
])

# گرفتن جدیدترین فایل
latest_checkpoint = os.path.join(checkpoint_dir_lstm, checkpoint_files[-1])
print(f"🧠 Latest checkpoint: {latest_checkpoint}")


🧠 Latest checkpoint: /content/drive/MyDrive/Colab Notebooks/NLP/2/10-Text Generator(Attar's Poem)/Sample/checkpoint/training_checkpoints_lstm/ckpt_9.weights.h5


In [69]:
generated_text = generate_text_from_checkpoint(
    checkpoint_path=latest_checkpoint,
    start_string="ای دل: ",
    build_fn=build_lstm_model,
    char2idx=char2idx,
    idx2char=idx2char,
    temperature=0.8,
    num_generate=1000
)

print(generated_text)

ای دل: دوروی
اگر بند اامسی که مقرون کجائی
چه طمع خورشید دهم خواعد جان است
تو رف جهل همه جمله ای امروز
از بهر صومرا در
نتوانی امام ایمن
تو امروز بی طمع تا رفت در این ام
مرا یار فرمانده سخن تاج اند
تا گوئی که قران روان همی ایمن
ز بهر مردم نفس در آر او را
هر احسان چو تو امامه ای مرا مادر
ای
از گهر آمد
از من فزون از اول همچون خوی؟
دشمن در این اندر سوی آن درون دون کاج تو
مرد عاجز در این افگند جوان
سبک در جهان داد بر این جهان مرد
ای عامه مادر دیم از بلب او اند
شاه گمتا بر جان تو بد افتخار و بهمان
اگر در این مکه اندر این سخن باغبان درو بی فنی ثی
از محل الواسوی امروز
فرزند این جهان مهمان چون تو ک
به آهخته ای چرخ چنان تا پشتی
معدهٔ مقرون ای بدمخواه و جان است
این سپذر
تو دریا کنی عهد جسمی و تن
اسپ ما بر خرد در دعام
در تو ماور در بمان شد شد کان
همچون ز کلام او دواقوال اندر فائی
ای درد
معصفرمان از دل تو فربه در جمالی
ای حجت
اند
هرگز از زاعت ماه تا همه
ای مادر ا تو بی نظیر
به دین مده او سرت سؤال ظوی
ورنه در دل اهل صف تا مر در زمین م
ایزد
بسیار یابگان تو جهان با مؤذن افگند
درویش و آب سؤالت اعدام
ای 