In [None]:
import os
import re

def convert_persian_numbers(text):
    """Return ``text`` with Persian digits (۰-۹) replaced by ASCII digits (0-9).

    All other characters are left untouched.
    """
    digit_table = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')
    return text.translate(digit_table)

def normalize_punctuation(text):
    """Map Persian and typographic punctuation in ``text`` to ASCII forms.

    Guillemets and curly double quotes become '"', curly single quotes
    become "'", the Persian semicolon, comma and question mark become
    ';', ',' and '?', the zero-width non-joiner becomes a space, the
    ellipsis character becomes three dots, and the tatweel is removed.
    """
    punctuation_table = str.maketrans({
        '«': '"', '»': '"',
        '“': '"', '”': '"',
        '‘': "'", '’': "'",
        '؛': ';', '،': ',',
        '؟': '?', '\u200c': ' ',  # zero-width non-joiner -> space
        '…': '...', 'ـ': '',  # tatweel is dropped
    })
    return text.translate(punctuation_table)

def clean_text(text):
    """Clean ``text`` line by line and return the surviving lines joined by newlines.

    Each line is stripped, has its Persian digits and punctuation normalized
    (via ``convert_persian_numbers`` and ``normalize_punctuation``), loses
    every character outside ASCII and the Unicode range U+0600-U+06FF, and
    has whitespace runs collapsed to a single space. Lines that end up empty
    are dropped.
    """
    cleaned_lines = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        line = convert_persian_numbers(line)
        line = normalize_punctuation(line)
        # Keep only ASCII and characters in U+0600-U+06FF.
        line = re.sub(r'[^\x00-\x7F\u0600-\u06FF ]', '', line)
        # Collapse whitespace, then strip again: the normalization and
        # filtering steps above can leave leading/trailing spaces (or a line
        # made of a single space) that the first strip() could not see.
        line = re.sub(r'\s+', ' ', line).strip()
        if line:
            cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)

def clean_txt_files(input_paths, output_dir):
    """Clean every UTF-8 text file in ``input_paths`` and save it in ``output_dir``.

    ``output_dir`` is created when missing. Each cleaned file is written as
    ``cleaned_<original file name>`` and its path is printed once saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    for source_path in input_paths:
        with open(source_path, 'r', encoding='utf-8') as source_file:
            raw_text = source_file.read()

        cleaned_text = clean_text(raw_text)

        target_name = "cleaned_" + os.path.basename(source_path)
        output_path = os.path.join(output_dir, target_name)

        with open(output_path, 'w', encoding='utf-8') as target_file:
            target_file.write(cleaned_text)

        print(f"cleaned and saved: {output_path}")

In [None]:
# Raw corpus files to clean, one per source
input_files = [
    '/content/drive/MyDrive/TextGenerator/Dataset/persianPoet.txt',
    '/content/drive/MyDrive/TextGenerator/Dataset/naserkhosro.txt',
    '/content/drive/MyDrive/TextGenerator/Dataset/attar.txt',
]
clean_txt_files(
    input_files,
    output_dir='/content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts',
)

cleaned and saved: /content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_persianPoet.txt
cleaned and saved: /content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_naserkhosro.txt
cleaned and saved: /content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_attar.txt


In [None]:
def merge_cleaned_files(input_paths, output_path):
    """Concatenate the UTF-8 files in ``input_paths`` into ``output_path``.

    Each file's content is stripped of surrounding whitespace; non-empty
    contents are written in the given order, each followed by a blank line.
    The output path is printed when merging is done.
    """
    with open(output_path, 'w', encoding='utf-8') as merged_file:
        for source_path in input_paths:
            with open(source_path, 'r', encoding='utf-8') as source_file:
                content = source_file.read().strip()
                if not content:
                    continue
                merged_file.write(content + '\n\n')
    print(f"merged and saved: {output_path}")

In [None]:
# Cleaned per-source files, merged in this order
cleaned_files = [
    '/content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_persianPoet.txt',
    '/content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_naserkhosro.txt',
    '/content/drive/MyDrive/TextGenerator/Dataset/CleanedTexts/cleaned_attar.txt'
]

# Single corpus file that the training cells read
merged_output = '/content/drive/MyDrive/TextGenerator/Dataset/merged_poets.txt'
merge_cleaned_files(cleaned_files, merged_output)

merged and saved: /content/drive/MyDrive/TextGenerator/Dataset/merged_poets.txt


In [1]:
import tensorflow as tf
import numpy as np
import os
import time

In [2]:
# Read the merged corpus into a single string
with open("/content/drive/MyDrive/TextGenerator/Dataset/merged_poets.txt", encoding='utf-8') as f:
    text = f.read()

In [3]:
# The corpus length is its number of characters
print(f'Length of text: {len(text)} characters')

Length of text: 10206685 characters


In [4]:
# Preview the first 250 characters of the corpus
print(text[:250])

اول دفتر به نام ايزد دانا صانع پروردگار حی توانا
اکبر و اعظم خدای عالم و آدم صورت خوب آفريد و سيرت زيبا
از در بخشندگی و بنده نوازی مرغ هوا را نصيب و ماهی دريا
قسمت خود میخورند منعم و درويش روزی خود میبرند پشه و عنقا
حاجت موری به علم غيب بداند در بن چ


In [5]:
# Sorted list of the distinct characters that occur in the corpus
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

63 unique characters


In [6]:
# Display the sorted vocabulary
vocab

['\n',
 ' ',
 '!',
 '"',
 '(',
 ')',
 ',',
 '-',
 '.',
 ':',
 ';',
 '?',
 'ء',
 'آ',
 'أ',
 'ؤ',
 'إ',
 'ئ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ي',
 'ً',
 'ٌ',
 'ٍ',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ْ',
 'ٔ',
 'پ',
 'چ',
 'ژ',
 'ک',
 'گ',
 'ۀ',
 'ی']

In [7]:
# Map each vocabulary character to its position in the sorted vocabulary
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)
print('\n\n')
# Reverse lookup (index -> character) as a NumPy array
idx2char = np.array(vocab)
print(idx2char)
print('\n\n')
# Encode the whole corpus as an array of character indices
text_as_int = np.array([char2idx[ch] for ch in text])
print(text_as_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'ء': 12, 'آ': 13, 'أ': 14, 'ؤ': 15, 'إ': 16, 'ئ': 17, 'ا': 18, 'ب': 19, 'ة': 20, 'ت': 21, 'ث': 22, 'ج': 23, 'ح': 24, 'خ': 25, 'د': 26, 'ذ': 27, 'ر': 28, 'ز': 29, 'س': 30, 'ش': 31, 'ص': 32, 'ض': 33, 'ط': 34, 'ظ': 35, 'ع': 36, 'غ': 37, 'ف': 38, 'ق': 39, 'ك': 40, 'ل': 41, 'م': 42, 'ن': 43, 'ه': 44, 'و': 45, 'ي': 46, 'ً': 47, 'ٌ': 48, 'ٍ': 49, 'َ': 50, 'ُ': 51, 'ِ': 52, 'ّ': 53, 'ْ': 54, 'ٔ': 55, 'پ': 56, 'چ': 57, 'ژ': 58, 'ک': 59, 'گ': 60, 'ۀ': 61, 'ی': 62}



['\n' ' ' '!' '"' '(' ')' ',' '-' '.' ':' ';' '?' 'ء' 'آ' 'أ' 'ؤ' 'إ' 'ئ'
 'ا' 'ب' 'ة' 'ت' 'ث' 'ج' 'ح' 'خ' 'د' 'ذ' 'ر' 'ز' 'س' 'ش' 'ص' 'ض' 'ط' 'ظ'
 'ع' 'غ' 'ف' 'ق' 'ك' 'ل' 'م' 'ن' 'ه' 'و' 'ي' 'ً' 'ٌ' 'ٍ' 'َ' 'ُ' 'ِ' 'ّ'
 'ْ' 'ٔ' 'پ' 'چ' 'ژ' 'ک' 'گ' 'ۀ' 'ی']



[18 45 41 ... 43  0  0]


In [8]:
# Number of encoded characters (one index per character of the corpus)
len(text_as_int)

10206685

In [9]:
# Number of entries in the character -> index mapping
len(char2idx)

63

In [10]:
# Pretty-print the character -> index mapping, one entry per line.
# Iterate the mapping itself instead of zipping against a hard-coded count,
# so no entry is silently dropped if the vocabulary size changes.
print('{')
for char, index in char2idx.items():
    print('  {:4s}: {:3d},'.format(repr(char), index))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '(' :   4,
  ')' :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  ':' :   9,
  ';' :  10,
  '?' :  11,
  'ء' :  12,
  'آ' :  13,
  'أ' :  14,
  'ؤ' :  15,
  'إ' :  16,
  'ئ' :  17,
  'ا' :  18,
  'ب' :  19,
  'ة' :  20,
  'ت' :  21,
  'ث' :  22,
  'ج' :  23,
  'ح' :  24,
  'خ' :  25,
  'د' :  26,
  'ذ' :  27,
  'ر' :  28,
  'ز' :  29,
  'س' :  30,
  'ش' :  31,
  'ص' :  32,
  'ض' :  33,
  'ط' :  34,
  'ظ' :  35,
  'ع' :  36,
  'غ' :  37,
  'ف' :  38,
  'ق' :  39,
  'ك' :  40,
  'ل' :  41,
  'م' :  42,
  'ن' :  43,
  'ه' :  44,
  'و' :  45,
  'ي' :  46,
  'ً' :  47,
  'ٌ' :  48,
  'ٍ' :  49,
  'َ' :  50,
  'ُ' :  51,
  'ِ' :  52,
  'ّ' :  53,
  'ْ' :  54,
  'ٔ' :  55,
  'پ' :  56,
  'چ' :  57,
  'ژ' :  58,
  'ک' :  59,
  'گ' :  60,
  'ۀ' :  61,
  'ی' :  62,
  ...
}


In [11]:
# Show how the first 30 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:30]), text_as_int[:30]))

'اول دفتر به نام ايزد دانا صانع' ---- characters mapped to int ---- > [18 45 41  1 26 38 21 28  1 19 44  1 43 18 42  1 18 46 29 26  1 26 18 43
 18  1 32 18 43 36]


In [12]:
# Display the encoded corpus array
text_as_int

array([18, 45, 41, ..., 43,  0,  0])

In [13]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of non-overlapping chunks of seq_length + 1 characters that fit in the text
examples_per_epoch = len(text)//(seq_length+1)

In [14]:
# Build a dataset that yields the encoded corpus one character index at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [15]:
# Decode and print the first 10 character indices of the dataset
for char_id in char_dataset.take(10):
    print(idx2char[char_id.numpy()])

ا
و
ل
 
د
ف
ت
ر
 
ب


In [16]:
# Group the character indices into chunks of seq_length + 1;
# drop_remainder=True discards a final shorter chunk
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [17]:
# Decode and print the first 5 chunks
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))
    print("\n")

'اول دفتر به نام ايزد دانا صانع پروردگار حی توانا\nاکبر و اعظم خدای عالم و آدم صورت خوب آفريد و سيرت زي'


'با\nاز در بخشندگی و بنده نوازی مرغ هوا را نصيب و ماهی دريا\nقسمت خود میخورند منعم و درويش روزی خود میبر'


'ند پشه و عنقا\nحاجت موری به علم غيب بداند در بن چاهی به زير صخره صما\nجانور از نطفه میکند شکر از نی برگ'


'تر از چوب خشک و چشمه ز خارا\nشربت نوش آفريد از مگس نحل نخل تناور کند ز دانه خرما\nاز همگان بینياز و بر '


'همه مشفق از همه عالم نهان و بر همه پيدا\nپرتو نور سرادقات جلالش از عظمت ماورای فکرت دانا\nخود نه زبان د'




In [18]:
# Input/Target split
def split_input_target(chunk):
    """Split ``chunk`` into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

In [19]:
# Display the dataset's element spec
dataset

<_MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [20]:
# Decode the first (input, target) pair; the loop variables are reused by the next cell
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(idx2char[input_example.numpy()])
    decoded_target = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data:', repr(decoded_target))

Input data:  'اول دفتر به نام ايزد دانا صانع پروردگار حی توانا\nاکبر و اعظم خدای عالم و آدم صورت خوب آفريد و سيرت ز'
Target data: 'ول دفتر به نام ايزد دانا صانع پروردگار حی توانا\nاکبر و اعظم خدای عالم و آدم صورت خوب آفريد و سيرت زي'


In [21]:
# Show the first 5 time steps: the input index and the index expected as output
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 (np.str_('ا'))
  expected output: 45 (np.str_('و'))
Step    1
  input: 45 (np.str_('و'))
  expected output: 41 (np.str_('ل'))
Step    2
  input: 41 (np.str_('ل'))
  expected output: 1 (np.str_(' '))
Step    3
  input: 1 (np.str_(' '))
  expected output: 26 (np.str_('د'))
Step    4
  input: 26 (np.str_('د'))
  expected output: 38 (np.str_('ف'))


In [22]:
# Shuffle and batch
BATCH_SIZE = 64 # Each batch holds 64 (input, target) pairs of 100 characters each
BUFFER_SIZE = 10000 # Size of the buffer passed to Dataset.shuffle
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [23]:
from tensorflow.keras import layers

# Length of the vocabulary in chars (Embedding input size and Dense output width in build_model)
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units (units of the LSTM layer in build_model)
rnn_units = 1024

In [24]:
def build_model(vocab_size, embedding_dim, rnn_units):
    """Build the training model: Embedding -> LSTM -> Dense over the vocabulary.

    The input accepts sequences of any length, the LSTM returns an output for
    every time step, and the Dense layer emits ``vocab_size`` values per step
    with no activation.
    """
    layer_stack = [
        layers.Input(shape=(None,)),
        layers.Embedding(vocab_size, embedding_dim),
        layers.LSTM(rnn_units, return_sequences=True),
        layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(layer_stack)

# Instantiate the training model with the hyperparameters defined above
model = build_model(vocab_size, embedding_dim, rnn_units)

In [26]:
# Display the element spec of one batch
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [27]:
# Print the layer-by-layer summary of the training model
model.summary()

In [28]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer ``labels`` and raw ``logits``."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [29]:
# Train with the Adam optimizer and the logits-based loss defined above
model.compile(optimizer='adam', loss=loss)

In [30]:
import os
# Directory where training checkpoints are written; created if it does not exist
checkpoint_dir = '/content/drive/MyDrive/TextGenerator/TrainingCheckpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

In [31]:
# Weights file that is overwritten whenever the validation loss improves
checkpoint_path = os.path.join(checkpoint_dir, "best_model.weights.h5")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
)

In [32]:
# Stop after 3 epochs without a val_loss improvement and restore the best weights
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=3,
    verbose=1,
)

In [33]:
# Upper bound on training epochs passed to model.fit
EPOCHS = 30

In [34]:
# Hold out 10% of the batches for validation.
# `dataset` was built with Dataset.shuffle, which reshuffles on every
# iteration by default, so take()/skip() on it would draw a different
# "validation" subset each epoch that overlaps with the training batches.
# Split the unshuffled (input, target) examples first and shuffle only the
# training part, so the two sets are disjoint and the validation set is fixed.
total_batches = len(dataset)
val_size = int(0.1 * total_batches)
val_examples = val_size * BATCH_SIZE

examples = sequences.map(split_input_target)

# Validation: the first `val_examples` chunks of the corpus, never shuffled
val_dataset = examples.take(val_examples).batch(BATCH_SIZE, drop_remainder=True)
# Training: everything after the held-out chunks, reshuffled every epoch
train_dataset = (
    examples.skip(val_examples)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback, early_stopping_callback]
)

Epoch 1/30
[1m1422/1422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - loss: 2.2211
Epoch 1: val_loss improved from inf to 2.16851, saving model to /content/drive/MyDrive/TextGenerator/TrainingCheckpoints/best_model.weights.h5
[1m1422/1422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 67ms/step - loss: 2.2208 - val_loss: 2.1685
Epoch 2/30
[1m1422/1422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - loss: 1.7708
Epoch 2: val_loss improved from 2.16851 to 2.00915, saving model to /content/drive/MyDrive/TextGenerator/TrainingCheckpoints/best_model.weights.h5
[1m1422/1422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 74ms/step - loss: 1.7707 - val_loss: 2.0091
Epoch 3/30
[1m1422/1422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 1.6838
Epoch 3: val_loss improved from 2.00915 to 1.95129, saving model to /content/drive/MyDrive/TextGenerator/TrainingCheckpoints/best_model.weights.h5
[1m1422/1422[0m [32m

In [42]:
def build_inference_model(vocab_size, embedding_dim, rnn_units):
    """Build the generation model: same layer stack as training, but stateful.

    The input is fixed to a batch size of 1 with any sequence length, and the
    LSTM is created with ``stateful=True``.
    """
    layer_stack = [
        layers.Input(batch_shape=(1, None)),
        layers.Embedding(vocab_size, embedding_dim),
        layers.LSTM(rnn_units, return_sequences=True, stateful=True),
        layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(layer_stack)


In [43]:
# Build the batch-size-1 stateful model and load the weights saved at checkpoint_path
inference_model = build_inference_model(vocab_size, embedding_dim, rnn_units)
inference_model.load_weights(checkpoint_path)

In [46]:
# Reset the recurrent state of the stateful inference model. The training
# `model` has no stateful LSTM, so resetting it would not affect generation.
inference_model.layers[1].reset_states()

In [47]:
def generate_text(model, start_string, num_generate=500, temperature=1.0):
    """Generate text one character at a time, seeded with ``start_string``.

    Args:
        model: Model called as ``model(ids)`` on a batch of one index sequence.
            Assumed to return per-time-step logits over the vocabulary with a
            leading batch dimension of 1 -- confirm for models not built by
            ``build_inference_model``.
        start_string: Non-empty seed text; every character must be a key of
            the module-level ``char2idx``.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling.
            Lower values make the sampling more predictable, higher values
            more random.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    unknown_chars = sorted({ch for ch in start_string if ch not in char2idx})
    if unknown_chars:
        raise KeyError(
            f"start_string contains characters outside the vocabulary: {unknown_chars}"
        )

    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)
    text_generated = []

    # Clear any state left over from a previous call. Look for stateful
    # layers instead of assuming the recurrent layer sits at a fixed index.
    for layer in model.layers:
        if getattr(layer, 'stateful', False):
            layer.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Only the last time step predicts the next character, so sample
        # from that step alone.
        last_logits = predictions[:, -1, :] / temperature
        predicted_id = tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy()

        # Feed the sampled character back in; the stateful layer carries the
        # context of everything seen so far.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [54]:
# Sample 100 characters from the inference model with temperature 0.8
print(generate_text(inference_model, start_string="ای دریغا", num_generate=100, temperature=0.8))

ای دریغا در دو عالم بی نشان
جمله اندر بحر افتاده بزن
رفت مردی بانگ زد بر وی بسی
بارها گردید در زندان او
تیغ 
