In [2]:
import os

# TF_CPP_MIN_LOG_LEVEL must be in the environment *before* TensorFlow is
# imported; setting it afterwards did not silence the native INFO logs.
# Documented levels are 0 (all), 1 (hide INFO), 2 (hide INFO+WARNING) and
# 3 (hide INFO+WARNING+ERROR); the previous value '15' is outside that range.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses


In [3]:
# Load the corpus. Despite its name, `file_path` holds the DataFrame returned
# by read_csv, not a path.
file_path = pd.read_csv('shahname.csv', encoding='utf-8')
# Only the 'Text' column is used below; each row becomes one dataset element.
column_data = file_path['Text']

In [5]:
# One string element per CSV row. Missing rows are dropped first: astype(str)
# would otherwise turn NaN into the literal text "nan", which would then be
# learned as a vocabulary token.
raw_data_ds = tf.data.Dataset.from_tensor_slices(column_data.dropna().astype(str))

In [6]:
# Sanity check: show the first element of the dataset as readable text.
for raw_line in raw_data_ds.take(1):
    line_bytes = raw_line.numpy()
    print(line_bytes.decode('utf-8'))

به نام خداوند جان و خرد


In [8]:
def custom_standardization(input_data):
    """Normalize Persian/Arabic text before tokenization.

    Applies a fixed sequence of `tf.strings.regex_replace` substitutions:

    * Arabic letter variants are mapped to a single canonical letter.
    * Punctuation, the standalone hamza, NBSP and ZWNJ become a space.
    * Combining vowel / shadda / sukun marks are deleted. They sit *inside*
      words, so replacing them with a space (as this function used to do)
      split one word into two tokens.

    Args:
        input_data: A string tensor (scalar or batched).

    Returns:
        A string tensor with every substitution applied element-wise.
    """
    drop = ''  # delete the match without leaving a gap
    gap = ' '  # replace the match with a word separator

    # (pattern, rewrite) pairs, applied in order. None of the patterns contain
    # regex metacharacters, so each one matches literally.
    rules = (
        ('ي', 'ی'),
        ('\xa0', gap),
        ('\u200c', gap),
        ('آ', 'ا'),
        ('َ', drop),
        ('ُ', drop),
        ('ِ', drop),
        ('ة', 'ه'),
        ('هٔ', 'ه'),
        ('ك', 'ک'),
        ('؛', gap),
        ('ّ', drop),
        ('ْ', drop),
        ('،', gap),
        ('ء', gap),
        ('«', gap),
        ('»', gap),
        ('أ', 'ا'),
        ('ؤ', 'و'),
        ('؟', gap),
        ('!', gap),
        (':', gap),
        ('ئ', 'ی'),
    )

    text = input_data
    for pattern, rewrite in rules:
        text = tf.strings.regex_replace(text, pattern, rewrite)
    return text

In [9]:
# Apply the character-level normalization to every line. Note that this rebinds
# `raw_data_ds`, so from here on the name refers to the standardized text.
raw_data_ds = raw_data_ds.map(custom_standardization)

In [10]:
# Show the first two lines. The label says "Original", but `raw_data_ds` has
# already been passed through custom_standardization at this point.
for standardized in raw_data_ds.take(2):
    line = standardized.numpy().decode('utf-8')
    print("Original: ", line)

Original:  به نام خداوند جان و خرد
Original:  کز این برتر اندیشه بر نگذرد


In [11]:
# Upper bound on the vocabulary size passed to TextVectorization(max_tokens=...).
max_features = 20000
# NOTE: this value is overwritten with 25 in a later cell before it is ever used.
embedding_dim = 128
# Every line is padded/truncated to this many tokens by the vectorization layer.
sequence_length = 5

In [12]:
# Word-level tokenizer: maps each line to `sequence_length` integer token ids,
# keeping at most `max_features` vocabulary entries.
vectorize_layer = layers.TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=max_features,
)

# Learn the vocabulary from the standardized corpus, 1024 lines per batch.
vectorize_layer.adapt(raw_data_ds.batch(1024))

# The list index is the token id; it also contains the '' and '[UNK]' entries.
vocab = vectorize_layer.get_vocabulary()
print("Vocabulary size (# of distinct words): ", len(vocab))

Vocabulary size (# of distinct words):  16918


In [13]:
# Spot-check two token ids against the vocabulary fetched above.
for label, token_id in (("1287 ---> ", 1287), (" 313 ---> ", 313)):
    print(label, vocab[token_id])

1287 --->  خروشید
 313 --->  امدند


In [14]:
# First vocabulary entries; the list index is the token id.
vocab[:5]

['', '[UNK]', 'و', 'به', 'که']

In [15]:
def split_input_target(sequence):
    """Build a next-token training pair from one sequence.

    Returns a tuple `(inputs, targets)` where `inputs` is `sequence` without
    its last element and `targets` is `sequence` without its first element, so
    `targets[i]` is the item that follows `inputs[i]`.
    """
    return sequence[:-1], sequence[1:]

In [16]:
# Text line -> token ids -> (input, target) pair shifted by one token.
vectorized_sequences = raw_data_ds.map(
    lambda line: vectorize_layer(line)
)
dataset = vectorized_sequences.map(
    split_input_target
)

In [17]:
# Print the first (input, target) pair; batch(1) adds a leading axis that the
# [0] index removes again.
for input_example, target_example in dataset.batch(1).take(1):
    for label, batch in (("Input:", input_example), ("Target:", target_example)):
        print(label, batch.numpy()[0])


Input: [  3  79 366 115]
Target: [ 79 366 115   2]


In [18]:
# Number of sequences per training batch; also passed to build_model as the
# fixed batch dimension.
BATCH_SIZE = 64
# Presumably the tf.data shuffle buffer size (assumption from the name — confirm).
BUFFER_SIZE = 10000

In [19]:
# Shuffle, batch and prefetch the training pairs.
# - shuffle: BUFFER_SIZE was defined but never used, so every epoch saw the
#   lines in exactly the same order; shuffling gives each epoch a new order.
# - drop_remainder=True: the model is built with a fixed batch size, so a
#   smaller final batch has to be discarded.
# - tf.data.AUTOTUNE replaces the deprecated tf.data.experimental alias.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, None), dtype=tf.int64, name=None), TensorSpec(shape=(64, None), dtype=tf.int64, name=None))>

In [20]:
# Confirm the shape of one batch of inputs and targets.
for input_example, target_example in dataset.take(1):
    input_shape = input_example.shape
    target_shape = target_example.shape
    print("Input shape:", input_shape)
    print("Target shape:", target_shape)

Input shape: (64, 4)
Target shape: (64, 4)


In [21]:
# Number of token ids the model must embed and predict over (includes the ''
# and '[UNK]' entries of the vocabulary).
vocab_size = len(vocab)

# Width of the embedding vectors; replaces the 128 assigned in an earlier cell.
embedding_dim = 25

# Number of units in the GRU layer.
rnn_units = 1024

In [22]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> stateful GRU -> Dense language model.

  Args:
    vocab_size: Number of token ids; sets the embedding input size and the
      width of the output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of GRU units.
    batch_size: Fixed batch dimension baked into the input shape.

  Returns:
    An uncompiled `tf.keras.Sequential` that emits one logit vector of size
    `vocab_size` per time step.
  """
  token_embedding = layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  # No activation: the loss is configured to consume raw logits.
  logits_head = layers.Dense(vocab_size)
  return tf.keras.Sequential([token_embedding, recurrent, logits_head])

In [23]:
# Training model: the batch dimension is fixed to BATCH_SIZE.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [24]:
# Print layer output shapes and parameter counts of the training model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 25)            422950    
                                                                 
 gru (GRU)                   (64, None, 1024)          3228672   
                                                                 
 dense (Dense)               (64, None, 16918)         17340950  
                                                                 
Total params: 20992572 (80.08 MB)
Trainable params: 20992572 (80.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` because the model's final Dense layer has no softmax.
  """
  per_token_loss = losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True)
  return per_token_loss

In [26]:
# Adam optimizer with the logits-based cross-entropy defined in `loss`.
model.compile(optimizer='adam', loss=loss)

In [27]:
# Directory that receives the weight checkpoints written during training.
checkpoint_dir = './training_checkpoints'

# "{epoch}" is filled in by the callback, giving one checkpoint name per epoch.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [28]:
# Train for 30 epochs; checkpoint_callback writes weight checkpoints named by
# epoch number under checkpoint_dir.
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback])

Epoch 1/30


2024-06-08 08:21:42.637803: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-06-08 08:21:42.674111: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fbd018babc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-06-08 08:21:42.674137: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce MX330, Compute Capability 6.1
2024-06-08 08:21:42.678639: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-08 08:21:42.706551: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
I0000 00:00:1717849302.744840   15314 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [31]:
# Show the path prefix of the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_30'

In [32]:
# Rebuild the same architecture with a batch size of 1 for text generation.
# This rebinds `model`, replacing the training model built with BATCH_SIZE.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Fix the input shape to (1, any sequence length).
model.build(tf.TensorShape([1, None]))

In [33]:
# Print layer output shapes and parameter counts of the batch-size-1 model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 25)             422950    
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3228672   
                                                                 
 dense_1 (Dense)             (1, None, 16918)          17340950  
                                                                 
Total params: 20992572 (80.08 MB)
Trainable params: 20992572 (80.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    """Sample a word-by-word continuation of `start_string` from `model`.

    The prompt is split on whitespace and looked up in the module-level
    `vocab`. Prompt words that are not in the vocabulary are mapped to the
    '[UNK]' token instead of raising. The prompt is NOT normalized here, so it
    should already match the standardized form of the training text.

    Args:
        model: Stateful model built for batch size 1 that maps an integer
            tensor of shape (1, steps) to per-step logits over the vocabulary.
        start_string: Whitespace-separated prompt; must contain at least one
            word.
        num_generate: Number of words to sample after the prompt.
        temperature: Positive scaling factor for the logits. Values below 1
            make sampling more conservative, values above 1 more random.

    Returns:
        The prompt words followed by the generated words, joined by single
        spaces.

    Raises:
        ValueError: If the prompt contains no words or `temperature` is not
            positive.
    """
    prompt_words = start_string.split()
    if not prompt_words:
        raise ValueError("start_string must contain at least one word")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # One dict lookup per word instead of a linear list.index scan, with a
    # fallback id for out-of-vocabulary words.
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    unk_id = token_to_id.get('[UNK]', 1)
    input_eval = [token_to_id.get(word, unk_id) for word in prompt_words]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis: (1, steps, vocab) -> (steps, vocab).
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample from the distribution at the last time step only.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Feed only the sampled token back in; the GRU state carries the context.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(vocab[predicted_id])

    return ' '.join(prompt_words + text_generated)



In [42]:
# Generate a sample continuation from a three-word prompt and print it.
print(generate_text(model, start_string=u"به نام خدا"))

به نام خدا و ز گفتار او ماند اندر جهان را به خون ریختن زخم او کرد نرم باش ان لشکر رومی گزیده سوار و پیاده ببستند با درد بدخواه و ما را نیستی هرچ باید که پیدا شد ان موبدان بند صد اشتر ز درگاه برخاست و اسان شود زین سپس راه هرمزد چون باد و بر ما گشادست راه و هم دوزخ ازو خاک دریا همی رفت تا پیش گرد دامن گرد جوشان دم او بر زاد فرخ زاد هرمزد با ما به افسون دل و بیست ماهوی زین و مردی و بی بن به خون ریختن لشکر اندر فراز امدند
