<a href="https://colab.research.google.com/github/ojuba-org/arabic-ml-data/blob/master/arabic_poems_tf1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tensorflow documentation

* [v2](https://www.tensorflow.org/tutorials/text/text_generation)
* [v1](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/sequences/text_generation.ipynb)

In [1]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
import sys
import os
import time

In [3]:
import re
import unicodedata

In [4]:
import numpy as np

In [5]:
import tensorflow as tf

In [6]:
tf.enable_eager_execution()

In [7]:
#import tensorflow.compat.v1 as tf

In [8]:
print("tf version = {} and py version = {}".format(tf.__version__, sys.version))

tf version = 1.15.2 and py version = 3.7.10 (default, Feb 20 2021, 21:17:23) 
[GCC 7.5.0]


In [9]:
assert sys.version_info.major == 3, 'please use python 3'
assert tf.test.gpu_device_name()!='', 'no GPU, please enable GPU'

In [10]:
#from tensorflow.keras.layers.experimental import preprocessing


In [11]:
! curl -sSLO https://github.com/ojuba-org/arabic-ml-data/archive/refs/heads/master.zip

In [16]:
! unzip -q master.zip "arabic-ml-data-master/corpora/poems/*/*.txt"

In [17]:
! cat arabic-ml-data-master/corpora/poems/*/*.txt > poems.txt; wc -l poems.txt

273190 poems.txt


In [18]:
fn = 'poems.txt'
with open(fn, 'r') as f:
  text = f.read()

In [19]:
en_re = re.compile('^[a-zA-Z].*$', re.M)
dline_regex = re.compile('^\d+$', re.M)
spaces_regex = re.compile('[ \t]+', re.M)
dots_regex = re.compile('\.{2,}', re.M)
leading_digits_regex = re.compile('^ *[0-9]+', re.M)
special_regex = re.compile('[-–_"\(\)\[\]\<\>\*\+/\\:,،=«»“”|\u2019\u200d\u200f\u202c\u202e\u25a1\ufd3e\ufd3f]+', re.M)


In [20]:
def clean_txt(body):
    body = unicodedata.normalize('NFKC', body)
    body = body.replace('ـ', '').replace('?', '؟').replace(';', '؛').replace(',', '،').replace('\\', ' ')
    body = special_regex.sub(' ', body)
    body = leading_digits_regex.sub('', body)
    body = dots_regex.sub('…', body).replace('.', ' ')
    body = dline_regex.sub('', body)
    body = spaces_regex.sub(' ', body)
    return body.strip()


In [21]:
text = clean_txt(en_re.sub('', text))


In [22]:
text[:100]

'الناس ثلاث امواتٍ\n\nفي اوطاني\n\nوالميت معناه قتيل\n\nقسم يقتله اصحاب الفيل\n\nوالثاني تقتله اسرائيل\n\nوالثا'

In [23]:
vocab = sorted(set(text))

In [24]:
len(vocab)

64

In [25]:
vocab

['\n',
 ' ',
 '!',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '؛',
 '؟',
 'ء',
 'آ',
 'أ',
 'ؤ',
 'إ',
 'ئ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ى',
 'ي',
 'ً',
 'ٌ',
 'ٍ',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ْ',
 'ٱ',
 'پ',
 'چ',
 'ڤ',
 '…']

In [26]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [27]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//seq_length

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

ا
ل
ن
ا
س


In [28]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))


'الناس ثلاث امواتٍ\n\nفي اوطاني\n\nوالميت معناه قتيل\n\nقسم يقتله اصحاب الفيل\n\nوالثاني تقتله اسرائيل\n\nوالثال'
'ث تقتله عربائيل\n\nوعربائيل بلادي\n\nتمتد من الكعبه حتى النيل\n\nوالله اشتقنا\n\nوالله اشتقنا\n\nللموت بلا تنكي'
'ل\n\nوالله اشتقنا\n\nثم اشتقنا واشتقنا\n\nانقذنا يا عزرائيلحالةُ البَحْرِ زَبَدْ \n\nحالةُ البَرِّ نَكَدْ \n\nح'
'الةُ الجَوِّ رَمَدْ \n\nحالةُ الحالِ احتلالٌ\n\nحالةُ الحَلِّ عُقَدْ\n\nطُولُها ألفُ أبَدْ \n\nحالةُ العِزَّة'
'ِ جَزْرٌ\n\nحالةُ الذُُلَّةِ مَدْ \n\nوَفَياتُ اليَومِ \n\nلا قَلَّ ولا زادَ العَدَدْ\n\nنَفْسُ مَن كانوا مسا'


In [None]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)



In [None]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)



In [None]:
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'الناس ثلاث امواتٍ\n\nفي اوطاني\n\nوالميت معناه قتيل\n\nقسم يقتله اصحاب الفيل\n\nوالثاني تقتله اسرائيل\n\nوالثا'
Target data: 'لناس ثلاث امواتٍ\n\nفي اوطاني\n\nوالميت معناه قتيل\n\nقسم يقتله اصحاب الفيل\n\nوالثاني تقتله اسرائيل\n\nوالثال'


In [None]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 21 ('ا')
  expected output: 44 ('ل')
Step    1
  input: 44 ('ل')
  expected output: 46 ('ن')
Step    2
  input: 46 ('ن')
  expected output: 21 ('ا')
Step    3
  input: 21 ('ا')
  expected output: 33 ('س')
Step    4
  input: 33 ('س')
  expected output: 1 (' ')


In [None]:
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [None]:
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 512

In [None]:
if tf.test.is_gpu_available():
  rnn = tf.keras.layers.CuDNNGRU
else:
  import functools
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    rnn(rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model


In [None]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)


In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 64) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16384     
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (64, None, 512)           1182720   
_________________________________________________________________
dense (Dense)                (64, None, 64)            32832     
Total params: 1,231,936
Trainable params: 1,231,936
Non-trainable params: 0
_________________________________________________________________


In [None]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()


In [None]:
sampled_indices

array([40, 57, 26, 45, 29, 40, 13, 31,  7, 20, 63, 50,  6, 36, 45, 25, 43,
        5, 60, 13,  2,  0, 58,  9, 57, 53, 30, 48, 42, 15, 39, 24, 46, 42,
       17, 57, 62, 49, 22,  0, 37, 14, 21,  9, 43, 40, 46, 50, 34, 20, 11,
       50, 45, 27, 57, 28, 27, 36,  2, 16, 37, 60,  4, 44, 34, 36, 42,  3,
       36, 56,  9, 21, 57, 32,  4, 13, 53, 38,  1, 47,  7,  7, 10,  3, 62,
       18, 50, 18, 26, 61, 47, 46, 47, 33, 12, 38, 46, 14,  5, 38])

In [None]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))


Input: 
 'ُمُّكَ النَفسُ قَديماً أَكرَمَت\n\nوَأَبوكَ الفَضلُ خَيرُ المُنجِبين\n\nنَسَبُ البَدرِ أَوِ الشَمسِ إِذا'

Next Char Predictions: 
 'غّجمدغ؛ر4ئ…ي3ضمثك2پ؛!\nْ6ٍّذوقءعتنقأّڤىب\nط؟ا6كغنيشئ8يمحّخحض!آطپ1لشضق0ضِ6اّز1؛ٍظ ه4470ڤؤيؤجچهنهس9ظن؟2ظ'


In [None]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (64, 100, 64)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.1591134


In [None]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)


In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [None]:
db=dataset.repeat()

In [None]:
! rm training_checkpoints/*

In [None]:
EPOCHS=20


In [None]:
history = model.fit(db, epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
prefix=tf.train.latest_checkpoint(checkpoint_dir)
print(prefix)


./training_checkpoints/ckpt_20


In [None]:
! tar -czvf trained.tgz ./training_checkpoints/checkpoint "$prefix"*

./training_checkpoints/checkpoint
./training_checkpoints/ckpt_20.data-00000-of-00002
./training_checkpoints/ckpt_20.data-00001-of-00002
./training_checkpoints/ckpt_20.index


In [None]:
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model2.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model2.build(tf.TensorShape([1, None]))

In [None]:
model2.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (1, None, 256)            16384     
_________________________________________________________________
cu_dnngru_6 (CuDNNGRU)       (1, None, 512)            1182720   
_________________________________________________________________
dense_6 (Dense)              (1, None, 64)             32832     
Total params: 1,231,936
Trainable params: 1,231,936
Non-trainable params: 0
_________________________________________________________________


In [None]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)
  print(input_eval)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a multinomial distribution to predict the word returned by the model
      predictions = predictions / temperature
      predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))


In [None]:
print(generate_text(model2, start_string=u"سألتُ"))

tf.Tensor([[33 17 44 24 55]], shape=(1, 5), dtype=int32)
سألتُ هاويتي 

هل ثمَّيتُنَا 

كَم نَفَت صَفَحاتٍ لِلمُمَثِّليلَ بِضَرزامِكَ الغُرِّ أَرفى ال

حُرَّةِ العُصفودَ وَإِن

تَبكي بِها تِلكَ الكُؤوسُ بِهِ

فَهيَ ذِمَرٌ عَلى أَمرِهِ

حِمامُ الفيلِ مِن بُردَةٍ

يَمضِ ما في كادِ طِفلالِ

كُنتُ أَدعو رُواةُ المَنظَرُ

فَيا لَيتَ شِعرِيَ وَالضُحى

يَتيمَةً بَيتَنا بِالمَشيبِ

خَليفَةَ الحَقِّ البُراقِ وَسُجّينا

يَبكي الرِجالَ وَقَد تَنودَ بِقائِما

وَالعُذرُ يا نَبَّلَت حَتَماما

يا لَيتَ شَرَكاً وَالحالِدِ الوَقَر

سَهِرَ الحَربُ حَولَ الجِكَلُ الحَياةُ

وَتَضحى لِسانَها أَنّي في وَطَن

أُم أَبَى لِستطاح الخَيرِ

وَباتنا حَمؤولاً بِالكَذُب

وَأَجَّ عَلى الزُهرِ موتَما اِتجادَ أَو

كَفى بِشَرقِ رِداءِ ما خَفٍ

فيهِ مِن تِركانِها وَخَذَلتُكَب

اِذَّذَّهُ لِلذُدَر

هُم أَوجَهُ العَهد وَالبِلادُ

مَناحِها أَلقاكَ أَسعَدُها السَفينا

فُجِئتَ بِها حَسوناً مَشى… موقظه من الكرمل

هو منه تعطل فيه إذا يطفيهِ مسترجلاتُ

من صَنَحْ ورسالة

فسِر بعد أوسل وحشية نمرٍ؟

إنَّ مناهنتي

أنا المخلَّفُ ليست