<a href="https://colab.research.google.com/github/ojuba-org/arabic-ml-data/blob/master/tashkeel_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tensorflow documentation

* [v2](https://www.tensorflow.org/tutorials/text/text_generation)
* [v1](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/sequences/text_generation.ipynb)

In [1]:
%tensorflow_version 2.x

In [2]:
import sys
import os
import time
import random

In [3]:
import re
import unicodedata

In [4]:
import numpy as np

In [5]:
import tensorflow as tf

In [77]:
K = tf.keras
KL = K.layers

In [6]:
#tf.enable_eager_execution()

In [7]:
#import tensorflow.compat.v1 as tf

In [8]:
print("tf version = {} and py version = {}".format(tf.__version__, sys.version))

tf version = 2.7.0 and py version = 3.7.12 (default, Jan 15 2022, 18:48:18) 
[GCC 7.5.0]


In [9]:
assert sys.version_info.major == 3, 'please use python 3'
assert tf.test.gpu_device_name()!='', 'no GPU, please enable GPU'

In [10]:
#from tensorflow.keras.layers.experimental import preprocessing


In [40]:
! curl -sSLO https://github.com/ojuba-org/arabic-ml-data/archive/refs/heads/master.zip

In [48]:
! unzip -o -q master.zip "arabic-ml-data-master/corpora/quran/*.txt"

In [49]:
! cat arabic-ml-data-master/corpora/quran/*.txt > data.txt; wc -l data.txt

6236 data.txt


In [50]:
lines=[l.strip() for l in open('data.txt', 'r') if l.strip()]
s=set()
for l in lines: s.update(l)

In [51]:
alpha=sorted(s)

In [52]:
tashkeel_set = { ch for ch in alpha if unicodedata.category(ch)=='Mn' }

In [53]:
def tokenize_tashkeel(line):
  offs=[ ix for ix, ch in enumerate(line) if ch not in tashkeel_set ]
  offs.append(len(line))
  return [ line[off: offs[ix+1]] for ix, off in enumerate(offs[:-1]) ]

In [54]:
(
  VOC_PAD,
  VOC_START,
  VOC_UNKNOWN,
  VOC_UNKNOWN_TASHKEEL, VOC_UNKNOWN_SHADDA,
  VOC_UNKNOWN_LETTER, VOC_UNKNOWN_HAMZA, VOC_UNKNOWN_TEH_OR_HAA, VOC_UNKNOWN_MAQSORA,
  VOC_FATHATAN, VOC_DAMMATAN, VOC_KASRATAN, 
  VOC_FATHA, VOC_DAMMA, VOC_KASRA,
  VOC_SHADDA_FATHA, VOC_SHADDA_DAMMA, VOC_SHADDA_KASRA,
  VOC_SUKUN,
  VOC_STOP,
) = range(20)

In [55]:
chars = [ ch for ch in alpha if unicodedata.category(ch)!='Mn']

In [56]:
ch2id = { ch: ix+VOC_STOP+1 for ix, ch in enumerate(chars)}

In [57]:
ch2id.update({
  "": VOC_START,
  "\n": VOC_STOP,
  "\u064B": VOC_FATHATAN, "\u064C": VOC_DAMMATAN, "\u064D": VOC_KASRATAN,
  "\u064E": VOC_FATHA, "\u064F": VOC_DAMMA, "\u0650": VOC_KASRA,
  "\u0651\u064E": VOC_SHADDA_FATHA, "\u0651\u064F": VOC_SHADDA_DAMMA, "\u0651\u0650": VOC_SHADDA_KASRA,
  "\u0651": VOC_SUKUN,
  "\u061f": VOC_UNKNOWN,
  "\u0640\u061f\u0640": VOC_UNKNOWN_TASHKEEL,
  "\u0651\u061f": VOC_UNKNOWN_SHADDA,
})

In [58]:
id2ch = { v: k for k, v in ch2id.items() }

In [100]:
id2ch2 = dict(id2ch)
id2ch2.update({
    VOC_START: "", "\n": VOC_STOP, VOC_UNKNOWN: "", VOC_UNKNOWN_TASHKEEL: "", VOC_UNKNOWN_SHADDA: "ّ",
})

In [59]:
voc_size = max(ch2id.values())+1

In [60]:
id2ch.update({ ix:'\u061f' for ix in range(voc_size) if ix not in id2ch})

In [61]:
id_repr = { globals()[i]: i for i in dir() if i.startswith('VOC_')}
id_repr.update({i: unicodedata.name(id2ch[i]) for i in range(VOC_STOP+1, voc_size)})

In [62]:
def tokenize_one_id(sub):
  return (ch2id.get(sub[0], VOC_UNKNOWN),
          ch2id.get(sub[1:] or "\u0640\u061f\u0640", VOC_UNKNOWN_TASHKEEL),
  )

def tokenize_tashkeel_id(line):
  return [(VOC_START, VOC_UNKNOWN_TASHKEEL)]+[tokenize_one_id(sub) for sub in tokenize_tashkeel(line.strip()+"\n")]

def tr_all_unknown(seq):
  return [ (con_id, VOC_UNKNOWN_TASHKEEL) for con_id, vo_id in seq ]

def pad_one(tuples, size):
  l = len(tuples)
  return (tuples + [(VOC_PAD, VOC_PAD)]*(size-l)) if l<size else tuples


In [63]:
line=lines[0]
print(line)
print(len(line))
ll = [ len(tokenize_tashkeel(l)) for l in lines ]
print(max(ll))
print(np.percentile(ll, (80, 90, 95, 99)))


بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ
37
679
[ 94.  125.5 154.  242. ]


In [69]:
# consonant, no tashkeel
max_con = 200
filtered = [ tokenize_tashkeel_id(l) for l in lines if len(tokenize_tashkeel_id(l))<max_con ]
random.shuffle(filtered)

tokens=tokenize_tashkeel(line)
print(tokens)
print([ len(token) for token in tokens])
print(voc_size)

['بِ', 'سْ', 'مِ', ' ', 'ا', 'ل', 'لَّ', 'هِ', ' ', 'ا', 'ل', 'رَّ', 'حْ', 'مَ', 'نِ', ' ', 'ا', 'ل', 'رَّ', 'حِ', 'ي', 'مِ']
[2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 2]
57


In [70]:
for i, (con_ix, vol_id) in enumerate(tokenize_tashkeel_id(line)):
  print(i, con_ix, id_repr[con_ix], vol_id, id_repr[vol_id])

0 1 VOC_START 3 VOC_UNKNOWN_TASHKEEL
1 28 ARABIC LETTER BEH 14 VOC_KASRA
2 39 ARABIC LETTER SEEN 3 VOC_UNKNOWN_TASHKEEL
3 51 ARABIC LETTER MEEM 14 VOC_KASRA
4 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
5 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
6 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
7 50 ARABIC LETTER LAM 15 VOC_SHADDA_FATHA
8 53 ARABIC LETTER HEH 14 VOC_KASRA
9 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
10 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
11 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
12 37 ARABIC LETTER REH 15 VOC_SHADDA_FATHA
13 33 ARABIC LETTER HAH 3 VOC_UNKNOWN_TASHKEEL
14 51 ARABIC LETTER MEEM 12 VOC_FATHA
15 52 ARABIC LETTER NOON 14 VOC_KASRA
16 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
17 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
18 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
19 37 ARABIC LETTER REH 15 VOC_SHADDA_FATHA
20 33 ARABIC LETTER HAH 14 VOC_KASRA
21 56 ARABIC LETTER YEH 3 VOC_UNKNOWN_TASHKEEL
22 51 ARABIC LETTER MEEM 14 VOC_KASRA
23 19 VOC_STOP 3 VOC_UNKNOWN_TASHKEEL


In [71]:
for i, (con_ix, vol_id) in enumerate(tr_all_unknown(tokenize_tashkeel_id(line))):
  print(i, con_ix, id_repr[con_ix], vol_id, id_repr[vol_id])

0 1 VOC_START 3 VOC_UNKNOWN_TASHKEEL
1 28 ARABIC LETTER BEH 3 VOC_UNKNOWN_TASHKEEL
2 39 ARABIC LETTER SEEN 3 VOC_UNKNOWN_TASHKEEL
3 51 ARABIC LETTER MEEM 3 VOC_UNKNOWN_TASHKEEL
4 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
5 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
6 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
7 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
8 53 ARABIC LETTER HEH 3 VOC_UNKNOWN_TASHKEEL
9 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
10 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
11 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
12 37 ARABIC LETTER REH 3 VOC_UNKNOWN_TASHKEEL
13 33 ARABIC LETTER HAH 3 VOC_UNKNOWN_TASHKEEL
14 51 ARABIC LETTER MEEM 3 VOC_UNKNOWN_TASHKEEL
15 52 ARABIC LETTER NOON 3 VOC_UNKNOWN_TASHKEEL
16 20 SPACE 3 VOC_UNKNOWN_TASHKEEL
17 27 ARABIC LETTER ALEF 3 VOC_UNKNOWN_TASHKEEL
18 50 ARABIC LETTER LAM 3 VOC_UNKNOWN_TASHKEEL
19 37 ARABIC LETTER REH 3 VOC_UNKNOWN_TASHKEEL
20 33 ARABIC LETTER HAH 3 VOC_UNKNOWN_TASHKEEL
21 56 ARABIC LETTER YEH 3 VOC_UNKNOWN_TASHKEEL
22 51 ARABIC L

In [72]:
max_len = 2*max_con
max_w_pad = max_len+1

latent_dim = voc_size
num_encoder_tokens = voc_size
num_decoder_tokens = voc_size

In [75]:
input_output_a = [ (pad_one(tr_all_unknown(l), max_con), pad_one(l, max_con)) for l in filtered ]
input_output_a = np.array(input_output_a).reshape(-1, 2, max_len)
input_output_a = np.pad(input_output_a, ((0,0), (0,0), (0,1)))
print(input_output_a.shape)


(6102, 2, 401)


In [78]:
encoder_input_data = K.utils.to_categorical(input_output_a[:,0,:], voc_size)
print(encoder_input_data.shape)
decoder_input_data_ = input_output_a[:,1,:]
decoder_input_data = K.utils.to_categorical(decoder_input_data_, voc_size)
print(decoder_input_data.shape)
decoder_target_data = K.utils.to_categorical(np.pad(decoder_input_data_[:,1:], ((0,0), (0,1))), voc_size)
print(decoder_target_data.shape)

(6102, 401, 57)
(6102, 401, 57)
(6102, 401, 57)


In [79]:
# Define an input sequence and process it.
encoder_inputs = KL.Input(shape=(None, num_encoder_tokens))
encoder = KL.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = KL.Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = KL.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = KL.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = K.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None, 57)]   0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None, 57)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 57),         26220       ['input_3[0][0]']                
                                 (None, 57),                                                      
                                 (None, 57)]                                                      
                                                                                            

In [80]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [81]:
weights_fn = 'lstm-weights.h5'
if os.path.exists(weights_fn):
    model.load_weights(weights_fn)

In [None]:
cb = K.callbacks.ModelCheckpoint(filepath=weights_fn, save_weights_only=True, verbose=0)

In [92]:
batch_size = 512
epochs = 30

# training part

In [93]:
# epochs=epochs, steps_per_epoch=steps_per_epoch,
model.fit(
  [encoder_input_data, decoder_input_data],
  decoder_target_data, batch_size=batch_size, epochs=epochs, validation_split=0.2,
  callbacks=[cb]
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f5669880cd0>

# Inference model

In [87]:
# Define sampling models
encoder_model = K.Model(encoder_inputs, encoder_states)

decoder_state_input_h = KL.Input(shape=(latent_dim,))
decoder_state_input_c = KL.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = K.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [88]:
encoder_model.summary()
decoder_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None, 57)]        0         
                                                                 
 lstm (LSTM)                 [(None, 57),              26220     
                              (None, 57),                        
                              (None, 57)]                        
                                                                 
Total params: 26,220
Trainable params: 26,220
Non-trainable params: 0
_________________________________________________________________
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, None, 57)]   0           []                               
     

In [89]:
I = np.identity(voc_size)

def my_to_categorical(v):
  return np.array([ I[ix] for ix in v])


In [106]:
def tashkeel(line):
  my_input_seq = np.array(tokenize_tashkeel_id(line)).reshape(-1)
  max_len = len(my_input_seq)
  
  my_input = my_to_categorical(my_input_seq)
  my_input = np.expand_dims(my_input, 0)
  
  sampled = []
  sampled_ix = VOC_START
  states_value = encoder_model.predict(my_input)
  for i, ix in enumerate(my_input_seq):
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    if ix>VOC_STOP: sampled_ix = ix
    print(id_repr[sampled_ix])
    sampled.append(sampled_ix)
    target_seq[0, 0, sampled_ix] = 1.
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    states_value = [h, c]
    # Sample a token
    sampled_ix = np.argmax(output_tokens[0, -1, :])
    if sampled_ix == VOC_STOP or len(sampled) >= max_len: break
  return "".join([ id2ch2.get(ix, "") for ix in sampled ])

In [110]:
line = "ذهب إلى السوق"
print(tashkeel(line))

VOC_START
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER THAL
VOC_FATHA
ARABIC LETTER HEH
VOC_DAMMA
ARABIC LETTER BEH
VOC_UNKNOWN_TASHKEEL
SPACE
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER ALEF WITH HAMZA BELOW
VOC_KASRA
ARABIC LETTER LAM
VOC_SHADDA_FATHA
ARABIC LETTER ALEF MAKSURA
VOC_UNKNOWN_TASHKEEL
SPACE
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER ALEF
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER LAM
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER SEEN
VOC_SHADDA_FATHA
ARABIC LETTER WAW
VOC_FATHA
ARABIC LETTER QAF
VOC_FATHA
ARABIC LETTER TEH MARBUTA
VOC_FATHA
ذَهُب إِلَّى السَّوَقَةَ


In [99]:
ch2id

{'': 1,
 '\n': 19,
 ' ': 20,
 '؟': 2,
 'ء': 21,
 'آ': 22,
 'أ': 23,
 'ؤ': 24,
 'إ': 25,
 'ئ': 26,
 'ا': 27,
 'ب': 28,
 'ة': 29,
 'ت': 30,
 'ث': 31,
 'ج': 32,
 'ح': 33,
 'خ': 34,
 'د': 35,
 'ذ': 36,
 'ر': 37,
 'ز': 38,
 'س': 39,
 'ش': 40,
 'ص': 41,
 'ض': 42,
 'ط': 43,
 'ظ': 44,
 'ع': 45,
 'غ': 46,
 'ـ؟ـ': 3,
 'ف': 47,
 'ق': 48,
 'ك': 49,
 'ل': 50,
 'م': 51,
 'ن': 52,
 'ه': 53,
 'و': 54,
 'ى': 55,
 'ي': 56,
 'ً': 9,
 'ٌ': 10,
 'ٍ': 11,
 'َ': 12,
 'ُ': 13,
 'ِ': 14,
 'ّ': 18,
 'ّ؟': 4,
 'َّ': 15,
 'ُّ': 16,
 'ِّ': 17}

In [94]:


more = True

while more:
    output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

    # Sample a token
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    print(id_repr[sampled_token_index])
    sampled.append(sampled_token_index)

    more = sampled_token_index != VOC_STOP and len(sampled) < max_len
    # Update the target sequence (of length 1).
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    # Update states
    states_value = [h, c]

shape: (18,)
my input shape: (18, 57)
my input shape: (1, 18, 57)
my target shape: (1, 1, 57)
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER WAW
VOC_FATHA
ARABIC LETTER ALEF
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER LAM
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER LAM
VOC_SHADDA_FATHA
ARABIC LETTER HEH
VOC_FATHA
SPACE
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER WAW
VOC_FATHA
ARABIC LETTER ALEF
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER LAM
VOC_UNKNOWN_TASHKEEL
ARABIC LETTER LAM


# Below cells are not yet used

In [None]:
import functools

In [None]:
rnn = functools.partial(tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [44]:
tf.keras.utils.to_categorical([0, 1, 2, 3], num_classes=voc_size, dtype='float32')

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units):
  model = tf.keras.Sequential([
    rnn(rnn_units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
        stateful=True),
  ])
  return model
