<a href="https://colab.research.google.com/github/ojuba-org/arabic-ml-data/blob/master/tashkeel_inference_tf2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TensorFlow documentation:

* [v2](https://www.tensorflow.org/tutorials/text/text_generation)
* [v1](https://github.com/tensorflow/docs/blob/master/site/en/r1/tutorials/sequences/text_generation.ipynb)

In [1]:
# Select the TensorFlow 2.x runtime.
# NOTE(review): `%tensorflow_version` appears to be a Colab-specific line magic;
# it is presumably unavailable in a plain Jupyter kernel - confirm before running elsewhere.
%tensorflow_version 2.x

In [2]:
import sys
import os
import time
import random

In [3]:
import re
import unicodedata

In [4]:
import numpy as np

In [5]:
import tensorflow as tf

In [6]:
# Short aliases used by the model-building cells below.
K = tf.keras
KL = tf.keras.layers

In [7]:
# Record the runtime versions next to the results for reproducibility.
print(f"tf version = {tf.__version__} and py version = {sys.version}")

tf version = 2.7.0 and py version = 3.7.12 (default, Jan 15 2022, 18:48:18) 
[GCC 7.5.0]


In [8]:
# Environment guards: stop early when the interpreter or accelerator is not
# what the notebook expects.
assert sys.version_info[0] == 3, 'please use python 3'
# gpu_device_name() yields an empty string when no GPU is visible.
assert tf.test.gpu_device_name(), 'no GPU, please enable GPU'

In [9]:
# Fetch the pretrained weights into ./pretrained.
#   mkdir -p : re-running the cell no longer prints "File exists".
#   curl -f  : fail on an HTTP error instead of writing the server's error
#              page into the .h5 file.
! mkdir -p pretrained ; curl -fsSL -o pretrained/tashkeel-lstm-weights.h5 https://github.com/ojuba-org/arabic-ml-data/raw/master/pretrained/tashkeel-lstm-weights.h5

mkdir: cannot create directory ‘pretrained’: File exists


In [10]:
# The Arabic diacritic marks (tashkeel) that attach to a preceding letter.
# Written as escapes because the combining marks are hard to read as literals.
tashkeel_set = {
  '\u064B',  # ARABIC FATHATAN
  '\u064C',  # ARABIC DAMMATAN
  '\u064D',  # ARABIC KASRATAN
  '\u064E',  # ARABIC FATHA
  '\u064F',  # ARABIC DAMMA
  '\u0650',  # ARABIC KASRA
  '\u0651',  # ARABIC SHADDA
  '\u0652',  # ARABIC SUKUN
}

In [11]:
def tokenize_tashkeel(line):
  """Split `line` into groups of one base character plus its trailing diacritics.

  A new group starts at every character that is not in `tashkeel_set`; the
  diacritics that follow it belong to the same group. Diacritics appearing
  before the first base character are not part of any group.

  Returns a list of substrings of `line` (empty list for an empty line).
  """
  starts = []
  for pos, ch in enumerate(line):
    if ch not in tashkeel_set:
      starts.append(pos)
  # Each group ends where the next one begins; the last one ends with the line.
  ends = starts[1:] + [len(line)]
  return [line[begin:end] for begin, end in zip(starts, ends)]

In [12]:
# Ids of the special vocabulary entries (0..19). Ordinary characters are
# numbered after VOC_STOP.

# Sequence control and generic placeholder.
VOC_PAD, VOC_START, VOC_UNKNOWN = 0, 1, 2
# Placeholders for an undetermined diacritic.
VOC_UNKNOWN_TASHKEEL, VOC_UNKNOWN_SHADDA = 3, 4
# Placeholders for undetermined letter classes.
VOC_UNKNOWN_LETTER, VOC_UNKNOWN_HAMZA, VOC_UNKNOWN_TEH_OR_HAA, VOC_UNKNOWN_MAQSORA = 5, 6, 7, 8
# Tanween.
VOC_FATHATAN, VOC_DAMMATAN, VOC_KASRATAN = 9, 10, 11
# Short vowels.
VOC_FATHA, VOC_DAMMA, VOC_KASRA = 12, 13, 14
# Shadda combined with a short vowel.
VOC_SHADDA_FATHA, VOC_SHADDA_DAMMA, VOC_SHADDA_KASRA = 15, 16, 17
VOC_SUKUN = 18
# End-of-line marker; must stay the largest special id.
VOC_STOP = 19

In [13]:
# The base characters the model knows: space plus the Arabic letters.
# The list was apparently derived from a text `s` that is not part of this
# notebook, roughly as:
#   alpha = sorted(s)
#   tashkeel_set = { ch for ch in alpha if unicodedata.category(ch)=='Mn' }
#   chars = [ ch for ch in alpha if unicodedata.category(ch)!='Mn']
# It is kept sorted because ids are assigned by position.
chars = sorted([
  ' ', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د',
  'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي',
])

In [14]:
# Ordinary characters get consecutive ids right after the special tokens.
ch2id = {ch: char_id for char_id, ch in enumerate(chars, start=VOC_STOP + 1)}

In [15]:
# Register the non-letter tokens: sequence markers, diacritic strings and
# the textual stand-ins for the "unknown" placeholders.
ch2id.update({
  # The empty string stands for the start marker, newline for the stop marker.
  "": VOC_START,
  "\n": VOC_STOP,
  # Tanween: U+064B FATHATAN, U+064C DAMMATAN, U+064D KASRATAN.
  "\u064B": VOC_FATHATAN, "\u064C": VOC_DAMMATAN, "\u064D": VOC_KASRATAN,
  # Short vowels: U+064E FATHA, U+064F DAMMA, U+0650 KASRA.
  "\u064E": VOC_FATHA, "\u064F": VOC_DAMMA, "\u0650": VOC_KASRA,
  # U+0651 SHADDA followed by a short vowel.
  "\u0651\u064E": VOC_SHADDA_FATHA, "\u0651\u064F": VOC_SHADDA_DAMMA, "\u0651\u0650": VOC_SHADDA_KASRA,
  # NOTE(review): U+0651 is SHADDA; SUKUN is U+0652, which has no entry here and
  # therefore falls back to VOC_UNKNOWN_TASHKEEL in tokenize_one_id. This looks
  # like a typo, but the pretrained weights were presumably trained with this
  # exact mapping - confirm against the training notebook before changing it.
  "\u0651": VOC_SUKUN,
  # Placeholders rendered with U+061F ARABIC QUESTION MARK (and U+0640 TATWEEL).
  "\u061f": VOC_UNKNOWN,
  "\u0640\u061f\u0640": VOC_UNKNOWN_TASHKEEL,
  "\u0651\u061f": VOC_UNKNOWN_SHADDA,
})

In [16]:
# Reverse table: token id -> the string it was registered under.
id2ch = dict(zip(ch2id.values(), ch2id.keys()))

In [17]:
# Rendering table (token id -> output text) used by tashkeel() when joining
# the decoded ids: same as id2ch, except that placeholder tokens render as
# nothing and an unknown shadda renders as a bare shadda.
id2ch2 = dict(id2ch)
id2ch2.update({
    VOC_START: "",
    # Keys of this table are ids. The original entry was written the other way
    # round ('"\n": VOC_STOP'), which only added a stray string key.
    VOC_STOP: "\n",
    VOC_UNKNOWN: "",
    VOC_UNKNOWN_TASHKEEL: "",
    VOC_UNKNOWN_SHADDA: "\u0651",  # U+0651 ARABIC SHADDA
})

In [18]:
# Ids are zero-based, so the vocabulary size is the largest id plus one.
voc_size = 1 + max(ch2id.values())

In [19]:
# Ids that have no string of their own are shown as U+061F ARABIC QUESTION MARK.
id2ch.update(dict.fromkeys(
    (token_id for token_id in range(voc_size) if token_id not in id2ch),
    '\u061f',
))

In [20]:
# Human-readable label per token id (printed by tashkeel(..., echo=True)).
# Special tokens are labelled with the name of their VOC_* constant ...
id_repr = {
    globals()[const_name]: const_name
    for const_name in dir()
    if const_name.startswith('VOC_')
}
# ... and ordinary characters with their Unicode character name.
id_repr.update(
    (char_id, unicodedata.name(id2ch[char_id]))
    for char_id in range(VOC_STOP + 1, voc_size)
)

In [21]:
def tokenize_one_id(sub):
  """Convert one base-character group into a `(letter_id, diacritic_id)` pair.

  `sub` is a non-empty string: a base character followed by zero or more
  diacritics. A base character missing from `ch2id` becomes VOC_UNKNOWN; an
  absent or unrecognised diacritic string becomes VOC_UNKNOWN_TASHKEEL.
  """
  letter_id = ch2id.get(sub[0], VOC_UNKNOWN)
  marks = sub[1:]
  if not marks:
    # No diacritics at all: look up the explicit "unknown tashkeel" stand-in.
    marks = "\u0640\u061f\u0640"
  mark_id = ch2id.get(marks, VOC_UNKNOWN_TASHKEEL)
  return (letter_id, mark_id)

def tokenize_tashkeel_id(line):
  """Encode a text line as a list of `(letter_id, diacritic_id)` pairs.

  The line is stripped, terminated with a newline (the stop token) and
  prefixed with a start pair, so even an empty line yields two pairs.
  """
  start_pair = (VOC_START, VOC_UNKNOWN_TASHKEEL)
  groups = tokenize_tashkeel(line.strip() + "\n")
  return [start_pair] + list(map(tokenize_one_id, groups))

def tr_all_unknown(seq):
  """Return a copy of `seq` with every diacritic id replaced by VOC_UNKNOWN_TASHKEEL.

  `seq` is an iterable of `(letter_id, diacritic_id)` pairs; the letter ids
  are kept unchanged.
  """
  masked = []
  for letter_id, _mark_id in seq:
    masked.append((letter_id, VOC_UNKNOWN_TASHKEEL))
  return masked

def pad_one(tuples, size):
  """Right-pad a list of pairs with `(VOC_PAD, VOC_PAD)` up to `size` entries.

  A list that already has `size` or more entries is returned as is (the same
  object, not a copy, and never truncated).
  """
  missing = size - len(tuples)
  if missing <= 0:
    return tuples
  return tuples + [(VOC_PAD, VOC_PAD)] * missing


In [22]:
# Sequence-length limits.
# NOTE(review): these three appear unused by the inference cells in this
# notebook - presumably carried over from the training notebook.
max_con = 200
max_len = max_con * 2
max_w_pad = 1 + max_len

# Inputs and outputs are one-hot vectors over the vocabulary, and the LSTM
# state has the same width.
latent_dim = num_encoder_tokens = num_decoder_tokens = voc_size

In [23]:
# Training-time graph of the encoder/decoder LSTM. It is rebuilt here so the
# pretrained weights can be loaded into `model`; the inference models defined
# further down reuse the same layer objects.

# Define an input sequence and process it.
# Shape (None, num_encoder_tokens): variable-length sequences of vectors of
# width num_encoder_tokens (tashkeel() feeds one-hot rows).
encoder_inputs = KL.Input(shape=(None, num_encoder_tokens))
encoder = KL.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = KL.Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = KL.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Per-step probability distribution over the vocabulary.
decoder_dense = KL.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = K.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 57)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 57)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 57),         26220       ['input_1[0][0]']                
                                 (None, 57),                                                      
                                 (None, 57)]                                                      
                                                                                              

In [24]:
# Same optimizer/loss pair the notebook has always used for this model.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
)

In [25]:
weights_fn = 'pretrained/tashkeel-lstm-weights.h5'
# This notebook only does inference, so running without the pretrained
# weights would silently produce output from a randomly initialised model.
# Fail loudly instead of skipping the load.
if not os.path.isfile(weights_fn):
    raise FileNotFoundError(
        "pretrained weights not found at {!r}; re-run the download cell".format(weights_fn))
model.load_weights(weights_fn)

# Inference model

In [26]:
# Define sampling models
encoder_model = K.Model(encoder_inputs, encoder_states)

decoder_state_input_h = KL.Input(shape=(latent_dim,))
decoder_state_input_c = KL.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = K.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [27]:
# Print the layer tables of the two inference models as a sanity check.
encoder_model.summary()
decoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 57)]        0         
                                                                 
 lstm (LSTM)                 [(None, 57),              26220     
                              (None, 57),                        
                              (None, 57)]                        
                                                                 
Total params: 26,220
Trainable params: 26,220
Non-trainable params: 0
_________________________________________________________________
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None, 57)]   0           []                               
     

In [28]:
# Row k of the identity matrix is the one-hot vector of token id k.
I = np.eye(voc_size)

def my_to_categorical(v):
  """One-hot encode an iterable of token ids.

  Each id selects the matching row of the module-level identity matrix `I`;
  the rows are stacked into a new array, one row per id.
  """
  one_hot_rows = []
  for token_id in v:
    one_hot_rows.append(I[token_id])
  return np.array(one_hot_rows)


In [32]:
def tashkeel(line, echo=False):
  """Add diacritics (tashkeel) to an Arabic text line with the seq2seq model.

  The line is encoded as a flat sequence of alternating letter / diacritic
  ids and fed to the encoder. The decoder is then run one slot at a time:
  slots holding an input letter (id > VOC_STOP) are forced to that letter,
  every other slot takes the decoder's previous prediction.

  Args:
    line: text to diacritize.
    echo: when true, print the `id_repr` label of every emitted token.

  Returns:
    The decoded text, rendered through `id2ch2`.

  A predicted VOC_STOP ends decoding only once no input letters remain.
  Previously any STOP prediction ended the loop, even for a slot the next
  input letter would have overwritten, so a premature STOP silently dropped
  the rest of the caller's text.
  """
  my_input_seq = np.array(tokenize_tashkeel_id(line)).reshape(-1)

  my_input = my_to_categorical(my_input_seq)
  my_input = np.expand_dims(my_input, 0)

  # Position of the last slot that is forced by an input letter (-1 if none).
  letter_positions = np.flatnonzero(my_input_seq > VOC_STOP)
  last_letter_pos = letter_positions[-1] if letter_positions.size else -1

  sampled = []
  sampled_ix = VOC_START
  # verbose=0: newer Keras releases otherwise print a progress bar per call,
  # i.e. once per decoded token.
  states_value = encoder_model.predict(my_input, verbose=0)
  for pos, ix in enumerate(my_input_seq):
    if ix > VOC_STOP:
      # Letters always come from the input, never from the model.
      sampled_ix = ix
    elif sampled_ix == VOC_STOP:
      if pos > last_letter_pos:
        break
      # Premature STOP with letters still pending: leave this slot
      # undiacritized and keep decoding.
      sampled_ix = VOC_UNKNOWN_TASHKEEL
    if echo: print(id_repr[sampled_ix])
    sampled.append(sampled_ix)
    # Feed the emitted token back into the decoder as a one-hot step.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_ix] = 1.
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
    states_value = [h, c]
    # Greedy choice for the next slot.
    sampled_ix = np.argmax(output_tokens[0, -1, :])
  return "".join([ id2ch2.get(ix, "") for ix in sampled ])

In [33]:
# Smoke test: diacritize a fixed, undiacritized sample line.
line = "إنك لعلى خلق عظيم"
print(tashkeel(line))

إِنَّكَ لَعَلَى خَلَق عَظِيمٌ


In [35]:
# Interactive demo: read one line from the user and print it diacritized.
line = input("enter arabic line: ").strip()
print(tashkeel(line))

enter arabic line: سبحان الذي أسرى
سَبَحاًنَ الذَي أَسرَى 
