In [1]:
!pip install pyarabic
!git clone https://github.com/moaaztaha/Arabic-English-Translation-Transformers

Collecting pyarabic
[?25l  Downloading https://files.pythonhosted.org/packages/7b/e2/46728ec2f6fe14970de5c782346609f0636262c0941228f363710903aaa1/PyArabic-0.6.10.tar.gz (108kB)
[K     |███                             | 10kB 23.3MB/s eta 0:00:01[K     |██████                          | 20kB 16.7MB/s eta 0:00:01[K     |█████████                       | 30kB 14.2MB/s eta 0:00:01[K     |████████████                    | 40kB 13.1MB/s eta 0:00:01[K     |███████████████                 | 51kB 7.9MB/s eta 0:00:01[K     |██████████████████              | 61kB 9.2MB/s eta 0:00:01[K     |█████████████████████           | 71kB 8.7MB/s eta 0:00:01[K     |████████████████████████        | 81kB 9.5MB/s eta 0:00:01[K     |███████████████████████████     | 92kB 8.8MB/s eta 0:00:01[K     |██████████████████████████████  | 102kB 7.7MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 7.7MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel for py

In [2]:
# modules
import random
import string
import re
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import pandas as pd
from tqdm import tqdm

import gensim
import numpy as np

import pyarabic.araby as araby
from pyarabic.araby import strip_tashkeel, strip_tatweel

### Data Preprocessing 

In [3]:
ar = pd.read_table('/content/Arabic-English-Translation-Transformers/ArabicNewData.txt', delimiter='\\n', names=['ar'])
en = pd.read_table('/content/Arabic-English-Translation-Transformers/EnglishNewData.txt', delimiter='\\n', names=['en'])

en['ar'] = ar['ar']
df = en.copy()
df = df.iloc[:35118]

  return read_csv(**locals())


### Arabic Preprocessing
- removing tashkeel
- removing tatweel
- normalize hamza 
- split 'ال'

In [4]:
morphs = [strip_tashkeel, strip_tatweel]

def fix_ar(sent):
  sent = split_al_sent(sent)
  tokens = araby.tokenize(sent, morphs=morphs)
  sent = araby.normalize_hamza(' '.join(tokens), method='tasheel')
  return sent

In [5]:
def split_al(word):
    if word.startswith('ال'):
        return word[:2], word[2:]
    else: 
        return word

def split_al_sent(sent):
    ww = []
    for word in sent.split():
        out = split_al(word)
        if type(out) is tuple:
            for w in out:
                ww.append(w)
        else:
            ww.append(word)
    return ' '.join(w for w in ww)

In [6]:
df['ar'] = df.apply(lambda row: fix_ar(row.ar), axis=1)

In [7]:
# getting text pairs
text_pairs = []
for idx, row in df.iterrows():
  en, ar = row['en'], row['ar']
  ar = "[start] " + ar + " [end]"
  text_pairs.append((en, ar))

In [8]:
for idx, row in df.iterrows():
  if len(row.ar.split()) < 1:
    print(row.ar, '\n*')
    print(row.en)

In [9]:
for _ in range(2):
    print(random.choice(text_pairs))

("Chief Secretary Kim hasn't given me any files or information on the architect yet.", '[start] انا لم احصل علي ال ملفات من . ال سكرتير كيم بعد [end]')
('Excellent.', '[start] ممتاز [end]')


In [10]:
len(text_pairs)

35118

In [11]:
# spliting data into train and validate
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) -  num_val_samples
train_pairs = text_pairs[: num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples + num_val_samples]

In [12]:
print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

35118 total pairs
29851 training pairs
5267 validation pairs


#### Vectorizing the text data 

In [13]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


vocab_size = 20000
sequence_length = 50
batch_size = 265

def custom_standardization(input_string):
    return tf.strings.regex_replace(input_string, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
            # max_tokens=vocab_size, 
            output_mode='int', 
            output_sequence_length=sequence_length)

ar_vectorization = TextVectorization(
    # max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
    max_tokens=vocab_size)

eng_texts = [pair[0] for pair in text_pairs]
ar_texts = [pair[1] for pair in text_pairs]
eng_vectorization.adapt(eng_texts)
ar_vectorization.adapt(ar_texts)

In [14]:
len(ar_vectorization.get_vocabulary()), len(eng_vectorization.get_vocabulary())

(20000, 13164)

In [15]:
len(ar_vectorization.get_vocabulary()), len(eng_vectorization.get_vocabulary())

(20000, 13164)

In [16]:
# making the dataset
def format_dataset(eng, ar):
    eng = eng_vectorization(eng)
    ar = ar_vectorization(ar)
    return ({"encoder_inputs": eng, "decoder_inputs": ar[:, :-1],}, ar[:, 1:])


def make_dataset(pairs):
    eng_texts, ar_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    ar_texts = list(ar_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ar_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

In [17]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [18]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (265, 50)
inputs["decoder_inputs"].shape: (265, 50)
targets.shape: (265, 50)


### Building the Model 

In [44]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):

      config = super().get_config().copy()
      config.update({
          'embed_dim': self.embed_dim,
          'dense_dim': self.dense_dim,
          'num_heads': self.num_heads,
      })
      return config

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, pretrained=False, weights=False, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        if not pretrained:
          self.token_embeddings = layers.Embedding(
              input_dim=vocab_size, output_dim=embed_dim
          )
        else:
          # pre-trained
          self.token_embeddings = layers.Embedding(
              input_dim=vocab_size, output_dim=embed_dim, weights=[weights]
          ) 

        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
      
    def get_config(self):

      config = super().get_config().copy()
      config.update({
      'sequence_length': self.sequence_length,
      'vocab_size': self.vocab_size,
      'embed_dim': self.embed_dim,
      })
      return config

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True


    def get_config(self):

      config = super().get_config().copy()
      config.update({
      'embed_dim': self.embed_dim,
      'latent_dim': self.latent_dim,
      'num_heads': self.num_heads,
      })
      return config

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [24]:
ar_vocab_size = len(ar_vectorization.get_vocabulary())
en_vocab_size = len(eng_vectorization.get_vocabulary())
en_vocab_size, ar_vocab_size

(13164, 20000)

In [46]:
# build the model
embed_dim = 300
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
#x = PositionalEmbedding(sequence_length, en_vocab_size, embed_dim, pretrained=True, weights=english_embeddings)(encoder_inputs)
x = PositionalEmbedding(sequence_length, en_vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
#x = PositionalEmbedding(sequence_length, ar_vocab_size, embed_dim, pretrained=True, weights=arabic_embeddings)(decoder_inputs)
x = PositionalEmbedding(sequence_length, ar_vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(ar_vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [47]:
googledrive_path = '/content/drive/MyDrive/Transformers/final_vanilla/'

### Callbacks
- Early Stopping 
- Saving weights 
- Learning Rate Schedular 
- Tensorboard

In [48]:
early_stopping_cb = callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=True)
checkpoint_cb = callbacks.ModelCheckpoint(googledrive_path+'/weights_adam.ckpt', monitor='val_accuracy', save_weights_only=True,verbose=True, save_best_only=True)
tensorboard_callback = callbacks.TensorBoard(log_dir=googledrive_path+"/logs")
lr_schr = keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', patience=2, verbose=True, factor=0.3, min_lr=0.0001)
cbs = [early_stopping_cb, checkpoint_cb, tensorboard_callback, lr_schr]

In [49]:
epochs = 100  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=cbs)

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding_2 (Positio (None, None, 300)    3964200     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
transformer_encoder_1 (Transfor (None, None, 300)    4119848     positional_embedding_2[0][0]     
________________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7fc32390f8d0>

In [29]:
# Loading the latest checkpoint
latest = tf.train.latest_checkpoint(googledrive_path)
transformer.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc33e1d80d0>

In [50]:
ar_vocab = ar_vectorization.get_vocabulary()
ar_index_lookup = dict(zip(range(len(ar_vocab)), ar_vocab))
max_decoded_sentence_length = sequence_length


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ar_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ar_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


test_eng_texts = [pair[0] for pair in val_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts[:30])
    translated = decode_sequence(input_sentence)
    print(input_sentence, '\n', translated)
    print('*'*50)

Who called them? 
 [start] من هم ؟ [end]
**************************************************
You can't run into the rainy season when you're trying to make an outdoor picture. 
 [start] انت لا تستطيع ان [UNK] [UNK] في موسم انت [UNK] الى ال مجرم [end]
**************************************************
The village of Aceitunilla is in one of the poorest valleys. 
 [start] ان ال جمهور والتصفيق كله لى فى احد [end]
**************************************************
You can't run into the rainy season when you're trying to make an outdoor picture. 
 [start] انت لا تستطيع ان [UNK] [UNK] في موسم انت [UNK] الى ال مجرم [end]
**************************************************
Who's the person that used to deliver milk to that house? 
 [start] من ال شخص ال ذي كان يهتم به ؟ [end]
**************************************************
Though not that one. 
 [start] هذا ليس بالامر [end]
**************************************************
- Southwest? Well, there is nothing for thousands of miles. 
 [start]

In [51]:
def get_bleu():
  
  preds, src = [], []

  with tqdm(total=len(val_pairs), position=0, leave=True) as pbar:
    for en_sent, ar_sent in tqdm(val_pairs, position=0, leave=True):
      translated = decode_sequence(en_sent)
      preds.append(translated.split())
      src.append(ar_sent.split())
      pbar.update()

    return src, preds
    # print_scores(src, preds)

In [52]:
# !!pip install Rouge
from rouge import Rouge
from nltk.translate.bleu_score import corpus_bleu
def print_scores(trgs, preds):
    print('----- Bleu-n Scores -----')
    print("1:", corpus_bleu(trgs, preds, weights=[1.0/1.0])*100)
    print("2:", corpus_bleu(trgs, preds, weights=[1.0/2.0, 1.0/2.0])*100)
    print("3:", corpus_bleu(trgs, preds, weights=[1.0/3.0, 1.0/3.0, 1.0/3.0])*100)
    print("4:", corpus_bleu(trgs, preds)*100)
    print('-'*25)
    print('----- Rouge Scores -----')
    rouge = Rouge()
    scores = rouge.get_scores([" ".join(i) for i in preds ], [" ".join(i) for i in src ], avg=True)
    for key, item in scores.items():
      print(key,':',item)

In [53]:
src, preds = get_bleu()

100%|██████████| 5267/5267 [19:34<00:00,  4.48it/s]
100%|██████████| 5267/5267 [19:34<00:00,  4.48it/s]


In [54]:
print_scores(src, preds)

----- Bleu-n Scores -----
1: 2.788899627226287


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


2: 16.699998883911
3: 30.325708872140204
4: 40.86563211784568
-------------------------
----- Rouge Scores -----
rouge-1 : {'f': 0.48335406725543123, 'p': 0.5094369421223762, 'r': 0.4848706965425694}
rouge-2 : {'f': 0.15959647282114406, 'p': 0.1671309138689211, 'r': 0.15888596579843062}
rouge-l : {'f': 0.4879406998746469, 'p': 0.5158916953942765, 'r': 0.4849411907950849}
