In [1]:
!git clone https://github.com/moaaztaha/Arabic-English-Translation-Transformers

Cloning into 'Arabic-English-Translation-Transformers'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 36 (delta 11), reused 32 (delta 7), pack-reused 0[K
Unpacking objects: 100% (36/36), done.


In [1]:
# modules
import random
import string
import re
from pathlib import Path
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import pandas as pd
from tqdm import tqdm

### Data Preprocessing 

In [2]:
en = pd.read_table('/content/Arabic-English-Translation-Transformers/data/eng/ac-test.en', delimiter='\\n', names=['en'])
ar = pd.read_table('/content/Arabic-English-Translation-Transformers/data/ara/test.en_ref.ar', delimiter='\\n', names=['ar'])
en['ar'] = ar['ar']
df = en.copy()
df.head()

  return read_csv(**locals())


Unnamed: 0,en,ar
0,"THE COUNCIL OF THE EUROPEAN ECONOMIC COMMUNITY,",مجلس الجماعة الاقتصادية الأوروبية
1,Whereas the adoption of a common transport pol...,حيث أن اعتماد سياسة نقل مشتركة تنطوي من بين أم...
2,Article 1,المادة 1
3,3. The types of carriage listed in Annex II sh...,3. لا تخضع أنواع النقل المدرجة في الملحق الثان...
4,Member States shall inform the Commission of t...,تبلغ الدول الأعضاء المفوضية الأوروبية بالتدابي...


In [3]:
text_pairs = []
for idx, row in df.iterrows():
    # split sentences
    if '.' in row['en'] and '.' in row['ar'] and len(row['en'].split()) > 100:
        en_sents = row['en'].split('.')
        ar_sents = row['ar'].split('.')
    
        for en_sent, ar_sent in zip(en_sents, ar_sents):
            ar_sent = "[start] " + ar_sent + " [end]"
            text_pairs.append((en_sent, ar))
    else:
        en, ar = row['en'], row['ar']
        ar = "[start] " + ar + " [end]"
        text_pairs.append((en, ar))

In [4]:
for _ in range(2):
    print(random.choice(text_pairs))

('Article 2', '[start] المادة 2 [end]')
('2. Member States shall also notify the Commission every month of the quantities sold during the previous month which may qualifiy for the allowance, broken down by commercial category and type of processing carried out, and of the expenditure relating to the grant of the allowance in question.', '[start] 2. يجب على الدول الأعضاء أيضاً إبلاغ المفوضية الأوروبية كل شهر بالكميات المباعة خلال الشهر السابق والتي قد تؤهل إلى الحصول على بدلات، مع توزيعها بحسب الفئة التجارية ونوع عملية المعالجة، وبالنفقات المتعلقة بمنح البدل المعني. [end]')


In [5]:
len(text_pairs)

4262

In [6]:
en = pd.read_table('/content/Arabic-English-Translation-Transformers/data/eng/ac-dev.en', delimiter='\\n', names=['en'])
ar = pd.read_table('/content/Arabic-English-Translation-Transformers/data/ara/tune.en_ref.ar', delimiter='\\n', names=['ar'])
#2725 to 3742
ar.drop(ar.loc[2725:3742].index,inplace=True)
#2720 to 3707
en.drop(en.loc[2725:3742].index,inplace=True)
en['ar'] = ar['ar']
df = en.copy()
df.head()

  return read_csv(**locals())


Unnamed: 0,en,ar
0,Having regard to the Treaty establishing the E...,مع الأخذ في الاعتبار المعاهدة التي أنشئت بموجب...
1,Whereas the progressive establishment of the c...,وحيث أنه لا يجب أن تواجه عملية الإنشاء التدريج...
2,"1. Each Member State shall, by the end of 1962...",1. يتعيّن على كلّ دولة عضو بحلول نهاية العام 1...
3,4. The two Annexes to this Directive shall for...,4 يشكّل الملحقان المرفقان بهذا التوجيه جزءاً ل...
4,Article 3,المادة 3


In [7]:
for idx, row in df.iterrows():
    # split sentences
    if '.' in row['en'] and '.' in row['ar'] and len(row['en'].split()) > 100:
        en_sents = row['en'].split('.')
        ar_sents = row['ar'].split('.')
    
        for en_sent, ar_sent in zip(en_sents, ar_sents):
            ar_sent = "[start] " + ar_sent + " [end]"
            text_pairs.append((en_sent, ar))
    else:
        en, ar = row['en'], row['ar']
        ar = "[start] " + ar + " [end]"
        text_pairs.append((en, ar))

In [65]:
random.choice(text_pairs)

('(f) the product or its ingredients have not been subjected to treatments involving the use of ionizing radiation;',
 '[start] (و) لم يخضع المنتج أو مكوناته للعلاجات التي تنطوي على استخدام  الإشعاعات المؤينة. [end]')

In [8]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) -  num_val_samples
train_pairs = text_pairs[: num_train_samples]
val_pairs = text_pairs[num_train_samples: num_train_samples + num_val_samples]

In [9]:
print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

7421 total pairs
6308 training pairs
1113 validation pairs


#### Vectorizing the text data 

In [31]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


# vocab_size = 10000
sequence_length = 50
batch_size = 265

def custom_standardization(input_string):
    return tf.strings.regex_replace(input_string, "[%s]" % re.escape(strip_chars), "")

eng_vectorization = TextVectorization(
            # max_tokens=vocab_size, 
            output_mode='int', 
            output_sequence_length=sequence_length)

ar_vectorization = TextVectorization(
    # max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization)

eng_texts = [pair[0] for pair in text_pairs]
ar_texts = [pair[1] for pair in text_pairs]
eng_vectorization.adapt(eng_texts)
ar_vectorization.adapt(ar_texts)

In [32]:
def format_dataset(eng, ar):
    eng = eng_vectorization(eng)
    ar = ar_vectorization(ar)
    return ({"encoder_inputs": eng, "decoder_inputs": ar[:, :-1],}, ar[:, 1:])


def make_dataset(pairs):
    eng_texts, ar_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    ar_texts = list(ar_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ar_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

In [33]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [34]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (265, 50)
inputs["decoder_inputs"].shape: (265, 50)
targets.shape: (265, 50)


### Building the Model 

In [35]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):

      config = super().get_config().copy()
      config.update({
          'embed_dim': self.embed_dim,
          'dense_dim': self.dense_dim,
          'num_heads': self.num_heads,
      })
      return config

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, pretrained=False, weights=False, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        if not pretrained:
          self.token_embeddings = layers.Embedding(
              input_dim=vocab_size, output_dim=embed_dim
          )
        else:
          # pre-trained
          self.token_embeddings = layers.Embedding(
              input_dim=vocab_size, output_dim=embed_dim, weights=[weights]
          ) 

        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
      
    def get_config(self):

      config = super().get_config().copy()
      config.update({
      'sequence_length': self.sequence_length,
      'vocab_size': self.vocab_size,
      'embed_dim': self.embed_dim,
      })
      return config

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True


    def get_config(self):

      config = super().get_config().copy()
      config.update({
      'embed_dim': self.embed_dim,
      'latent_dim': self.latent_dim,
      'num_heads': self.num_heads,
      })
      return config

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [36]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [37]:
import os
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), "/content/glove.6B.300d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [38]:
vocab = eng_vectorization.get_vocabulary()
word_index = dict(zip(vocab, range(len(vocab))))

In [49]:
num_tokens = len(vocab)
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        embedding_matrix[i] = np.random.uniform(-.1, .1, size=(embedding_dim))
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 6455 words (1522 misses)


In [50]:
num_tokens

7977

In [51]:
len(vocab)

7977

In [52]:
ar_vocab_size = len(ar_vectorization.get_vocabulary())

In [53]:
embed_dim = 300
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, num_tokens, embed_dim, pretrained=True, weights=embedding_matrix)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, ar_vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(ar_vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [54]:
googledrive_path = '/content/drive/MyDrive/Transformers/adam_100words2ds_pre_en/'

In [55]:
from keras import callbacks
early_stopping_cb = callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=True)
checkpoint_cb = callbacks.ModelCheckpoint(googledrive_path+'weights_adam.ckpt', monitor='val_accuracy', save_weights_only=True,verbose=True, save_best_only=True)
tensorboard_callback = callbacks.TensorBoard(log_dir=googledrive_path+"logs")
cbs = [early_stopping_cb, checkpoint_cb, tensorboard_callback]

In [56]:
epochs = 100  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds, callbacks=cbs)

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
positional_embedding_4 (Positio (None, None, 300)    2408100     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
transformer_encoder_2 (Transfor (None, None, 300)    4119848     positional_embedding_4[0][0]     
________________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f3503e7d950>

In [47]:
latest = tf.train.latest_checkpoint(googledrive_path)
transformer.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f35141efb10>

In [60]:
ar_vocab = ar_vectorization.get_vocabulary()
ar_index_lookup = dict(zip(range(len(ar_vocab)), ar_vocab))
max_decoded_sentence_length = sequence_length


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ar_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ar_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


test_eng_texts = [pair[0] for pair in val_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts[:30])
    translated = decode_sequence(input_sentence)
    print(input_sentence, '\n', translated)
    print('*'*50)

 Difficulties may be encountered in ensuring that new types of fuel meet current technical standards, which, to a large extent, have been developed for conventional fossil fuels 
 [start] 3 قد تكون الكميات المخزنة في ذلك بالشكل المناسب من أجل ضمان شروط ظهور المصطلح على الوقود [end]
**************************************************
 Difficulties may be encountered in ensuring that new types of fuel meet current technical standards, which, to a large extent, have been developed for conventional fossil fuels 
 [start] 3 قد تكون الكميات المخزنة في ذلك بالشكل المناسب من أجل ضمان شروط ظهور المصطلح على الوقود [end]
**************************************************
(vi) principles for monitoring and evaluating EURES activities; 
 [start] 6 لتنفيذ المهام المسندة إلى الوكالة بشكل صحيح، [end]
**************************************************
- "type approval" in United Kingdom law. 
 [start] الموافقة على النوع في منطقة الآلية عندما [end]
**************************************************
THIRD

In [89]:
def get_bleu():
  
  preds, src = [], []

  with tqdm(total=100, position=0, leave=True) as pbar:
    for en_sent, ar_sent in tqdm(random.choices(val_pairs, k=100), position=0, leave=True):
      translated = decode_sequence(en_sent)
      preds.append(translated)
      src.append(ar_sent)
      pbar.update()

    return src, preds
    # print_scores(src, preds)



In [90]:
def print_scores(trgs, preds):
    print('----- Bleu-n Scores -----')
    print("1:", corpus_bleu(trgs, preds, weights=[1.0/1.0])*100)
    print("2:", corpus_bleu(trgs, preds, weights=[1.0/2.0, 1.0/2.0])*100)
    print("3:", corpus_bleu(trgs, preds, weights=[1.0/3.0, 1.0/3.0, 1.0/3.0])*100)
    print("4:", corpus_bleu(trgs, preds)*100)
    print('-'*25)

In [91]:
src, preds = get_bleu()

100%|██████████| 100/100 [00:58<00:00,  1.72it/s]
100%|██████████| 100/100 [00:58<00:00,  1.72it/s]


In [92]:
from nltk.translate.bleu_score import corpus_bleu

In [93]:
print_scores(preds, src)

----- Bleu-n Scores -----
1: 20.97472102026873


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


2: 45.79816701601575
3: 59.41535974766278
4: 67.67434300827439
-------------------------


In [88]:
print_scores(preds, src)

----- Bleu-n Scores -----
1: 17.369519832985386


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


2: 41.676755911401486
3: 55.79508424061051
4: 64.55753705912385
-------------------------


In [85]:
preds[1]

'[start] ب النكهة الجيدة، عندما يتم جمع الرسوم عندما تذبح الحيوانات المشار إليها في النقطة أ و [end]'

In [86]:
src[1]

'[start] (ب) تم الامتثال لفترة تحويل من 12 شهرا على الأقل قبل موسم الحصاد. [end]'