# ARABISH - an Arabic-to-English machine translation model

The model is based on the Transformer architecture proposed in the famous paper *Attention Is All You Need*.


## Section 0: Dependencies and Constants

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import datasets
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

import os

from typing import List, Dict, Tuple, Any

## Section 1: Data Preprocessing, Tokenization, and Preparation

### 1.1 Downloading Dataset from Hugging Face ☺

The CCMatrix dataset has been extracted from web crawls using the margin-based bitext mining techniques described at https://github.com/facebookresearch/LASER/tree/master/tasks/CCMatrix.
* 90 languages, 1,197 bitexts
* total number of files: 90
* total number of tokens: 112.14G
* total number of sentence fragments: 7.37G

* Languages
Configs are generated for all language pairs in both directions. You can find the valid pairs in Homepage section of Dataset Description: https://opus.nlpl.eu/CCMatrix.php E.g.

```python
print(next(iter(dataset['train'])))
```

```json
    {
        "id": 1,
        "score": 1.2498379,
        "translation": 
        {
            "en": "This uncertainty was very difficult for them.”",
            "ar": "كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم.”"
        }
    }
```



In [None]:
# Load the "en-ar" (English/Arabic) configuration of the CCMatrix dataset
# from the Hugging Face Hub; the data is downloaded on first use.
ds_raw = load_dataset("yhavinga/ccmatrix", "en-ar")

Downloading data:   0%|          | 0.00/5.01G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Peek at the translation pair of the first example in the training split.
print(next(iter(ds_raw['train']))['translation'])

{'ar': 'وأقسم سبحانه وتعالى على تزكية نفسه صلى الله عليه وسلم وعصمتها من الآثام لمقامه الشريف، فالله سبحانه وتعالى زكى فؤاده ولسانه وجوارحه صلى الله عليه وسلم.', 'en': 'by [the plaintiff] in the [present lawsuit].‘‖ (Id. at p.'}


In [None]:
# Print the first six English/Arabic sentence pairs of the training split,
# each followed by a separator line.
for i, item in enumerate(ds_raw['train'], start=1):
    pair = item['translation']
    print(pair['en'])
    print(pair['ar'])
    print('----------------------------')
    if i > 5:
        break

by [the plaintiff] in the [present lawsuit].‘‖ (Id. at p.
وأقسم سبحانه وتعالى على تزكية نفسه صلى الله عليه وسلم وعصمتها من الآثام لمقامه الشريف، فالله سبحانه وتعالى زكى فؤاده ولسانه وجوارحه صلى الله عليه وسلم.
----------------------------
* I swear by [this] countryside,
أقسم الله بهذا البلد الحرام ، وهو ( مكة ) .
----------------------------
Here in the earth all nations hate each other, and every one of them hates the Jew.
هنا في الأرض جميع الأمم يكرهون بعضهم بعضا، وكل واحد منهم يكره اليهود.
----------------------------
Whom should I persuade (now again)
من الذي ينبغي أن أقنعه (الآن مرة أخرى)
----------------------------
The left who founded your party once knew this."
اليسار الذي أسس حزبك عرف ذلك مرة”.
----------------------------
This uncertainty was very difficult for them.”
كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم."
----------------------------


In [66]:
# Create the package directory that the %%writefile cells below write into.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the directory is already there from a previous run.
os.makedirs('dataset', exist_ok=True)

### 1.2 Make data tokenization functions ☺

Using `tokenizers` and `datasets` from *Hugging Face*




In [15]:
%%writefile dataset/data_genarator.py

import datasets

def data_genarator(dataset: "datasets.dataset_dict.DatasetDict",
                   lang: str):
    """
    Yield every sentence of one language from a translation dataset.

    The function walks the rows lazily, so the whole corpus never has to be
    held in memory; this is what the tokenizer trainer consumes.

    Examples:
        data_genarator(dataset_raw, 'en')
        data_genarator(dataset_raw['train'], 'ar')

    Args:
        dataset: datasets.dataset_dict.DatasetDict
            Either a mapping of splits (the ``'train'`` split is used) or a
            single split / any iterable of rows. Every row must look like
            ``{'translation': {<lang>: <sentence>, ...}}``.
        lang: str
            The language key to read from each row's ``translation`` field.

    Yields:
        str: the sentence stored under ``lang`` for each row, in dataset order.

    Raises:
        KeyError: if a mapping without a ``'train'`` split is given, or a row
            has no entry for ``lang``.
    """
    # Iterating a dict-like DatasetDict yields split names (strings), not rows,
    # so a mapping has to be narrowed to its training split first.
    rows = dataset['train'] if isinstance(dataset, dict) else dataset
    for item in rows:
        yield item['translation'][lang]

Overwriting dataset/data_genarator.py


In [16]:
%%writefile dataset/tokenizer.py

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

def tokenizer() -> Tokenizer:
    """
    Build an untrained word-level tokenizer.

    The tokenizer wraps a ``WordLevel`` model whose unknown-token is
    ``'[UNK]'`` and uses the ``Whitespace`` pre-tokenizer. No vocabulary is
    learned and no special tokens are registered here; both come from the
    training step.

    Example:
        new_tokenizer = tokenizer()

    Returns:
        tokenizer: Tokenizer
            A fresh, untrained word-level tokenizer.
    """
    word_level_model = WordLevel(unk_token='[UNK]')
    untrained = Tokenizer(word_level_model)
    untrained.pre_tokenizer = Whitespace()
    return untrained

Overwriting dataset/tokenizer.py


In [17]:
%%writefile dataset/train_tokenizer.py
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer
from data_genarator import data_genarator
import datasets

def train_tokenizer(tokenizer: Tokenizer,
                    dataset: datasets.dataset_dict.DatasetDict,
                    lang: str) -> Tokenizer:
    """
    Train a tokenizer on one language of a translation dataset.

    A ``WordLevelTrainer`` is created with the special tokens ``[UNK]``,
    ``[PAD]``, ``[SOS]`` and ``[EOS]`` and ``min_frequency=2``; the tokenizer
    is then trained in place on the sentences produced by ``data_genarator``.

    Args:
        tokenizer: Tokenizer
            The tokenizer to train (modified in place).
        dataset: datasets.dataset_dict.DatasetDict
            The dataset whose sentences are fed to the trainer.
        lang: str
            The language key forwarded to ``data_genarator`` to pick which
            side of each translation pair is used.

    Example:
        train_tokenizer(english_tokenizer, dataset_raw, 'en')

    Returns:
        tokenizer: Tokenizer
            The same tokenizer object, now trained.
    """
    special_tokens = ['[UNK]', '[PAD]', '[SOS]', '[EOS]']
    word_level_trainer = WordLevelTrainer(special_tokens=special_tokens, min_frequency=2)

    sentences = data_genarator(dataset, lang)
    tokenizer.train_from_iterator(sentences, trainer=word_level_trainer)
    return tokenizer

Overwriting dataset/train_tokenizer.py


In [18]:
%%writefile dataset/save_tokenizer.py
from tokenizers import Tokenizer
from pathlib import Path
def save_tokenizer(tokenizer: Tokenizer,
                   tokenizer_path: Path) -> None:
    """
    Write a tokenizer to disk.

    The path is converted to a string and handed to ``tokenizer.save``.

    Args:
        tokenizer: Tokenizer
            The tokenizer to be saved.
        tokenizer_path: Path
            The file the tokenizer should be written to.

    Example:
        save_tokenizer(english_tokenizer, Path('tokenizer_en.json'))

    Returns:
        None
    """
    destination = str(tokenizer_path)
    tokenizer.save(destination)

Overwriting dataset/save_tokenizer.py


In [19]:
%%writefile dataset/make_or_load_tokenizer.py

from tokenizers import Tokenizer
from tokenizer import tokenizer
from train_tokenizer import train_tokenizer
from save_tokenizer import save_tokenizer
from pathlib import Path
import datasets

def make_or_load_tokenizer(tokenizer_name:str, 
                           lang: str,
                           dataset: datasets.dataset_dict.DatasetDict) -> Tokenizer:
    """
    Return the tokenizer for one language, building it only when needed.

    If the tokenizer file already exists it is loaded from disk. Otherwise a
    new word-level tokenizer is created, trained on ``dataset`` and saved to
    that file for later use.

    Args:
        tokenizer_name: str
            File-name pattern of the tokenizer. A ``{}``/``{0}`` placeholder
            is filled with ``lang`` (e.g. ``'tokenizer_{0}.json'``). A name
            without a placeholder gets ``_<lang>`` appended to its stem and a
            ``.json`` suffix if it has none (``'tokenizer'`` ->
            ``'tokenizer_en.json'``), so that two languages never share a file.
        lang: str
            The language key; selects the sentences used for training and
            makes the file name language-specific.
        dataset: datasets.dataset_dict.DatasetDict
            The dataset the tokenizer is trained on when no saved file exists.

    Example:
        make_or_load_tokenizer('tokenizer_{0}.json', 'en', ds_raw)

    Returns:
        tokenizer: Tokenizer
            The loaded or freshly trained (and saved) tokenizer.
    """
    file_name = tokenizer_name.format(lang)
    if file_name == tokenizer_name:
        # No placeholder was substituted: without a language suffix the source
        # and target tokenizers would map to the same file and the second one
        # would silently load the first one's vocabulary.
        plain = Path(tokenizer_name)
        file_name = str(plain.with_name(f"{plain.stem}_{lang}{plain.suffix or '.json'}"))
    tokenizer_path = Path(file_name)

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # The local must not be called `tokenizer`: assigning that name inside the
    # function would shadow the imported factory and make the call below raise
    # UnboundLocalError.
    new_tokenizer = tokenizer()
    train_tokenizer(tokenizer=new_tokenizer,
                    dataset=dataset,
                    lang=lang)
    save_tokenizer(tokenizer=new_tokenizer, tokenizer_path=tokenizer_path)
    return new_tokenizer

Overwriting dataset/make_or_load_tokenizer.py


### 1.3 Making dataset loaders 


In [20]:
%%writefile dataset/causal_mask.py

import torch
def causal_mask(size: int) -> torch.Tensor:
    """
    Build a causal (look-behind only) attention mask.

    Args:
        size: int
            the sequence length; the mask is square (size x size)

    Returns:
        mask: torch.Tensor
            boolean tensor of shape (1, size, size). Entry [0, i, j] is True
            when j <= i (current or earlier position, allowed) and False when
            j > i (future position, to be hidden).
    """
    all_ones = torch.ones(size=(1, size, size))
    # Ones strictly above the diagonal mark the future positions.
    future_positions = torch.triu(all_ones, diagonal=1).type(torch.int64)
    return future_positions.bool().logical_not()

Writing dataset/causal_mask.py


In [21]:
%%writefile dataset/BilingualDataset.py
import torch
from torch.utils.data import Dataset
import datasets
from tokenizers import Tokenizer
from causal_mask import causal_mask
class BilingualDataset(Dataset):
    """
    ``torch.utils.data.Dataset`` wrapper around a raw translation dataset.

    Each item is tokenized on access, framed with the special tokens
    ([SOS]/[EOS]), padded with [PAD] to a fixed length ``seq_len`` and
    returned together with the attention masks needed by the encoder and the
    decoder.
    """
    def __init__(self,
                 datasaet: datasets.dataset_dict.DatasetDict,
                 src_tokenizer: Tokenizer,
                 trg_tokenizer: Tokenizer, 
                 seq_len: int,
                 src_lang: "str | None" = None,
                 trg_lang: "str | None" = None):
        """
        Store the raw data, the tokenizers and the special-token tensors.

        Args:
            datasaet: an indexable collection of rows (the parameter name keeps
                its historical spelling because callers pass it by keyword).
                Every row must look like:
                {
                    "id": 1,
                    "score": 1.2498379,
                    "translation":
                    {
                        "en": "This uncertainty was very difficult for them.”",
                        "ar": "كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم.”"
                    }
                }
            src_tokenizer: Tokenizer
                tokenizer used for the source-language sentences
            trg_tokenizer: Tokenizer
                tokenizer used for the target-language sentences
            seq_len: int
                fixed length every encoder/decoder sequence is padded to
            src_lang: str, optional
                key of the source sentence inside ``translation``
            trg_lang: str, optional
                key of the target sentence inside ``translation``.
                When both language keys are omitted, the first value stored in
                ``translation`` is taken as the source and the second as the
                target.

        Example:
            dataset = BilingualDataset(raw_ds, tokenizer_en, tokenizer_ar, 200, 'en', 'ar')

        Raises:
            ValueError: if only one of ``src_lang`` / ``trg_lang`` is given.
        """
        super().__init__()

        if (src_lang is None) != (trg_lang is None):
            raise ValueError('src_lang and trg_lang must be given together')

        self.ds = datasaet
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
        self.seq_len = seq_len
        self.src_lang = src_lang
        self.trg_lang = trg_lang

        # The special-token ids are read from the source tokenizer and reused
        # for the decoder side as well.
        # NOTE(review): this assumes both tokenizers assign identical ids to
        # [SOS]/[PAD]/[EOS] - confirm they are trained with the same
        # special-token list.
        self.sos_token = torch.tensor([src_tokenizer.token_to_id('[SOS]')], dtype=torch.int64)
        self.pad_token = torch.tensor([src_tokenizer.token_to_id('[PAD]')], dtype=torch.int64)
        self.eos_token = torch.tensor([src_tokenizer.token_to_id('[EOS]')], dtype=torch.int64)

    def __len__(self):
        """
        Return the number of rows in the wrapped dataset.

        Example:
            len(dataset)

        Returns:
            out: int
                the number of rows in the dataset
        """
        return len(self.ds)
    
    def __getitem__(self,
                    index: int) -> dict:
        """
        Tokenize one row and return the tensors needed for training.

        Args:
            index: int
                the index of the row to fetch.

        Example:
            sample = dataset[5]

        Returns:
            out: dict with the keys
                `encoder_input`: [SOS] + source tokens + [EOS] + padding, length seq_len
                `decoder_input`: [SOS] + target tokens + padding, length seq_len
                `encoder_mask`: int tensor (1, 1, seq_len), non-zero at non-padding positions
                `decoder_mask`: int tensor (1, seq_len, seq_len), non-zero where the
                    position is neither padding nor in the future
                `label`: target tokens + [EOS] + padding, length seq_len
                `src_text`: the raw source sentence
                `tgt_text`: the raw target sentence

        Raises:
            ValueError: if either sentence does not fit into seq_len once the
                special tokens are added.
        """
        pair = self.ds[index]['translation']
        if self.src_lang is None:
            # Unpacking the dict itself would yield its keys (the language
            # codes); the sentences are its values.
            src_txt, trg_txt = pair.values()
        else:
            src_txt, trg_txt = pair[self.src_lang], pair[self.trg_lang]
        
        enc_input_tokens = self.src_tokenizer.encode(src_txt).ids
        dec_input_tokens = self.trg_tokenizer.encode(trg_txt).ids
         
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2 # 2 for SOS and EOS
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # 1 for SOS only
        
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # pad_token has shape (1,), so repeat(n) gives n padding ids (n may be 0).
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)

        encoder_input = torch.cat([self.sos_token, enc_tokens, self.eos_token, enc_padding], dim=0)
        # The decoder input starts with [SOS]; the label is the same sequence
        # shifted left by one and closed with [EOS].
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding], dim=0)
        label = torch.cat([dec_tokens, self.eos_token, dec_padding], dim=0)

        # Internal sanity checks on the padding arithmetic above.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len
       
        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            'label': label,
            'src_text': src_txt,
            'tgt_text': trg_txt
        }

Writing dataset/BilingualDataset.py


In [23]:
%%writefile dataset/max_seq_len.py
import datasets
from tokenizers import Tokenizer
def calculate_max_seq_len(dataset: datasets.dataset_dict.DatasetDict,
                          src_tokenizer: Tokenizer,
                          trg_tokenizer: Tokenizer,
                          src_lang: str,
                          trg_lang: str,
                          offset: int) -> int:
    """
    Compute the sequence length to use for the Transformer.

    Every row of the ``'train'`` split is tokenized in both languages; the
    result is the longest token count found on either side plus ``offset``.
    The two maxima are also printed.

    Args:
        dataset: datasets.dataset_dict.DatasetDict
            the raw dataset; its ``'train'`` split is scanned
        src_tokenizer: Tokenizer
            tokenizer applied to the source-language sentences
        trg_tokenizer: Tokenizer
            tokenizer applied to the target-language sentences
        src_lang: str
            key of the source language inside ``translation``
        trg_lang: str
            key of the target language inside ``translation``
        offset: int
            head-room added on top of the longest sequence

    Example:
        calculate_max_seq_len(raw_ds, tokenizer_en, tokenizer_ar, 'en', 'ar', 10)

    Returns:
        out: int
            longest token count over both languages, plus ``offset``
    """
    longest_src = 0
    longest_trg = 0

    for row in dataset['train']:
        pair = row['translation']
        src_count = len(src_tokenizer.encode(pair[src_lang]).ids)
        trg_count = len(trg_tokenizer.encode(pair[trg_lang]).ids)
        if src_count > longest_src:
            longest_src = src_count
        if trg_count > longest_trg:
            longest_trg = trg_count

    print(f'Max Length of source sentence: {longest_src}\nMax Length of Target sentence: {longest_trg}')

    longest_overall = longest_src if longest_src >= longest_trg else longest_trg
    return longest_overall + offset

Writing dataset/max_seq_len.py


In [24]:
%%writefile dataset/dataset_loader.py

import torch
from torch.utils.data import DataLoader, Dataset, random_split
from tokenizers import Tokenizer
from datasets import load_dataset
from make_or_load_tokenizer import make_or_load_tokenizer
from BilingualDataset import BilingualDataset
from max_seq_len import calculate_max_seq_len
def dataset_loader(dataset_name: str,
                 conf: dict) -> "tuple[DataLoader, DataLoader, Tokenizer, Tokenizer]":
    """
    Load a translation dataset and turn it into training/validation loaders.

    Steps: load the raw dataset, build or load one tokenizer per language,
    split the training rows 90% / 10% into train and validation, compute the
    sequence length, wrap both parts in ``BilingualDataset`` and create the
    dataloaders.

    Args:
        dataset_name: str
            the name of the Hugging Face dataset to download and load.
        conf: dict
            configuration of the datasets and tokenizers. Example:
            conf = {
                'src_lang': 'en',
                'trg_lang': 'ar',
                'tokenizer_name': 'tokenizer_{0}.json',
                'seq_len': 200,
                'batch_size': 8
            }
            ``conf['seq_len']`` is overwritten in place with the value
            computed from the data (longest sequence + 20).

    Examples:
        tr_dataloader, val_dataloader, src_tokenizer, trg_tokenizer = dataset_loader("dataset", config)

    Returns:
        out: tuple of (DataLoader, DataLoader, Tokenizer, Tokenizer)
            training dataloader, validation dataloader, source tokenizer, target tokenizer
    """
    dataset_raw = load_dataset(dataset_name, f"{conf['src_lang']}-{conf['trg_lang']}")
    # load_dataset returns a mapping of splits; the rows live in the 'train'
    # split. len() of the mapping itself would be the number of splits.
    train_split = dataset_raw['train']

    tokenizer_src = make_or_load_tokenizer(tokenizer_name=conf['tokenizer_name'], 
                           lang=conf['src_lang'],
                           dataset=train_split)
    tokenizer_trg = make_or_load_tokenizer(tokenizer_name=conf['tokenizer_name'], 
                           lang=conf['trg_lang'],
                           dataset=train_split)

    train_ds_size = int(0.9 * len(train_split))
    val_ds_size = len(train_split) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(dataset=train_split, lengths=[train_ds_size, val_ds_size])
    
    # calculate_max_seq_len indexes the 'train' split itself, so it receives
    # the full mapping.
    conf['seq_len'] = calculate_max_seq_len(dataset=dataset_raw,
                                            src_tokenizer=tokenizer_src,
                                            trg_tokenizer=tokenizer_trg,
                                            src_lang=conf['src_lang'],
                                            trg_lang=conf['trg_lang'],
                                            offset=20)
    
    train_dataset = BilingualDataset(datasaet=train_ds_raw,
                                     src_tokenizer=tokenizer_src,
                                     trg_tokenizer=tokenizer_trg,
                                     seq_len=conf['seq_len'])
    val_dataset = BilingualDataset(datasaet=val_ds_raw,
                                     src_tokenizer=tokenizer_src,
                                     trg_tokenizer=tokenizer_trg,
                                     seq_len=conf['seq_len'])
    
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=conf['batch_size'],
                                  shuffle=True)
    # Validation runs one sentence at a time and in a fixed order.
    val_dataloader = DataLoader(dataset=val_dataset,
                                  batch_size=1,
                                  shuffle=False)
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_trg
    
    

Writing dataset/dataset_loader.py


## Section 2: Transformer Components Model Building

Building the Transformer model according to the famous paper *Attention is all you need* 