# ARABISH - is an Arabic to English machine translation model 

Model based on the architecture of Transformer proposed by the famous paper ``Attention is all you need!``


## Section 0: Depencancies and Constants

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import datasets
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

import os

from typing import List, Dict, Tuple, Any

## Section 1: Data Preprocessing, Tokenization, and Prepartion 

### 1.1 Downloading Dataset from Hugging Face ☺

CCMatrix Dataset is has been extracted from web crawls using the margin-based bitext mining techniques described at https://github.com/facebookresearch/LASER/tree/master/tasks/CCMatrix.
* 90 languages, 1,197 bitexts
* total number of files: 90
* total number of tokens: 112.14G
* total number of sentence fragments: 7.37G

* Languages
Configs are generated for all language pairs in both directions. You can find the valid pairs in Homepage section of Dataset Description: https://opus.nlpl.eu/CCMatrix.php E.g.

```bash
print(next(iter(dataset['train'])))
```

```bash
    {
        "id": 1,
        "score": 1.2498379,
        "translation": 
        {
            "en": "This uncertainty was very difficult for them.”",
            "ar": "كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم.”"
        }
    }
```



In [None]:
ds_raw = load_dataset("yhavinga/ccmatrix", "en-ar")

Downloading data:   0%|          | 0.00/5.01G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
print(next(iter(ds_raw['train']))['translation'])

{'ar': 'وأقسم سبحانه وتعالى على تزكية نفسه صلى الله عليه وسلم وعصمتها من الآثام لمقامه الشريف، فالله سبحانه وتعالى زكى فؤاده ولسانه وجوارحه صلى الله عليه وسلم.', 'en': 'by [the plaintiff] in the [present lawsuit].‘‖ (Id. at p.'}


In [None]:
i = 0
for item in ds_raw['train']:
    print(item['translation']['en'])
    print(item['translation']['ar'])
    print('----------------------------')
    i += 1
    if i > 5: 
        break

by [the plaintiff] in the [present lawsuit].‘‖ (Id. at p.
وأقسم سبحانه وتعالى على تزكية نفسه صلى الله عليه وسلم وعصمتها من الآثام لمقامه الشريف، فالله سبحانه وتعالى زكى فؤاده ولسانه وجوارحه صلى الله عليه وسلم.
----------------------------
* I swear by [this] countryside,
أقسم الله بهذا البلد الحرام ، وهو ( مكة ) .
----------------------------
Here in the earth all nations hate each other, and every one of them hates the Jew.
هنا في الأرض جميع الأمم يكرهون بعضهم بعضا، وكل واحد منهم يكره اليهود.
----------------------------
Whom should I persuade (now again)
من الذي ينبغي أن أقنعه (الآن مرة أخرى)
----------------------------
The left who founded your party once knew this."
اليسار الذي أسس حزبك عرف ذلك مرة”.
----------------------------
This uncertainty was very difficult for them.”
كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم."
----------------------------


In [66]:
os.mkdir('dataset')

### 1.2 Make data tokenization functions ☺

Using `tokenizers` and `datasets` from *Hugging Face*




In [15]:
%%writefile dataset/data_genarator.py

import datasets

def data_genarator(dataset: datasets.dataset_dict.DatasetDict,
                     lang: str):
    """"
    Genrate all sentences in a given dataset. 
    This function pass through the whole dataset rows as a genrator to yield every row in the translation for a spcific language to be processed
    
    Examples
    Here are some examples of the inputs that are accepted::

        genrator_dataset(dataset_raw, 'en')
        genrator_dataset(dataset_raw, 'ar')


    Args
        dataset : :datasets.dataset_dict.DatasetDict
            The Raw Dataset that should be iterated over.
        lang: str
            The Language argument in the dataset fields 
    Returns
        iter(next(dataset['train]))
    """
    for item in dataset:
        yield item['translation'][lang]

Overwriting dataset/data_genarator.py


In [16]:
%%writefile dataset/tokenizer.py

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

def tokenizer() -> Tokenizer:
    """
    Function to create WordLevel Tokenizer. 
    The function has adds 4 special tokens for the dataset:
        1- [UNK]: Unknown token for tokens that are not recognized in the dataset
        2- [PAD]: Padding token to keep the size of sequance constant
        3- [SOS]: Start Of Sentence token to indicate the sentance start
        4- [EOS]: End Of Sentence token to indicaaate the sentence end

    Example:
        toeknizer = tokenizer()
    
    Returns:
        tokenizer: Tokenizer

        A word-level tokenizer tokenizer. 

    """
    
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer =Whitespace()
    return tokenizer

Overwriting dataset/tokenizer.py


In [17]:
%%writefile dataset/train_tokenizer.py
from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer
from data_genarator import data_genarator
import datasets

def train_tokenizer(tokenizer: Tokenizer,
                    dataset: datasets.dataset_dict.DatasetDict,
                    lang: str) -> Tokenizer:
    """
    Function to tokenize a certain dataset. 
    The function creates a WordLevelTrainer and train the tokeizer to the given dataset.

    Args:
        tokenizer: Tokenizer
            the tokenizer that should be trained
        dataset: datasets.dataset_dict.DatasetDict
            The dataset that should be tokenized
        lang: str
            The language of the tokenizer (This variable used only for naming)
    
    Example:
        train_tokenizer(english_tokenizer, dataset_raw, 'en')
    
    Returns:
        tokenizer: Tokenizer

        A tokenizer that already tokenized a certain dataset. 

    """
    
    trainer = WordLevelTrainer(special_tokens=['[UNK]', '[PAD]', '[SOS]', '[EOS]'], min_frequency=2)
    tokenizer.train_from_iterator(data_genarator(dataset,lang), trainer=trainer)

    return tokenizer

Overwriting dataset/train_tokenizer.py


In [18]:
%%writefile dataset/save_tokenizer.py
from tokenizers import Tokenizer
from pathlib import Path
def save_tokenizer(tokenizer: Tokenizer,
                   tokenizer_path: Path) -> None:
    """
    Function to save a tokenizer in a json file.
    The tokenizer be saved in a naming convention (tokenizername_language.json)
    Args:
        tokenizer: Tokenizer
            The tokenizer to be saved 
        tokenizer_path: Path
            the Path that the tokenizer should be saved it.
    
    Example:
        tokenizer('tokenizer', 'en) ----> tokeinzer_en.json
    
    Returns:
        None
    """
    tokenizer.save(str(tokenizer_path))

Overwriting dataset/save_tokenizer.py


In [19]:
%%writefile dataset/make_or_load_tokenizer.py

from tokenizers import Tokenizer
from tokenizer import tokenizer
from train_tokenizer import train_tokenizer
from save_tokenizer import save_tokenizer
from pathlib import Path
import datasets

def make_or_load_tokenizer(tokenizer_name:str, 
                           lang: str,
                           dataset: datasets.dataset_dict.DatasetDict) -> Tokenizer:
    """
    Function to build a WordLevel tokenizer, train it on a given dataset, and save it for later use.
    if it already exits, just load it.

    Args:
        tokenizer_name: str
            the name of the tokenizer file
        lang: str
            The language of the tokenizer (This variable used only for naming)
        dataset: datasets.dataset_dict.DatasetDict
            The dataset that should be tokenized
        
    Example:
        make_or_load_tokenizer('tokenizer', 'en', ds_raw)
    
    Returns:
        tokenizer: Tokenizer

        A tokenizer that already tokenized a certain dataset and saved for later use. 

    """
    tokenizer_path = Path(tokenizer_name.format(lang))
    if not Path.exists(tokenizer_path):
        tokenizer = tokenizer()

        train_tokenizer(tokenizer = tokenizer,
                        dataset=dataset,
                        lang=lang)
        save_tokenizer(tokenizer=tokenizer,tokenizer_path=tokenizer_path)
        
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

Overwriting dataset/make_or_load_tokenizer.py


### 1.3 Makeing dataset loaders 


In [20]:
%%writefile dataset/causal_mask.py

import torch
def causal_mask(size: int) -> torch.Tensor:
    """
    Function to provide a mask that masks or covers the future inputs and keep them away from the attention calculations.
    Args:
        size: int
            the size of the mask 
    Returns: 
        mask: torch.Tensor
            mask of True only at the future input (mask is squared (size*size) and has outer batch dimension)
    """
    mask = torch.triu( torch.ones(size=(1,size,size)), diagonal=1).type(torch.int64)
    return mask == 0

Writing dataset/causal_mask.py


In [21]:
%%writefile dataset/BilingualDataset.py
import torch
from torch.utils.data import Dataset
import datasets
from tokenizers import Tokenizer
from causal_mask import causal_mask
class BilingualDataset(Dataset):
    """
    Class inherit from ```bash torch.utils.data.Dataset``` to create encompase a raw data into a dataset
    valid for use in dataloaders.
    
    The class has a constructor, __len__() method, and __getitem__() method.

    The BilingualDataset class is ment to take tokenized data and store them as dataloaders,
    it adds spical tokens to the raw tokens and keeps each sequance in a fixed constant length.

    """
    def __init__(self,
                 datasaet: datasets.dataset_dict.DatasetDict,
                 src_tokenizer: Tokenizer,
                 trg_tokenizer: Tokenizer, 
                 seq_len: int):
        """
        Constructor for the BilingualDataset class to create dataset instance.
        the constructor saves the attuributes to each given instance and creates some attributes to be used.

        Args:
            dataset: a raw data set of any format like ```bash datasets.dataset_dict.DatasetDict```
                the dataset should be in a format of bilingual data;
                ```bash
                {
                    "id": 1,
                    "score": 1.2498379,
                    "translation": 
                    {
                        "en": "This uncertainty was very difficult for them.”",
                        "ar": "كانت حالة عدم اليقين هذه صعبة للغاية بالنسبة لهم.”"
                    }
                }
                ``` 
            src_tokenizer: Tokenizer
                the tokenizer should be used to tokenize the source language dataset
            trg_tokenizer: Tokenizer
                the tokenizer should be used to tokenize the target language dataset
            seq_len: int
                the maximum sequance length for every input or output.
        Example:
            dataset = BilingualDataset(raw_ds, tokenizer_en, tokenizer_ar, 200)
        
        Returns: None
        """
        super().__init__()

        self.ds = datasaet
        self.src_tokenizer = src_tokenizer
        self.trg_tokenizer = trg_tokenizer
        self.seq_len = seq_len

        self.sos_token = torch.tensor([src_tokenizer.token_to_id('[SOS]')], dtype=torch.int64)
        self.pad_token = torch.tensor([src_tokenizer.token_to_id('[PAD]')], dtype=torch.int64)
        self.eos_token = torch.tensor([src_tokenizer.token_to_id('[EOS]')], dtype=torch.int64)

    def __len__(self):
        """
        Function to calculate the length of the dataset.
        Args: None
        Example:
            BilingualDataset.__len__(ds)
        Returns:
            out: int
                the number of rows in the dataset

        """
        return len(self.ds)
    
    def __getitem__(self,
                    index: int) -> Dict:
        """
        Function to retrive datarow from the dataset and tokenize it.

        Args:
            index: int
                the inxed of the row should be tokenized.
        
        example:
            BilingualDataset.__len__(ds, 5)

        Returns:
            out: Dict
                a dictonary the containts the `encoder input tokens`, `decoder_input_tokens`,
                `ecoder_mask` to mask the padding tokens and keep them away from computations.
                `decoder_mask` to mask the padding tokens and the future tokens form the decoder input,
                `label` the true output of the decoder, `src_text` the actual text without encoding, 
                `trg_text` the actual text after decodeing.

        """
        src_txt, trg_txt = self.ds[index]['translation']
        
        enc_input_tokens = self.src_tokenizer.encode(src_txt).ids
        dec_input_tokens = self.trg_tokenizer.encode(trg_txt).ids
         
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2 # 2 for SOS and EOS
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # 1 for SOS only
        
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')
        
        encoder_input = torch.cat(
            tensors=[
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)
            ],
            dim=0
        )

        decoder_input = torch.cat(
            tensors=[
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
            ],
            dim=0
        )

        label = torch.cat(
            tensors=[
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
            ],
            dim=0
        )

        self.mask = (torch.triu(torch.ones((1, decoder_input.size(0), decoder_input.size(0))), diagonal=1).type(torch.int64)) == 0

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len
       
        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            'label': label,
            'src_text': src_txt,
            'tgt_text': trg_txt
        }

Writing dataset/BilingualDataset.py


In [23]:
%%writefile dataset/max_seq_len.py
import datasets
from tokenizers import Tokenizer
def calculate_max_seq_len(dataset: datasets.dataset_dict.DatasetDict,
                          src_tokenizer: Tokenizer,
                          trg_tokenizer: Tokenizer,
                          src_lang: str,
                          trg_lang: str,
                          offset: int) -> int:
    """
    Function to calculate the maximum allowable sequance length in the transformer acrhitecture.
    it's calculated to be the longest sequance in the dataset + offset

    Args:
        dataset: datasets.dataset_dict.DatasetDict
            the raw dataset to search through 
        src_tokenizer: Tokenizer
            the tokenizer should be used to tokenize the source language dataset
        trg_tokenizer: Tokenizer
            the tokenizer should be used to tokenize the target language dataset 
        src_lang: str
            the name of source language in the dataset
        trg_lang: str
            the name of target language in the dataset
        offset: int
            the number of offest above the max sequance in the dataset should be added to indicate max sequance
    
    Example:
        calculate_max_seq_len(raw_ds, tokenizer_en, tokenizer_ar, 'en', 'ar', 10)

    Returns: 
        out: int
            the maximum allowable sequance length 
    """

    max_len_src = 0
    max_len_tgt = 0
        
    for item in dataset['train']:
        src_tokens = src_tokenizer.encode(item['translation'][src_lang]).ids
        trg_tokens = trg_tokenizer.encode(item['translation'][trg_lang]).ids
        max_len_src = max(max_len_src, len(src_tokens))
        max_len_tgt = max(max_len_tgt, len(trg_tokens))

    print(f'Max Length of source sentence: {max_len_src}\nMax Length of Target sentence: {max_len_tgt}')

    return max(max_len_src,max_len_tgt) + offset

Writing dataset/max_seq_len.py


In [24]:
%%writefile dataset/dataset_loader.py

import torch
from torch.utils.data import DataLoader, Dataset, random_split
from tokenizers import Tokenizer
from datasets import load_dataset
from make_or_load_tokenizer import make_or_load_tokenizer
from BilingualDataset import BilingualDataset
from max_seq_len import calculate_max_seq_len
def dataset_loader(dataset_name: str,
                 conf: Dict) -> Tuple(DataLoader, DataLoader, Tokenizer, Tokenizer):
    """
    Function the loads the raw dataset, split it into train and validation, create tokenizers and tokenize it,
    encopmase the data into PyTorch Dataset and turn it into dataloaders ready for training.

    Args:
        dataset_name: str 
            the name of the Hugging Face dataset should be downloaded and loaded.
        conf: Dict
            configration of the datasets and tokenizers. Example:
            ```bashconf= 
            {
                'src_lang' : 'en',
                'trg_lang` : 'ar',
                'tokenizer_name: 'tokenizer',
                'seq_len' : 200,
                'batch_size': 8
            }```

    Examples:
        tr_dataloader, val_dataloader, src_tokenizer, trg_tokenizer = load_dataset("dataset", config)
    
    Returns:
        out: Tuple(DataLoader, DataLoader, Tokenizer, Tokenizer)
            training dataloader, validation dataloader, source tokenizer, target tokenizer
    """
    dataset_raw = None
    dataset_dir = Path(conf['dataset_dir'])
    if dataset_dir.exists() and dataset_dir.is_dir():
        # Load the dataset from the existing folder
        print("Folder exists. Loading the dataset from the disk...")
        dataset_raw = load_from_disk(str(dataset_dir))
    else:
        print("Folder does not exist. Creating folder and downloading the dataset...")
        dataset_dir.mkdir(parents=True, exist_ok=True)
        dataset_raw = load_dataset(conf['dataset_name'], f"{conf['lang_src']}-{conf['lang_trg']}")
        dataset_raw.save_to_disk(str(dataset_dir))

    tokenizer_src = make_or_load_tokenizer(tokenizer_name=conf['tokenizer_name'], 
                           lang=conf['src_lang'],
                           dataset=dataset_raw)
    tokenizer_trg = make_or_load_tokenizer(tokenizer_name=conf['tokenizer_name'], 
                           lang=conf['trg_lang'],
                           dataset=dataset_raw)
    

    train_ds_size = int(0.9 * len(dataset_raw))
    val_ds_size = len(dataset_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(dataset=dataset_raw, lengths=[train_ds_size, val_ds_size])
    
    conf['seq_len'] = calculate_max_seq_len(dataset=dataset_raw,
                                            src_tokenizer=tokenizer_src,
                                            trg_tokenizer=tokenizer_trg,
                                            src_lang=conf['src_lang'],
                                            trg_lang=conf['trg_lang'],
                                            offset=20)
    
    train_dataset = BilingualDataset(datasaet=train_ds_raw,
                                     src_tokenizer=tokenizer_src,
                                     trg_tokenizer=tokenizer_trg,
                                     seq_len=conf['seq_len'])
    val_dataset = BilingualDataset(datasaet=val_ds_raw,
                                     src_tokenizer=tokenizer_src,
                                     trg_tokenizer=tokenizer_trg,
                                     seq_len=conf['seq_len'])
    
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=conf['batch_size'],
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                  batch_size=1,
                                  shuffle=False)
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_trg
    
    

Writing dataset/dataset_loader.py


## Section 2: Transformer Components Model Building

Building the Transformer model according to the famous paper *Attention is all you need* 

The Transformer Has Four main parts: 

1- Encoders Set
2- Decoders Set
3- Input Netowrk 
4- Output Network

The set of encoders contains N blocks of Encoder, Each has the following:

* Self Attention block followed by a Residual Connection and Layer Normalization 
* Feed Forward block followed by a Residual Connection and Layer Normalization 

The set of Decoders contains N blocks of Decoder, Each has the following:

* Masked Self Attention block followed by a Residual Connection and Layer Normalization 
* Cross Attention block followed by a Residual Connection and Layer Normalization
* Feed Forward block followed by a Residual Connection and Layer Normalization 

The Input Network:

* Tokenized inputs are mapped into an Embedding Matrix
* Positional Encodings are added

The Output Netowrk:

* Linear Layer 
* Softmax Layer 
* Highest probabilites are mapped into the embedding materix to extract the tokens then the text

For further information, I reccomend these resources:

* https://machinelearningmastery.com/the-transformer-model/
* https://medium.com/carbon-consulting transformer-architecture-how-transformer-models-work-46fc70b4ea59
* https://www.youtube.com/watch?v=6JGzwI2pNfo&pp=ygUSdHJhbnNmb3JtZXIgYWJvYmty


 

### 2.1 Building the Input Netowrk

The input network has 2 compenent blocks (classes): 

* InputEmbeddings
* PositionalEncoding



In [25]:
os.mkdir('model')

In [26]:
%%writefile model/InputEmbeddings.py

import math
import torch 
import torch.nn as nn

class InputEmbeddings(nn.Module):
    def __init__(self,
                 d_model: int,
                 vocab_size: int):
        """
        Class to create an embedding matrix of the size 
        of the vocabulary and the dimension vector for each token.
        
        Args:
            d_model: int
                The length of the vector to represnt each token
            vocab_size: int
                the number of tokens to embedded in the matrix

        Example: 
            embed = InputEmbeddings(512, 10000)

        Returns: None
        """
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)
        
    def forward(self,x):
        """
        function to embed each token passed through, it uses torch.nn.Embedding
        Args:
            x: torch.tensor
                the token should be embeded
        Returns:
            the embeding vector 
            torch.tensor
        """
        return self.embedding(x) * math.sqrt(self.d_model)

Writing model/InputEmbeddings.py


In [28]:
%%writefile model/PositionalEncoding.py
import math
import torch 
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self,
                 d_model: int,
                 seq_len: int,
                 dropout: float) -> None:
        """
        Class to create positional encodings and add them to the input sequance embedings.
        The class creates a positional encoding matrix using the *Sinusoidal Positional Embedding Function*
        ```bash
        Embedding[i, 2k] = sin(position / (10000^(2k / d_model)))

        Embedding[i, 2k+1] = cos(position / (10000^(2k / d_model)))
        ```
        then it takes a sequance that have been tokenized and retrived its embedings, and add positional encoding to the sequance (sequance only without padding).

        Args:
            d_model: int
                the length of the embeding vector for each token
            seq_len: int
                the maximum allowable sequance length
            dropout: float
                the dropout precentage to avoid overfitting
        """
        super().__init__()

        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # positional encoding matrix in the shape of (seq_len, d_model)
        pe_matrix = torch.zeros(seq_len, d_model)
        
        # create vector of shape (seq_len,1)
        positions = torch.arange(0,seq_len, dtype=torch.float).unsqueeze_(1)
        denominators = torch.pow(self=10000.0, 
                                 exponent= (2 * torch.arange(0, d_model//2) ) / d_model) # 10000^(2i/d_model), i is the index of embedding
        # apply sin to even pos
        pe[:,0::2] = torch.sin(positions/denominators) # sin(pos/10000^(2i/d_model))
        # apply cos to odd pos
        pe[:,1::2] = torch.cos(positions/denominators) # cos(pos/10000^(2i/d_model))

        # add batch dimenshion  (1,seq_len,d_model) same dimension as the sequance embeddings (to be added over)
        pe = pe.unsqueeze(0)

        self.register_buffer('pe',pe)

    def forward(self,x):
        """
        Function that adds positional encodings to the input embeddings only without the padding.
        """
        x = x + (self.pe[:,:x.shape[1],:]).requires_grad_(False) 
        return self.dropout(x)

Overwriting model/PositionalEncoding.py


### 2.2 Encoder and Decoder Blocks 

The Encoder and Decoder blocks include main components as following:

* MultiHeadAttentionBlock
* ResidualConnection
* LayerNormalization 
* FeedForwardBlock


In [29]:
%%writefile model/MultiHeadAttentionBlock.py
import torch
import torch.nn as nn
import math
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self,
                 d_model: int,
                 h: int,
                 dropout: float) -> None:
        
        """
        Class that creates a Multihead Attention explained in the paper `Attention is all you need`
        
        The math function for it is:
        
        ```bash
        Head(i) = Attention(QW_q, KW_k, VW_v)

        MultiHead(Q,K,V) = Concat(head(1), head(2), ...., head(h))W_o
        ```

        self attention is being computed (i.e., query, key, and value are the same tensor).
        inputs are batched (3D) with batch_first==True

        Args:
            d_model: int 
                the length of the embeding vector for each token
            h: int
                Number of heads 
            dropout: float
                the dropout precentage to avoid overfitting
            
        Returns: 
            out: torch.Tensor
                The Multihead Attention
        """
        
        super().__init__()

        self.d_model = d_model
        self.h = h

        assert d_model % h == 0, 'd_model is not divisble by h'
        self.d_k = d_model // h  # d_k is the length of each head

        self.w_q = nn.Linear(d_model, d_model, bias=False) # Query weight matrix
        self.w_k = nn.Linear(d_model, d_model, bias=False) # Key weigth matrix
        self.w_v = nn.Linear(d_model, d_model, bias=False) # Value weight matrix
        self.w_o = nn.Linear(d_model, d_model, bias=False) # Output weight matrix 

        self.dropout = nn.Dropout(dropout) # Dropout layer to avoid overfitting

    @staticmethod
    def attention(query: torch.Tensor, 
                  key: torch.Tensor, 
                  value: torch.Tensor, 
                  mask: torch.Tensor,
                  dropout: nn.Dropout):
        """
        Function to calculate the attnetion process.

        Args:
            query: torch.Tensor
                The Query embeddings of shape (batch, h, seq_len, d_k)
                Queries are compared against key-value pairs to produce the output. 
                See “Attention Is All You Need” for more details.
            
            key: torch.Tensor
                Key embeddings of shape (batch, h, seq_len, d_k)

            value: torch.Tensor
                Value embeddings of shape  (batch, h, seq_len, d_k)

            mask: torch.Tensor
                 If specified, a mask of shape (batch, h, seq_len) indicating which elements within key to ignore 
                 for the purpose of attention (i.e. treat as “padding”). 
                 Binary masks are supported. For a binary mask, a True value indicates that 
                 the corresponding key value will be ignored for the purpose of attention.

            dropout: nn.Dropout 
                The dropout layer to drop some weights randomly from calculations.

        """

        d_k = query.shape[-1] 

        # Attention_scores shape:  (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)
        
        if mask is not None: 
            attention_scores.masked_fill_(mask==0, -1e9)

        attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len)

        if dropout is not None: 
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self,
                q: torch.Tensor,
                k: torch.Tensor, 
                v: torch.Tensor, 
                mask: torch.Tensor):
        """
        Fucntion calculates the multihead attention.
        """
        # (batch, seq_len, d_model)  same dimension in and out
        query = self.w_q(q) 
        key = self.w_k(k)
        value = self.w_v(v)

        # Devide it into heads where the length of each head is d_k = d_model//h
        # (batch, seq_len, d_model) ---> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0],query.shape[1], self.h, self.d_k).transpose(1,2)
        key = key.view(key.shape[0],key.shape[1], self.h, self.d_k).transpose(1,2)
        value = value.view(value.shape[0],value.shape[1], self.h, self.d_k).transpose(1,2)

        head_attention, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        
        # (batch, h, seq_len, d_k) ---> (batch, seq_len, h, d_k)---> (batch, seq_len, d_model)
        head_attention = head_attention.transpose(1,2).contiguous().view(head_attention.shape[0], -1, self.h * self.d_k)
        
        return self.w_o(head_attention)

Writing model/MultiHeadAttentionBlock.py


In [39]:
%%writefile model/LayerNormalization.py
import torch
import torch.nn as nn

class LayerNormalization(nn.Module):
    """
    Applies Layer Normalization over a mini-batch of inputs.

    This layer implements the operation as described in the paper *Layer Normalization*

    """
    def __init__(self,
                 eps: float = 10**-6):
        
        """
        Applies Layer Normalization over a mini-batch of inputs.

        This layer implements the operation as described in the paper *Layer Normalization*

        Args:
            eps: float
                a value added to the denominator for numerical stability. Default: 1e-6

        Variabels:
            alpha: the learnable weights of the module
            bias: the learnable bias of the module

        """
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1)) # multliplicative 
        self.bias = nn.Parameter(torch.zeros(1)) # addatitive 

    
    def forward(self,x: torch.Tensor):
        """
        Apply Layer Normalization by calculating the mean and STD over the layer, 
        and apply the normalization equation:
        ```bash
        layer_norm = Alpha * X` / (std + bais + eps)
        ``` 
        """
        mean = x.mean(dim= -1, keepdim=True) # keepdim=True -> the mean function cancel dim that's applied to but keepdim doesn't 
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x-mean) / (std + self.eps) + self.bias

Writing model/LayerNormalization.py


In [41]:
%%writefile model/ResidualConnection.py
import torch
import torch.nn as nn
from LayerNormalization import LayerNormalization

class ResidualConnection(nn.Module):
    """
    Class to create a Residual Connection to add the input to previous layers to thier outputs.
    
    The idea of Residual Connection came from ResNets. 

    ResNet networks are characterized by skip connections, or shortcuts to jump over some layers, 
    this trick gives the ability to train really deep networks without caring about 
    The problem of gradient vanishing.
    """
    def __init__(self,
                 dropout: float)->None:
        """
        Class to create a Residual Connection to add the input to previous layers to thier outputs.
    
        The idea of Residual Connection came from ResNets. 

        ResNet networks are characterized by skip connections, or shortcuts to jump over some layers, 
        this trick gives the ability to train really deep networks without caring about 
        The problem of gradient vanishing.

        Args: 
            dropout: float
                the dropout precentage to avoid overfitting
        """
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """
        Function makes skip connection and apply layer normlaization. 
        """
        return x + self.dropout(sublayer(self.norm(x)))

Overwriting model/ResidualConnection.py


In [42]:
%%writefile model/FeedForwardBlock.py
import torch
import torch.nn as nn

class FeedForwardBlock(nn.Module):
    def __init__(self,
                 d_model: int,
                 d_ff: int,
                 dropout: float) -> None:
        """
        Class the creates Feed Forward Netowrk, just simple sturcture of two Linear layers and some dropout.

        Args:
            d_model: int 
                the length of the embeding vector for each token
            dropout: float
                the dropout precentage to avoid overfitting
            d_ff: int
                number of neurons in the first layer.             
        """
        super().__init__()

        self.layer_stack = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=d_ff, out_features=d_model)
        )

    def forward(self,x):
        # (batch, seq_len, d_model) -> (batch, seq_len, dff) --> (batch, seq_len, d_model)
        return self.layer_stack(x)

Writing model/FeedForwardBlock.py


In [43]:
%%writefile model/EncoderBlock.py
import torch
import torch.nn as nn

from ResidualConnection import ResidualConnection
from MultiHeadAttentionBlock import MultiHeadAttentionBlock
from FeedForwardBlock import FeedForwardBlock

class EncoderBlock(nn.Module):
    def __init__(self,
                 self_attention_block: MultiHeadAttentionBlock,
                 feed_forward_block: FeedForwardBlock,
                 dropout: float) -> None:
        """
        Class to define an Encoder Block. 
        The architecture prposed in the paper *Attention is all you need*

        The Encoder block contains a self_attention block, Residual Connection, LayerNormalization, and FeedForward.

        Args:
            self_attention_block: MultiHeadAttentionBlock
                Block that calculates the attention scores of each to token to the other tokens in the sequance.

            feed_forward_block: FeedForwardBlock
                Linear Network

            dropout: float
                the dropout precentage to avoid overfitting
        """

        super().__init__()
        
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)]) # 2 Residual Connections


    def forward(self, x, src_mask):

        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x,x,x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

Writing model/EncoderBlock.py


In [44]:
%%writefile model/Encoder.py
import torch
import torch.nn as nn
from LayerNormalization import LayerNormalization

class Encoder(nn.Module):
    def __init__(self,
                 layers: nn.ModuleList) -> None:
        
        """
        Class that creates a number of Encoder blocks. 
        
        Args:
            layers: nn.ModuleList
                list of encoder blocks
        """
        
        super().__init__()

        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

Writing model/Encoder.py


In [45]:
%%writefile model/DecoderBlock.py
import torch
import torch.nn as nn

from MultiHeadAttentionBlock import MultiHeadAttentionBlock
from FeedForwardBlock import FeedForwardBlock
from ResidualConnection import ResidualConnection

class DecoderBlock(nn.Module):
    def __init__(self,
                 self_attention_block: MultiHeadAttentionBlock,
                 cross_attention_block: MultiHeadAttentionBlock,
                 feed_forward_block: FeedForwardBlock,
                 dropout: float) -> None:
        
        """
        Class to define an Decoder Block. 
        The architecture prposed in the paper *Attention is all you need*

        The Decoder block contains a self_attention block, cross_attention block, 
        Residual Connection, LayerNormalization, and FeedForward.

        Args:
            self_attention_block: MultiHeadAttentionBlock
                Multihead Attetion to calculate the attention in the decoder input sequance.

            cross_attention_block: MultiHeadAttentionBlock
                Multihead Attetion to calculate the attention between the Encoder output and Decoder input

            feed_forward_block: FeedForwardBlock
                Linear Network

            dropout: float
                the dropout precentage to avoid overfitting

        """
        
        super().__init__() 

        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)]) # 3 Residual Connections 
    
    def forward(self, x, econder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x,x,x,tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x,econder_output, econder_output,src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

Writing model/DecoderBlock.py


In [46]:
%%writefile model/Decoder.py
import torch
import torch.nn as nn

from LayerNormalization import LayerNormalization

class Decoder(nn.Module):
    def __init__(self,
                 layers: nn.ModuleList) -> None:
        
        """
        Class that creates a number of Decoder blocks. 
        
        Args:
            layers: nn.ModuleList
                list of decoder blocks
        """
        
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

Writing model/Decoder.py


### 2.3 Building the Output Netowrk

The input network has one compenent block : 

* Projection Layer 

In [47]:
%%writefile model/ProjectionLayer.py
import torch
import torch.nn as nn

class ProjectionLayer(nn.Module):
    def __init__(self,
                 d_model: int,
                 vocab_size: int) -> None:
        """
        Class of projection layer -> a linear layer followed by a softmax to 
        output the probability of each token. 

        Args:
            d_model: int
                The length of the vector to represnt each token
            vocab_size: int
                the number of tokens to embedded in the matrix

        """

        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self,x):
        # (Batch, seq_len, d_model) ---> (batch, seq_len, vocab_size)
        return torch.log_softmax(self.proj(x), dim=-1)

Writing model/ProjectionLayer.py


### 2.4 Building the Transformer 

We build the transformer class by collection all these blocks to work together.

In [48]:
%%writefile model/Transformer.py
import torch
import torch.nn as nn

from Encoder import Encoder
from Decoder import Decoder
from InputEmbeddings import InputEmbeddings
from PositionalEncoding import PositionalEncoding
from ProjectionLayer import ProjectionLayer

class Transformer(nn.Module):
    def __init__(self,
                 encoder: Encoder,
                 decoder: Decoder,
                 src_embed: InputEmbeddings,
                 trg_embed: InputEmbeddings,
                 src_pos: PositionalEncoding,
                 trg_pos: PositionalEncoding,
                 projection_layer: ProjectionLayer):
        
        """
        A transformer model.

        User is able to modify the attributes as needed. 
        The architecture is based on the paper “Attention Is All You Need”. 
        Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin.
        
        Args:
            encoder: Encoder
                Encoder that encodes the input tokens 
            decoder: Decoder
                Decoder to decode the encoder output into tokens 
            src_embed: InputEmbeddings
                The embedding matrix for the source inputs 
            trg_embed: InputEmbeddings
                the embeding matrix for the target inputs 
            src_pos: PositionalEncoding
                the source positional encodings 
            trg_pos: PositionalEncoding
                the target positional encodings 
            projection_layer: ProjectionLayer
                layer to project the decoder output into tokens 
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.src_pos = src_pos
        self.trg_pos = trg_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """
        Function that calculates the input embeddings then encodes them and add positional encoding.
        """
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output, src_mask, trg, trg_mask):
        """
        Function to decode the encoder output
        """
        trg = self.trg_embed(trg)
        trg = self.trg_pos(trg)
        return self.decoder(trg, encoder_output, src_mask, trg_mask)
    
    def project(self, x):
        """
        Function that project the decoder output to tokens 
        """
        return self.projection_layer(x)

Writing model/Transformer.py


### 2.5 Model Builder Function 

In [49]:
%%writefile model/build_transformer.py
import torch
import torch.nn as nn

from InputEmbeddings import InputEmbeddings
from Transformer import Transformer
from PositionalEncoding import PositionalEncoding
from MultiHeadAttentionBlock import MultiHeadAttentionBlock
from DecoderBlock import DecoderBlock
from EncoderBlock import EncoderBlock
from Encoder import Encoder
from Decoder import Decoder
from FeedForwardBlock import FeedForwardBlock
from ProjectionLayer import ProjectionLayer


def build_transformer(src_vocab_size: int,
                      trg_vocab_size: int,
                      src_seq_len: int,
                      trg_seq_len: int,
                      d_model: int = 512,
                      N: int = 6,
                      h: int = 8,
                      dropout: float = 0.1,
                      d_ff: int = 2048) -> Transformer:
    
    """
    Function that build a transformer model. 

    Args:
        src_vocab_size: int
            Size of the source vocab 
        
        trg_vocab_size: int
            Size of the target vocab

        src_seq_len: int
            Maximum sequance length for the source inputs 
        
        trg_seq_len: int
            Maximum sequance length for the target inputs 

        d_model: int = 512
            Size of the embedding vector for each token in the embedding matrix
        
        N: int = 6
            Number of blocks in the Encoder and Decoder 
        
        h: int = 8
            Number of Heads in the Multihead attetion blocks

        dropout: float = 0.1
            Dropout precentage to drop while calculaations randomly to avoid overfitting
                     
        d_ff: int = 2048
            Number of hidden neourns in the projection layer
    
    Example:
        transformer = build_transformer(1000, 1000, 300, 300)

    Returns:
        transformer: Transformer
    """
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    trg_embed = InputEmbeddings(d_model, trg_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    trg_pos = PositionalEncoding(d_model, trg_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the deocder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model,h,dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder 
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(d_model, trg_vocab_size)

    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, trg_embed, src_pos, trg_pos, projection_layer)

    # Intialize the parameters using Xavier intialization 
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer


Writing model/build_transformer.py


## Section 3: Model Training 

In [53]:
os.mkdir('train')

In [55]:
%%writefile train/greedy_decode.py
import torch
import torch.nn as nn

from tokenizers import Tokenizer

from model.Transformer import Transformer
from dataset.causal_mask import causal_mask

def greedy_decode(model: Transformer, 
                  source_tokens: torch.Tensor,
                  source_mask: torch.Tensor,
                  tokenizer_src: Tokenizer,
                  tokenizer_trg: Tokenizer,
                  max_len: int,
                  device: torch.device):

    """
    Function that calculates the output of the transformer in greedy way.
    (output the hieghts probability only)

    Args:
        model: Transformer
            Model that should used for inference 
        
        source_tokens: torch.Tensor
            the input sequance ids 
        
        source_mask: torch.Tensor
            Mask for the input size to avoid calculations for paddings 
        
        tokenizer_src: Tokenizer
            the tokenizer used in the source language
        
        tokenizer_trg: Tokenizer
            the tokenizer used in the target language
        
        max_len: int
            the maximum sequance length allowed

        device: torch.device
            the hardware device that's used in the compuations
        
    Example:
        model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

    Returns:
        out: torch.Tensor
            a sequance of the highest probabities. 
    """
    
    sos_idx = tokenizer_trg.token_to_id('[SOS]') # Start of sentence id (each token has id in the tokenizer)
    eos_idx = tokenizer_trg.token_to_id('[EOS]') # End of sentence id

    # Precompute the encoder output and reuse it for every output predication
    encoder_output = model.encode(source_tokens, source_mask)

    # Initialize the decoder input with the sos token
    decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source_tokens).to(device)

    while True:
        if decoder_input.size(1) == max_len:
            break

        # build mask for target
        decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # calculate output
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # get next token
        prob = model.project(out[:, -1])

        _, next_word = torch.max(prob, dim=1)

        decoder_input = torch.cat(
            tensors=[
                decoder_input, 
                torch.empty(1, 1).type_as(source_tokens).fill_(next_word.item()).to(device)
            ], 
            dim=1
        )

        if next_word == eos_idx:
            break

    return decoder_input.squeeze(0)

Overwriting train/greedy_decode.py


In [5]:
%%writefile train/beam_search_decode.py
import torch
import torch.nn as nn

from tokenizers import Tokenizer

from model.Transformer import Transformer
from dataset.causal_mask import causal_mask

def beam_search_decode(model: Transformer,
                       beam_size: int,
                       source_tokens: torch.Tensor,
                       source_mask: torch.Tensor,
                       tokenizer_src: Tokenizer,
                       tokenizer_trg: Tokenizer,
                       max_len: int,
                       device: torch.device):
    
    """
    Function that calculates the multible candidate output of the transformer to choose from.
    (output top `beam_size` hieghts probabilities)

    Args:
        model: Transformer
            Model that should used for inference 
        
        beam_size: int
            Number to indicate how many candidates to consider     

        source_tokens: torch.Tensor
            the input sequance ids 
        
        source_mask: torch.Tensor
            Mask for the input size to avoid calculations for paddings 
        
        tokenizer_src: Tokenizer
            the tokenizer used in the source language
        
        tokenizer_trg: Tokenizer
            the tokenizer used in the target language
        
        max_len: int
            the maximum sequance length allowed

        device: torch.device
            the hardware device that's used in the compuations
        
    Example:
        model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

    Returns:
        out: List
            List of sequances that are candidates. 
    """
    
    sos_idx = tokenizer_trg.token_to_id('[SOS]')
    eos_idx = tokenizer_trg.token_to_id('[EOS]')

    # Precompute the encoder output and reuse it for every step
    encoder_output = model.encode(source_tokens, source_mask)

    # Initialize the decoder input with the sos token
    decoder_initial_input = torch.empty(1, 1).fill_(sos_idx).type_as(source_tokens).to(device)

    # Create a candidate list
    candidates = [(decoder_initial_input, 1)]

    while True:

        # If a candidate has reached the maximum length, it means we have run the decoding for at least max_len iterations, so stop the search
        if any([cand.size(1) == max_len for cand, _ in candidates]):
            break

        # Create a new list of candidates
        new_candidates = []

        for candidate, score in candidates:

            # Do not expand candidates that have reached the eos token
            if candidate[0][-1].item() == eos_idx:
                continue

            # Build the candidate's mask
            candidate_mask = causal_mask(candidate.size(1)).type_as(source_mask).to(device)
            # calculate output
            out = model.decode(encoder_output, source_mask, candidate, candidate_mask)
            # get next token probabilities
            prob = model.project(out[:, -1])
            # get the top k candidates
            topk_prob, topk_idx = torch.topk(prob, beam_size, dim=1)
            for i in range(beam_size):
                # for each of the top k candidates, get the token and its probability
                token = topk_idx[0][i].unsqueeze(0).unsqueeze(0)
                token_prob = topk_prob[0][i].item()
                # create a new candidate by appending the token to the current candidate
                new_candidate = torch.cat([candidate, token], dim=1)
                # We sum the log probabilities because the probabilities are in log space
                new_candidates.append((new_candidate, score + token_prob))

        # Sort the new candidates by their score
        candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
        # Keep only the top k candidates
        candidates = candidates[:beam_size]

        # If all the candidates have reached the eos token, stop
        if all([cand[0][-1].item() == eos_idx for cand, _ in candidates]):
            break

    # Return the best candidate
    return candidates[0][0].squeeze()

Writing train/beam_search_decode.py


In [6]:
%%writefile train/run_validation.py
import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from torch.utils import tensorboard

from tokenizers import Token

from model.Transformer import Transformer
from train.greedy_decode import greedy_decode
from train.beam_search_decode import beam_search_decode
from evaluate.evaluate_model_outputs import evaluate_model_outputs

def run_validation(model: Transformer,
                   validation_ds: DataLoader,
                   tokenizer_src: Tokenizer,
                   tokenizer_trg: Tokenizer,
                   max_len: int, 
                   device: torch.device,
                   print_msg: function,
                   global_step: int,
                   writer: tensorboard,
                   beam_size: int=1
                   num_examples: int=2):
    
    """
    Function to make predictions on the validation set to test the model performance.
    
    The function also evaluate the preidctions useing *weights&biases* , *torchmetrics*,
    and also *tensorboard*.

    Args:
        model: Transformer
            Model that should used for inference 
        
        validation_ds: Dataloader
            the validation dataloader to be used for validation

        tokenizer_src: Tokenizer
            the tokenizer used in the source language
        
        tokenizer_trg: Tokenizer
            the tokenizer used in the target language
        
        max_len: int
            the maximum sequance length allowed

        device: torch.device
            the hardware device that's used in the compuations

        print_msg: function
            function to create a message to appear the at the TQDM bar while training the model
        
        global_step: int 
            variable used to idicate the state globally and used for resuming training

        writer: tensorboard:
            tensorboard writer used in evaluation

        beam_size: int
            if you want beam search -> Number to indicate how many candidates to consider  
            if you want greedy serach -> beam_size = 1 (default = 1)
    
        num_examples: int 
            Number of samples in the validation data to be tested (default=2)

    Example:
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_trg, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step)

    Returns:
        None
    """
    
    model.eval() # Put the model in the evaluation mode

    count = 0 # counter to break when the num_examples reached
    source_texts = [] # List to store the source input text for each sample used in the validation 
    expected = [] # List to store the true output text for each sample used in the validation 
    predicted = [] # List to store the predicted output text for each sample used in the validation 

    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except:
        # If we can't get the console width, use 80 as default
        console_width = 80

    with torch.no_grad(): # stop calculating the gradients while testing

        for batch in validation_ds: # iterate over each batch in the validation set ot calculate the result
            
            count += 1
            
            encoder_input = batch["encoder_input"].to(device) # (batch, seq_len)
            encoder_mask = batch["encoder_mask"].to(device) # (batch, 1, 1, seq_len)

            # check that the batch size is 1
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            model_out = None

            if beam_size > 1:
                model_out = beam_search_decode(model=model,
                       beam_size=beam_size,
                       source_tokens=encoder_input,
                       source_mask=encoder_mask,
                       tokenizer_src=tokenizer_src,
                       tokenizer_trg=tokenizer_trg,
                       max_len=max_len,
                       device=device)
            else:
                model_out = greedy_decode(model=model, 
                                      source_tokens=encoder_input,
                                      source_mask=encoder_mask,
                                      tokenizer_src=tokenizer_src,
                                      tokenizer_trg=tokenizer_trg,
                                      max_len=max_len,
                                      device=device)

            source_text = batch["src_text"][0]
            target_text = batch["trg_text"][0]
            model_out_text = tokenizer_trg.decode(model_out.detach().cpu().numpy())

            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_out_text)
            
            # Print the source, target and model output
            print_msg('-'*console_width)
            print_msg(f"{f'SOURCE: ':>12}{source_text}")
            print_msg(f"{f'TARGET: ':>12}{target_text}")
            print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

            if count == num_examples:
                print_msg('-'*console_width)
                break

    evaluate_model_outputs(predicted=predicted,
                           expected=expected,
                           global_step=global_step,
                           writer=writer)
    

Overwriting train/run_validation.py


In [57]:
os.mkdir('evaluate')

In [66]:
%%writefile evaluate/evaluate_model_outputs.py
import torchmetrics
import wandb
from torch.utils import tensorboard

from char_error_rate import char_error_rate_wandb, char_error_rate_tb
from word_error_rate import word_error_rate_wandb, word_error_rate_tb
from belu_score import belu_score_wandb, belu_score_tb

def evaluate_model_outputs(predicted: List,
                           expected: List,
                           global_step: int,
                           writer: tensorboard,
                           wandb: bool = False):
    """
    Function that evaluates model outputs through diffetent metrics:
        * Character error rate
        * Word error rate
        * BELU score
    
    the results are shown in 2 formats: 
        * tensorboard
        * weights&biases 
    
    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging

        wandb: bool
            True to log in wegiths and biases
        
        writer: tensorboard
            writer to show results
            
    Example:
        evaluate_model_outputs(pred, expect, 5, writer, True)

    Returns: None
    """
    
    if wandb:
        char_error_rate_wandb(predicted=predicted,
                              expected=expected,
                              global_step=global_step)
        
        word_error_rate_wandb(predicted=predicted,
                              expected=expected,
                              global_step=global_step)
        
        belu_score_wandb(predicted=predicted,
                              expected=expected,
                              global_step=global_step)
    if writer:
        char_error_rate_tb(writer=writer,
                           predicted=predicted,
                           expected=expected,
                           global_step=global_step)
        
        word_error_rate_tb(writer=writer,
                           predicted=predicted,
                           expected=expected,
                           global_step=global_step)
        
        belu_score_tb(writer=writer,
                           predicted=predicted,
                           expected=expected,
                           global_step=global_step)
        
        
        

Overwriting evaluate/evaluate_model_outputs.py


In [62]:
%%writefile evaluate/char_error_rate.py
import torchmetrics
import wandb
from torch.utils import tensorboard

def char_error_rate_wandb(predicted: List,
                          expected: List,
                          global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted cahrecters in the sequance.

    The results are logged in weigths and biases format.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        char_error_rate_wandb(pred, expect, 5)
    
    Returns: None
    """
    # Compute the char error rate 
    metric = torchmetrics.CharErrorRate()
    cer = metric(predicted, expected)
    wandb.log({'validation/cer': cer, 'global_step': global_step})


def char_error_rate_tb(writer: tensorboard,
                       predicted: List,
                       expected: List,
                       global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted cahrecters in the sequance.

    The results are shown in tensorboard.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        char_error_rate_tb(writer, pred, expect, 5)
    
    Returns: None
    """
    # Compute the char error rate 
    metric = torchmetrics.CharErrorRate()
    cer = metric(predicted, expected)
    writer.add_scalar('validation cer', cer, global_step)
    writer.flush()


Overwriting evaluate/char_error_rate.py


In [63]:
%%writefile evaluate/word_error_rate.py
import torchmetrics
import wandb
from torch.utils import tensorboard

def word_error_rate_wandb(predicted: List,
                          expected: List,
                          global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted words in the sequance.

    The results are logged in weigths and biases format.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        word_error_rate_wandb(pred, expect, 5)
    
    Returns: None
    """
    # Compute the word error rate
    metric = torchmetrics.WordErrorRate()
    wer = metric(predicted, expected)
    wandb.log({'validation/wer': wer, 'global_step': global_step})



def word_error_rate_tb(writer: tensorboard,
                       predicted: List,
                       expected: List,
                       global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted words in the sequance.

    The results are shown in tensorboard.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        word_error_rate_tb(writer, pred, expect, 5)
    
    Returns: None
    """
    # Compute the word error rate
    metric = torchmetrics.WordErrorRate()
    wer = metric(predicted, expected)
    writer.add_scalar('validation wer', wer, global_step)
    writer.flush()



Overwriting evaluate/word_error_rate.py


In [64]:
%%writefile evaluate/belu_score.py
import torchmetrics
import wandb
from torch.utils import tensorboard

def belu_score_wandb(predicted: List,
                     expected: List,
                     global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted the BLUE score in the sequance.

     BLEU (Bilingual Evaluation Understudy) is 
     a score used to evaluate the translations performed by a machine translator.

    The results are logged in weigths and biases format.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        belu_score_wandb(pred, expect, 5)
    
    Returns: None
    """
    # Compute the BLEU metric
    metric = torchmetrics.BLEUScore()
    bleu = metric(predicted, expected)
    wandb.log({'validation/BLEU': bleu, 'global_step': global_step})



def belu_score_tb(writer: tensorboard,
                  predicted: List,
                  expected: List,
                  global_step: int):
    """
    Model output evaluation function that calculates 
    the rate of mispredicted the BLUE score in the sequance.

     BLEU (Bilingual Evaluation Understudy) is 
     a score used to evaluate the translations performed by a machine translator.

    The results are shown in tensorboard.

    Args:
        predicted: List
            Model output texts 
        
        expected: List
            True output texts
        
        global_step: int
            step of trainging
    
    Example: 
        belu_score_tb(writer, pred, expect, 5)
    
    Returns: None
    """
    # Compute the BLEU metric
    metric = torchmetrics.BLEUScore()
    bleu = metric(predicted, expected)
    writer.add_scalar('validation BLEU', bleu, global_step)
    writer.flush()

Overwriting evaluate/belu_score.py


In [68]:
%%writefile config.py

from pathlib import Path

def get_config():
    """
    Function that returns a configruation of model training

    Returns: Dict
    """
    return {
        'dataset_name': 'yhavinga/ccmatrix',
        'batch_size': 8,
        'num_epochs': 20,
        'lr': 10**-4,
        'seq_len': 350,
        'd_model': 512,
        'd_ff': 2048,
        'number_of_layers': 6,
        'number_of_heads': 8,
        'lang_src': 'en',
        'lang_tgt': 'ar',
        'model_folder': 'weights',
        'model_basename': 'tmodel_',
        'preload': 'latest',
        'tokenizer_file': 'tokenizer_{0}.json',
        'experiment_name': 'runs/tmodel'
    }

def get_weights_file_path(config, epoch: str):
    """
    Function returns the weights file path 
    """
    model_folder = f"{config['model_folder']}"
    model_filename = f"{config['model_basename']}{epoch}.pth"
    return str(Path('.') / model_folder / model_filename)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    """
    Function returns the latest weights file path 
    """
    model_folder = f"{config['model_folder']}"
    model_filename = f"{config['model_basename']}*"
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

Overwriting config.py


In [70]:
%%writefile train/save_model_state.py

import torch
import torch.nn as nn
from config import get_weights_file_path
from typing import Dict
from model.Transformer import Transformer
def save_model_state(model: Transformer,
                     optimizer: torch.optim.Optimizer,
                     global_step: int,
                     config: Dict,
                     epoch: int) -> None:
    """
    function saves the model state, optimizer state, and the global_step for each epoch

    Args:
        model: Transformer
            model to save its state
        
        optimizer: torch.optim.Optimizer
            optimizer to save its state
        
        global_state: int

        config: Dict
        epoch: int
    """
     # Save the model at the end of every epoch
    model_filename = get_weights_file_path(config, f"{epoch:02d}")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step
    }, model_filename)

Overwriting train/save_model_state.py


In [71]:
%%writefile train/train_model.py
import torch 
import torch.nn as nn
from dataset.dataset_loader import dataset_loader
from model.build_transformer import build_transformer
from train.run_validation import run_validation
from train.save_model_state import save_model_state
from config import get_config, get_weights_file_path, latest_weights_file_path
from torch.utils.tensorboard import SummaryWriter
from typing import Dict
import wandb
from tqdm import tqdm

def train_model(config: Dict,
                writer_: bool,
                wandb_: bool):
    """
    function used to make instance of the model and train it to the given dataset

    Args:
        config: Dict
            A dictonary of all neccsary vlaues 
        writer_: bool 
            True show results on tensorboard
        wandb_: bool
            True log results on weights&biases

    """
   
    # Define the device through device-agnostic code
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    if (device == 'cuda'):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    
    device = torch.device(device)

    # Make sure the weights folder exists
    Path(f"{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    # load the training and validation dataloaders and tokenizers 
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_trg = dataset_loader(conf=config)

    # Instaniate the Transformer model
    model = build_transformer(src_vocab_size=tokenizer_src.get_vocab_size(),
                              trg_vocab_size=tokenizer_trg.get_vocab_size(),
                              src_seq_len=config['seq_len'],
                              trg_seq_len=config['seq_len'],
                              d_model=config['d_model'],
                              N=config['number_of_layers'],
                              h=config['number_of_heads'],
                              dropout=config['dropout'],
                              d_ff=config['d_ff'])
    
    
    # Optimimzer to optimize the weights 
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=config['lr'],
                                 eps=1e-9)

    # Loss function to calculate the loss (ignore the padding token from the loss calculations) (smoothing to add bit of randomness)
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'),
                                  label_smoothing=0.1).to(device)
    
    # If the user specified a model to preload before training, load it
    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = None
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        get_weights_file_path(config, preload)
    
    if model_filename:
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
        del state
    else:
        print('No model to preload, starting from scratch')

    if writer_:

        # Tensorboard writer to show summaries of the training 
        writer = SummaryWriter(config['experiment_name'])

    if wandb_:
        # define our custom x axis metric
        wandb.define_metric("global_step")
        # define which metrics will be plotted against it
        wandb.define_metric("validation/*", step_metric="global_step")
        wandb.define_metric("train/*", step_metric="global_step")

    for epoch in range(initial_epoch, config['num_epochs']):
        # clear the GPU memory 
        torch.cuda.empty_cache()
        # put the model in training mode
        model.train()
        # create tqdm bar indicator
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (batch, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (batch, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (batch, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (batch, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(src=encoder_input,
                                          src_mask=encoder_mask) # (B, seq_len, d_model)
            
            decoder_output = model.decode(encoder_output=encoder_output,
                                          src_mask=encoder_mask,
                                          trg=decoder_input,
                                          trg_mask=decoder_mask) # (B, seq_len, d_model)
            
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(proj_output.view(-1, tokenizer_trg.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            if writer_:
                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()
            
            if wandb_:
                wandb.log({'train/loss': loss.item(), 'global_step': global_step})

            optimizer.zero_grad(set_to_none=True)

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()

            global_step += 1

        # Run validation at the end of every epoch
        run_validation(model=model,
                       validation_ds=val_dataloader,
                       tokenizer_src=tokenizer_src,
                       tokenizer_trg=tokenizer_trg,
                       max_len=config['seq_len'],
                       device=device,
                       print_msg=lambda msg: batch_iterator.write(msg),
                       global_step=global_step,
                       writer=writer,
                       wandb_=wandb_)
        
        # Save the model at the end of every epoch
        save_model_state(model=model,
                         optimizer=optimizer,
                         global_step=global_step,
                         config=config,
                         epoch=epoch)

Writing train/train_model.py


## Section 4: Inference and Searching

In this section we use the pretrained model to make infrerences.

This Section contains 2 main functions:

* Translate
* Inference



In [7]:
%%writefile translate.py
from pathlib import Path
from config import get_config, latest_weights_file_path
from model.build_transformer import build_transformer
from tokenizers import Tokenizer
from datasets import load_dataset
from dataset.BilingualDataset import BilingualDataset
import torch
import sys

def translate(sentence: str):
    """
    Function used to make predictions on custom inputs. 
    The function takes a sentence in English and output Arabic translation of it.

    Args:
        sentence: str
            English sentence to be translated
    
    Example: 
        arabic = translate('I want to go to school')

    Returns: 
        out: str
            Arabic translation
    """
    # Define the device, tokenizers, and model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    config = get_config()

    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
    tokenizer_trg = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_trg']))))
    
    model = build_transformer(src_vocab_size=tokenizer_src.get_vocab_size(),
                              trg_vocab_size=tokenizer_trg.get_vocab_size(),
                              src_seq_len=config["seq_len"],
                              trg_seq_len=config['seq_len'],
                              d_model=config['d_model']).to(device)

    # Load the pretrained weights
    model_filename = latest_weights_file_path(config)
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])

    # if the sentence is a number use it as an index to the test set
    label = ""
    if type(sentence) == int or sentence.isdigit():
        id = int(sentence)
        ds = load_dataset(f"{config['dataset_name']}", f"{config['lang_src']}-{config['lang_trg']}", split='all')
        ds = BilingualDataset(ds, tokenizer_src, tokenizer_trg, config['seq_len'])
        sentence = ds[id]['src_text']
        label = ds[id]["trg_text"]
    seq_len = config['seq_len']

    # translate the sentence
    model.eval()
    with torch.no_grad():
        # Precompute the encoder output and reuse it for every generation step
        source = tokenizer_src.encode(sentence)
        source = torch.cat(
            tensors=[
            torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64), 
            torch.tensor(source.ids, dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
            torch.tensor([tokenizer_src.token_to_id('[PAD]')] * (seq_len - len(source.ids) - 2), dtype=torch.int64)
        ], dim=0).to(device)

        source_mask = (source != tokenizer_src.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
        
        encoder_output = model.encode(source, source_mask)

        # Initialize the decoder input with the sos token
        decoder_input = torch.empty(1, 1).fill_(tokenizer_trg.token_to_id('[SOS]')).type_as(source).to(device)

        # Print the source sentence and target start prompt
        if label != "": print(f"{f'ID: ':>12}{id}") 
        print(f"{f'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{f'TARGET: ':>12}{label}") 
        print(f"{f'PREDICTED: ':>12}", end='')

        # Generate the translation word by word
        while decoder_input.size(1) < seq_len:
            # build mask for target and calculate output
            decoder_mask = torch.triu(torch.ones((1, decoder_input.size(1), decoder_input.size(1))), diagonal=1).type(torch.int).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            # project next token
            prob = model.project(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            decoder_input = torch.cat(
                tensors=[
                    decoder_input,
                    torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
                ], 
                dim=1
            )

            # print the translated word
            print(f"{tokenizer_trg.decode([next_word.item()])}", end=' ')

            # break if we predict the end of sentence token
            if next_word == tokenizer_trg.token_to_id('[EOS]'):
                break

    # convert ids to tokens
    return tokenizer_trg.decode(decoder_input[0].tolist())
    
#read sentence from argument
translate(sys.argv[1] if len(sys.argv) > 1 else "I am not a very good a student.")

Overwriting translate.py


In [1]:
from config import get_config
cfg = get_config()
cfg['batch_size'] = 6
cfg['preload'] = None
cfg['num_epochs'] = 15

from train.train_model import train_model

train_model(config=cfg,
            writer_=True,
            wandb_=False)


Using device: cuda
Device name: NVIDIA GeForce GTX 1650
Device memory: 3.99969482421875 GB
Folder exists. Loading the dataset from the disk...


Loading dataset from disk:   0%|          | 0/30 [00:00<?, ?it/s]



TypeError: calculate_max_seq_len() got an unexpected keyword argument 'lang_src'

In [4]:
from pathlib import Path
import torch
import torch.nn as nn
from config import get_config, latest_weights_file_path
from train.run_validation import run_validation
from dataset.dataset_loader import dataset_loader
from model.build_transformer import build_transformer
from translate import translate

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

config = get_config()

train_dataloader, val_dataloader, tokenizer_src, tokenizer_trg = dataset_loader(conf=config)

model = build_transformer(src_vocab_size=tokenizer_src.get_vocab_size(),
                          trg_vocab_size=tokenizer_trg.get_vocab_size(),
                          src_seq_len=config["seq_len"],
                          trg_seq_len=config["seq_len"],
                          d_model=config['d_model']).to(device)

# Load the pretrained weights
model_filename = latest_weights_file_path(config)
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

run_validation(model, val_dataloader, tokenizer_src, tokenizer_trg, config['seq_len'], device, lambda msg: print(msg), 0, None, num_examples=4)

t = translate("Why do I need to translate this?")

print(t)

Using device: cuda


Exception: The system cannot find the file specified. (os error 2)