<h1>Data Ingestion</h1>
<p> I will be using the OPUS Books dataset, specifically its English-to-French portion. </p>

In [1]:
from datasets import load_dataset
### Using the Opus Books Dataset from Huggingface
def data_ingestion():
    """Load the English-French OPUS Books corpus and split it three ways.

    The dataset's ``train`` split is first divided 80/20 into a working set
    and a test set; the working set is then divided 80/20 again into train
    and validation. Both splits are made with ``seed=42``.

    Returns:
        tuple: ``(train_data, validation_data, test_data)``.
    """
    corpus = load_dataset(path="Helsinki-NLP/opus_books", name="en-fr")["train"]

    # First cut: hold out the test set.
    outer_split = corpus.train_test_split(test_size=0.2, seed=42)
    # Second cut: carve the validation set out of what remains.
    inner_split = outer_split["train"].train_test_split(test_size=0.2, seed=42)

    return inner_split["train"], inner_split["test"], outer_split["test"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Materialise the three splits; later cells read train_data, validation_data and test_data.
train_data,validation_data,test_data=data_ingestion()

<h5> Language-wise sentence generator function </h5>

In [3]:
def get_all_sentences(ds,lang):
    """Lazily yield the ``lang`` text of every record in ``ds``.

    Args:
        ds: Iterable of records, each exposing ``record['translation'][lang]``.
        lang: Key selecting the language inside the ``translation`` mapping.

    Yields:
        The value stored under ``record['translation'][lang]``, in iteration order.
    """
    yield from (record['translation'][lang] for record in ds)

### Build tokenizer

In [None]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
def build_tokenizer(config,ds,lang):
    """Return a word-level tokenizer for ``lang``, training it only if needed.

    The file location is ``config['tokenizer_file']`` with ``lang`` substituted
    through ``str.format``. When that file already exists it is loaded as-is;
    otherwise a new tokenizer is trained on the ``lang`` sentences of ``ds``
    and written to that location.

    Args:
        config: Mapping holding the ``'tokenizer_file'`` path template.
        ds: Dataset whose records expose ``record['translation'][lang]``.
        lang: Language key, also used to fill the path template.

    Returns:
        The loaded or freshly trained ``Tokenizer``.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: reuse a tokenizer saved by an earlier run.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Slow path: whitespace-split word-level vocabulary with four reserved tokens.
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]","[PAD]","[SOS]","[EOS]"],
        min_frequency=1,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [6]:
## The tokenizers are fitted on the training split only.
## Two separate tokenizers are built: one for English and one for French.
## Note: these file names contain no "{}" placeholder, so the .format(lang) call
## inside build_tokenizer leaves them unchanged; an existing file is loaded instead of retrained.
tokenizer_en=build_tokenizer({'tokenizer_file':'tokenizer_en.json'},train_data,'en')
tokenizer_fr=build_tokenizer({'tokenizer_file':'tokenizer_fr.json'},train_data,'fr')

<p> The maximum sequence length is needed when creating the positional embedding layer. If an input longer than the maximum sequence length arrives during inference, we have to truncate it to the maximum sequence length. </p>

In [7]:
# A custom function to get the max seq len that is possible in the dataset.
def get_max_seq_len(train_data,test_data,validation_data):
    """Return the largest token count of any sentence across the three splits.

    Every record's English text is encoded with the module-level
    ``tokenizer_en`` and its French text with ``tokenizer_fr``; the result is
    the maximum length of the resulting id lists. Only the ids produced by
    ``encode`` are counted, so any special tokens a caller adds afterwards
    are not included.

    Args:
        train_data, test_data, validation_data: Iterables of records exposing
            ``record['translation']['en']`` and ``record['translation']['fr']``.

    Returns:
        int: The maximum length found, or 0 when all splits are empty.
    """
    longest = 0
    for split in (train_data, test_data, validation_data):
        for record in split:
            pair = record['translation']
            en_len = len(tokenizer_en.encode(pair['en']).ids)
            fr_len = len(tokenizer_fr.encode(pair['fr']).ids)
            longest = max(longest, en_len, fr_len)
    return longest

### Causal Mask Generator Utility Function

In [8]:
import torch
def casual_mask_generator(size):
    """Build a boolean causal (no look-ahead) mask.

    Args:
        size: Side length of the square mask.

    Returns:
        A ``(1, size, size)`` bool tensor whose entry ``[0, i, j]`` is True
        exactly when ``j <= i`` (the diagonal and everything below it).
    """
    lower_triangle = torch.tril(torch.ones(1, size, size))
    return lower_triangle != 0

### Creating the dataset class

In [14]:
import torch
from torch.utils.data import Dataset
class opusDataset_En_to_Fr(Dataset):
    """Map-style dataset turning English/French sentence pairs into id tensors.

    Each item holds three 1-D ``int64`` tensors plus the raw sentences:

    * ``encoder_input``: English ids followed by the English ``[EOS]`` id.
    * ``decoder_input``: the French ``[SOS]`` id followed by French ids.
    * ``target_output``: French ids followed by the French ``[EOS]`` id.

    No padding or truncation is applied here; sequences keep their natural length.

    Args:
        data: Indexable collection of records exposing
            ``record['translation']['en']`` and ``record['translation']['fr']``.
        tokenizer_en: Tokenizer providing ``encode(text).ids`` and ``token_to_id``
            for the English side.
        tokenizer_fr: The same interface for the French side.

    Raises:
        ValueError: If either tokenizer lacks the ``[SOS]`` or ``[EOS]`` token.
    """

    def __init__(self,data,tokenizer_en,tokenizer_fr):
        super().__init__()
        self.raw_data=data
        self.tokenizer_en=tokenizer_en
        self.tokenizer_fr=tokenizer_fr

        # Source-side markers, kept under their original attribute names.
        self.sos_token=self._special_token(tokenizer_en,"[SOS]")  ### start of sequence token
        self.eos_token=self._special_token(tokenizer_en,"[EOS]")  ### End of sequence token

        # Target-side markers are looked up in the French tokenizer so the French
        # sequences stay valid even if the two vocabularies assign different ids.
        self.tgt_sos_token=self._special_token(tokenizer_fr,"[SOS]")
        self.tgt_eos_token=self._special_token(tokenizer_fr,"[EOS]")

    @staticmethod
    def _special_token(tokenizer,token):
        """Return ``token``'s id as a one-element int64 tensor, or raise if absent."""
        token_id=tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(f"special token {token!r} is missing from the tokenizer vocabulary")
        return torch.tensor([token_id],dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the underlying data."""
        return len(self.raw_data)

    def __getitem__(self, index):
        """Encode the pair at ``index`` and return the model-ready dictionary."""
        # Fetch the record once instead of indexing the dataset per language.
        translation=self.raw_data[index]['translation']
        data_en=translation['en']
        data_fr=translation['fr']

        encoded_data_en=torch.tensor(self.tokenizer_en.encode(data_en).ids,dtype=torch.int64)
        encoded_data_fr=torch.tensor(self.tokenizer_fr.encode(data_fr).ids,dtype=torch.int64)

        # Encoder sees the source sentence terminated by EOS.
        final_encoded_en=torch.cat([encoded_data_en,self.eos_token])
        # Decoder input and target are the same French ids shifted by one position:
        # the input starts with SOS, the target ends with EOS.
        final_encoded_fr=torch.cat([self.tgt_sos_token,encoded_data_fr])
        target_encoded_fr=torch.cat([encoded_data_fr,self.tgt_eos_token])

        return {
            'encoder_input':final_encoded_en,
            'decoder_input':final_encoded_fr,
            'target_output':target_encoded_fr,
            'src_sentence':data_en,
            'tgt_sentence':data_fr,
        }

In [15]:
# Wrap each split in the Dataset class; all three share the same pair of tokenizers.
train_dataset=opusDataset_En_to_Fr(train_data,tokenizer_en,tokenizer_fr)
test_dataset=opusDataset_En_to_Fr(test_data,tokenizer_en,tokenizer_fr)
validation_dataset=opusDataset_En_to_Fr(validation_data,tokenizer_en,tokenizer_fr)

### Creating the Dataloader class
<p> First, I will create a custom collate function for the loader that pads the sequences in a batch to the same length. </p>

In [None]:
def _pad_to(sequence, length, pad_id):
    """Right-pad a 1-D id tensor with ``pad_id`` until it has ``length`` elements."""
    missing = length - len(sequence)
    return torch.cat([sequence, torch.full((missing,), pad_id, dtype=sequence.dtype)])


def custom_collate(input_list):
    """Collate dataset items into one batch, padding to the longest sequence.

    Encoder inputs, decoder inputs and target outputs are each right-padded
    with the ``[PAD]`` id of the matching module-level tokenizer
    (``tokenizer_en`` for the encoder side, ``tokenizer_fr`` for the decoder
    side) up to the longest sequence of that kind in the batch.

    Args:
        input_list: List of dicts with the keys ``'encoder_input'``,
            ``'decoder_input'``, ``'target_output'`` (assumed to be 1-D id
            tensors), ``'src_sentence'`` and ``'tgt_sentence'``.

    Returns:
        dict with:
            * ``'encoder_input'``, ``'decoder_input'``, ``'target_output'``:
              stacked ``(batch, max_len)`` tensors.
            * ``'encoder_mask'``: ``(batch, 1, 1, max_len)`` int tensor, 1 on
              non-padding positions.
            * ``'decoder_mask'``: ``(batch, 1, max_len, max_len)`` int tensor,
              1 where the key position is non-padding and not in the future.
            * ``'src_sentence'``, ``'tgt_sentence'``: lists of the raw strings.

    Raises:
        ValueError: If either tokenizer has no ``[PAD]`` token.
    """
    # The trainer registers the token with brackets; 'PAD' without them is not it.
    pad_id_en = tokenizer_en.token_to_id('[PAD]')
    pad_id_fr = tokenizer_fr.token_to_id('[PAD]')
    if pad_id_en is None or pad_id_fr is None:
        raise ValueError("'[PAD]' is missing from a tokenizer vocabulary")

    max_length_in_en_batch = max((len(item['encoder_input']) for item in input_list), default=0)
    max_length_in_fr_batch = max((len(item['decoder_input']) for item in input_list), default=0)
    max_length_in_tfr_batch = max((len(item['target_output']) for item in input_list), default=0)

    encoder_inputs = []
    decoder_inputs = []
    target_outputs = []
    encoder_masks = []
    decoder_masks = []
    src_sentences = []
    tgt_sentences = []

    for item in input_list:
        # Multiplying a one-element tensor by n only scales its value, so the
        # padding is built explicitly with the required number of elements.
        encoder_input = _pad_to(item['encoder_input'], max_length_in_en_batch, pad_id_en)
        decoder_input = _pad_to(item['decoder_input'], max_length_in_fr_batch, pad_id_fr)
        target_output = _pad_to(item['target_output'], max_length_in_tfr_batch, pad_id_fr)

        # (1, 1, len): hides padding positions from attention.
        encoder_mask = (encoder_input != pad_id_en).unsqueeze(0).unsqueeze(0).int()

        # (1, len, len): padding mask broadcast against the causal mask.
        decoder_padding = (decoder_input != pad_id_fr).unsqueeze(0).unsqueeze(0)
        decoder_mask = (decoder_padding & casual_mask_generator(len(decoder_input))).int()

        encoder_inputs.append(encoder_input)
        decoder_inputs.append(decoder_input)
        target_outputs.append(target_output)
        encoder_masks.append(encoder_mask)
        decoder_masks.append(decoder_mask)
        src_sentences.append(item['src_sentence'])
        tgt_sentences.append(item['tgt_sentence'])

    return {
        'encoder_input': torch.stack(encoder_inputs),
        'decoder_input': torch.stack(decoder_inputs),
        'target_output': torch.stack(target_outputs),
        'encoder_mask': torch.stack(encoder_masks),
        'decoder_mask': torch.stack(decoder_masks),
        'src_sentence': src_sentences,
        'tgt_sentence': tgt_sentences
    }

In [19]:
# Scratch check: multiplying a tensor by an int scales its values (the result
# still has one element); it does not repeat the element four times.
(torch.tensor([1]))*4

tensor([4])