In [1]:
import os
# Step one directory up so that relative paths used later resolve from the
# parent of the notebook's folder (presumably the project root — confirm).
# NOTE: this is a relative move and therefore not idempotent; every re-run of
# this cell climbs one more level. Restart the kernel before re-running.
os.chdir('../')
# Display the resulting working directory as the cell output.
%pwd

'/Users/omidsardari/WORK/Becoming a Data Scientist/Python Projects/End_to_End_Transformer_En_Fa'

In [2]:
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple

In [3]:
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are produced by
    ``ConfigurationManager.get_data_transformation_config`` from the
    ``data_transformation`` section of the YAML configuration.

    NOTE(review): the values are passed through from the YAML section without
    conversion, so fields annotated as ``Path`` may be plain strings at
    runtime — confirm against ``read_yaml``.
    """
    # Directory for this stage's artifacts; it is created before the config is returned.
    root_dir: Path
    # Path template with a placeholder that is filled with the language code
    # to obtain the per-language tokenizer file.
    tokenizer_file: Path
    # Dataset identifier handed to ``datasets.load_dataset``.
    dataset_name: str
    # Key of the source language inside each record's 'translation' mapping.
    lang_src: str
    # Key of the target language inside each record's 'translation' mapping.
    lang_tgt: str
    # Fixed length every encoder/decoder/label tensor is padded to.
    seq_len: int
    # Batch size of the training DataLoader.
    batch_size: int
    # (train, validation) proportions for splitting the dataset.
    train_val_split_ratio: Tuple[float, float]

In [4]:
from transformerEnFa.constants import *
from transformerEnFa.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: location of the main configuration YAML.
            params_filepath: location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section; the split ratio is converted to
            a tuple, every other value is passed through unchanged.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # Fields copied verbatim from the YAML section, in dataclass order.
        passthrough = (
            "root_dir",
            "tokenizer_file",
            "dataset_name",
            "lang_src",
            "lang_tgt",
            "seq_len",
            "batch_size",
        )
        settings = {field: getattr(section, field) for field in passthrough}
        settings["train_val_split_ratio"] = tuple(section.train_val_split_ratio)

        return DataTransformationConfig(**settings)

In [6]:
# Huggingface Tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [7]:
def get_all_sentences(ds, lang):
    """Yield the ``lang`` side of every translation pair in ``ds``.

    Args:
        ds: iterable of records shaped like ``{'translation': {lang: text}}``.
            Any iterable works, including empty ones and one-shot iterators;
            no indexing is performed.
        lang: key of the wanted language inside each ``'translation'`` mapping.

    Yields:
        The sentence stored under ``item['translation'][lang]`` for each item,
        in iteration order.

    Raises:
        KeyError: if a record lacks ``'translation'`` or the ``lang`` key.
    """
    # The former debug print of ds[0] was dropped: it crashed on empty or
    # non-indexable inputs and dumped a full record to stdout on every call.
    for item in ds:
        yield item['translation'][lang]


In [8]:
def get_or_build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training it if needed.

    If the tokenizer file for ``lang`` already exists it is loaded from disk.
    Otherwise a new ``WordLevel`` tokenizer is trained on the sentences that
    ``get_all_sentences(ds, lang)`` yields, saved to that file (parent
    directories are created), and returned.

    Args:
        config: object exposing ``tokenizer_file``, a path template (``str``
            or ``Path``) containing a ``str.format`` placeholder that is
            filled with ``lang``.
        ds: dataset forwarded to ``get_all_sentences``; only read when the
            tokenizer has to be trained.
        lang: language key, used both for the file name and to pick sentences.

    Returns:
        The loaded or freshly trained ``tokenizers.Tokenizer``.
    """
    # The template may arrive as a Path (as the config dataclass annotates it);
    # Path has no .format(), so normalise to str before substituting.
    tokenizer_path = Path(str(config.tokenizer_file).format(lang))

    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Hugging Face tokenizers: whitespace-split word-level vocabulary.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    # Words seen fewer than twice are left out of the vocabulary.
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)

    # Make sure the destination directory exists before saving.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer


In [9]:
from typing import Any
from pathlib import Path
import torch
import torch.nn
from torch.utils.data import Dataset

In [10]:

def causal_mask(size):
    """Return a ``(1, size, size)`` boolean mask, True on and below the diagonal.

    Entry ``[0, i, j]`` is True exactly when ``j <= i``.
    """
    # Ones strictly above the diagonal mark the disallowed positions;
    # comparing against 0 flips that into the allowed ones.
    above_diagonal = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return above_diagonal == 0


class BilingualDataset(Dataset):
    """Wraps a translation dataset and emits fixed-length tensors per pair.

    Each item of ``ds`` must look like ``{'translation': {src_lang: str,
    tgt_lang: str}}``. Every sample is tokenized, framed with special tokens
    and padded to ``seq_len``.

    NOTE(review): the [SOS]/[EOS]/[PAD] ids are looked up in the *source*
    tokenizer only and reused for the decoder input and the labels. This
    presumes both tokenizers assign the same ids to the special tokens —
    confirm.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        """Store the dataset, tokenizers, language keys and target length.

        Args:
            ds: indexable, sized collection of translation records.
            tokenizer_src: tokenizer for the source language; must provide
                ``encode(text).ids`` and ``token_to_id(token)``.
            tokenizer_tgt: tokenizer for the target language; must provide
                ``encode(text).ids``.
            src_lang: key of the source text inside ``'translation'``.
            tgt_lang: key of the target text inside ``'translation'``.
            seq_len: length every produced 1-D tensor is padded to.
        """
        super().__init__()

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # One-element int64 tensors so they can be concatenated directly.
        self.sos_token = torch.tensor([tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Number of translation pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Tokenize, frame and pad the pair at ``idx``.

        Returns:
            dict with
            ``encoder_input``: ``[SOS] src [EOS] [PAD]...``, shape (seq_len);
            ``decoder_input``: ``[SOS] tgt [PAD]...``, shape (seq_len);
            ``label``: ``tgt [EOS] [PAD]...``, shape (seq_len);
            ``encoder_mask``: int mask of non-pad positions, (1, 1, seq_len);
            ``decoder_mask``: non-pad mask combined with the causal mask,
            (1, seq_len, seq_len);
            ``src_text`` / ``tgt_text``: the raw sentences.

        Raises:
            ValueError: if either sentence does not fit into ``seq_len`` once
                its special tokens are added.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Text to token ids.
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder side carries [SOS] and [EOS]; the decoder input only
        # [SOS] and the label only [EOS], hence -2 versus -1.
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long base on the sequence length has defined ')

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # Build padding with a tensor op instead of converting a Python list
        # of one-element tensors on every access; repeat(0) yields an empty
        # int64 tensor, so exact-fit sentences still concatenate cleanly.
        enc_padding = self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding = self.pad_token.repeat(dec_num_padding_tokens)

        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding])
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding])
        label = torch.cat([dec_ids, self.eos_token, dec_padding])

        # Every tensor must come out at exactly seq_len.
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

In [11]:
def calculate_max_lengths(self, ds_raw):
    """Find the longest tokenized source and target sentence in ``ds_raw``.

    This is a module-level function that nevertheless takes ``self``: it
    reads ``self.tokenizer_src``, ``self.tokenizer_tgt``,
    ``self.config.lang_src`` and ``self.config.lang_tgt``.

    Args:
        self: object carrying the tokenizers and config described above.
        ds_raw: iterable of records shaped like
            ``{'translation': {lang: text}}``.

    Returns:
        ``(max_len_src, max_len_tgt)`` in tokens; ``(0, 0)`` for empty input.
    """
    longest_src, longest_tgt = 0, 0
    for record in ds_raw:
        pair = record['translation']
        n_src = len(self.tokenizer_src.encode(pair[self.config.lang_src]).ids)
        n_tgt = len(self.tokenizer_tgt.encode(pair[self.config.lang_tgt]).ids)
        if n_src > longest_src:
            longest_src = n_src
        if n_tgt > longest_tgt:
            longest_tgt = n_tgt
    return longest_src, longest_tgt


In [12]:
from torch.utils.data import DataLoader,  random_split
import pandas as pd
from transformerEnFa.logging import logger

In [13]:
"""def get_ds(config):
    # Dataset only has a train split, so we divide it
    ds_raw = load_dataset(config.dataset_name, split='train[:5%]')

        # It only has the train split, so we divide it overselves
    
    # Build Tokenoizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config.lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config.lang_tgt)

    # 90% training, 10% validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])


    # Convert train and validate dataset to tokens by padding, mask, SOS, EOS and etc.
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)

    # Find the maximum length of each sentence in the source and target sentence
    max_len_src = 0
    max_len_tgt = 0

    for item in ds_raw:
       src_ids = tokenizer_src.encode(item['translation'][config.lang_src]).ids
       tgt_ids = tokenizer_tgt.encode(item['translation'][config.lang_tgt]).ids
       max_len_src = max(len(src_ids), max_len_src)
       max_len_tgt = max(len(tgt_ids), max_len_tgt)
    logger.info(f"Max length of source sentence: {max_len_src}")
    logger.info(f"Max length of target sentence: {max_len_tgt}")


    train_dataloader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt"""

'def get_ds(config):\n    # Dataset only has a train split, so we divide it\n    ds_raw = load_dataset(config.dataset_name, split=\'train[:5%]\')\n\n        # It only has the train split, so we divide it overselves\n    \n    # Build Tokenoizers\n    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config.lang_src)\n    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config.lang_tgt)\n\n    # 90% training, 10% validation\n    train_ds_size = int(0.9 * len(ds_raw))\n    val_ds_size = len(ds_raw) - train_ds_size\n    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])\n\n\n    # Convert train and validate dataset to tokens by padding, mask, SOS, EOS and etc.\n    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)\n    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)\n\n    # Find the maximum length of each sente

In [14]:
#config = ConfigurationManager()
#data_transformation_config = config.get_data_transformation_config()
#get_ds(config=data_transformation_config)

In [15]:
from torch.utils.data import DataLoader,  random_split
from transformerEnFa.logging import logger
from datasets import load_dataset 


In [20]:

class DataTransformationTrainingPipeline():
    """Loads the translation dataset and turns it into train/val DataLoaders.

    ``get_ds`` sets ``tokenizer_src``, ``tokenizer_tgt``, ``train_dataloader``
    and ``val_dataloader`` on the instance.
    """

    def __init__(self):
        """Load the data-transformation settings via ``ConfigurationManager``."""
        self.config_manager = ConfigurationManager()
        self.config = self.config_manager.get_data_transformation_config()

    def calculate_max_lengths(self, ds_raw):
        """Return the longest tokenized source and target sentence lengths.

        Requires ``self.tokenizer_src`` and ``self.tokenizer_tgt`` to be set,
        which ``get_ds`` does before calling this method.

        Args:
            ds_raw: iterable of records shaped like
                ``{'translation': {lang: text}}``.

        Returns:
            ``(max_len_src, max_len_tgt)`` in tokens; ``(0, 0)`` if empty.
        """
        max_len_src = 0
        max_len_tgt = 0
        for item in ds_raw:
            src_ids = self.tokenizer_src.encode(item['translation'][self.config.lang_src]).ids
            tgt_ids = self.tokenizer_tgt.encode(item['translation'][self.config.lang_tgt]).ids
            max_len_src = max(len(src_ids), max_len_src)
            max_len_tgt = max(len(tgt_ids), max_len_tgt)
        return max_len_src, max_len_tgt

    def _split_sizes(self, n_items):
        """Turn ``config.train_val_split_ratio`` into absolute split sizes.

        The ratio is normalised by its sum, so ``(0.9, 0.1)`` and ``(9, 1)``
        are equivalent. The training size is rounded down and the validation
        set receives the remainder.

        Raises:
            ValueError: if the ratio is not two non-negative numbers with a
                positive sum.
        """
        ratio = tuple(self.config.train_val_split_ratio)
        if len(ratio) != 2 or min(ratio) < 0 or sum(ratio) <= 0:
            raise ValueError(
                f"train_val_split_ratio must be two non-negative numbers with a positive sum, got {ratio!r}"
            )
        train_size = int(ratio[0] / sum(ratio) * n_items)
        return train_size, n_items - train_size

    def get_ds(self, split='train[:5%]', seed=None):
        """Build the training and validation DataLoaders.

        Args:
            split: split expression handed to ``load_dataset``; the default
                keeps the previous behaviour of using the first 5% of the
                training split.
            seed: optional integer making the random train/validation split
                reproducible. ``None`` (default) uses the global RNG as
                before. DataLoader shuffling is not affected by this seed.

        Returns:
            ``(train_dataloader, val_dataloader)``.
        """
        # Load the dataset
        ds_raw = load_dataset(self.config.dataset_name, split=split)

        # Build tokenizers for source and target languages
        self.tokenizer_src = get_or_build_tokenizer(self.config, ds_raw, self.config.lang_src)
        self.tokenizer_tgt = get_or_build_tokenizer(self.config, ds_raw, self.config.lang_tgt)

        # Split according to the configured ratio instead of a hard-coded 0.9.
        train_ds_size, val_ds_size = self._split_sizes(len(ds_raw))
        if seed is None:
            train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])
        else:
            import torch  # local import: only needed for the seeded generator
            generator = torch.Generator().manual_seed(seed)
            train_ds_raw, val_ds_raw = random_split(
                ds_raw, [train_ds_size, val_ds_size], generator=generator
            )

        # Prepare the datasets for training and validation
        train_ds = BilingualDataset(train_ds_raw, self.tokenizer_src, self.tokenizer_tgt, self.config.lang_src, self.config.lang_tgt, self.config.seq_len)
        val_ds = BilingualDataset(val_ds_raw, self.tokenizer_src, self.tokenizer_tgt, self.config.lang_src, self.config.lang_tgt, self.config.seq_len)

        # Calculate the maximum sentence lengths over the whole loaded slice
        max_len_src, max_len_tgt = self.calculate_max_lengths(ds_raw)

        # Report the maximum sentence lengths
        print(f"Max length of source sentence: {max_len_src}")
        print(f"Max length of target sentence: {max_len_tgt}")

        # Validation uses one sentence pair per batch.
        self.train_dataloader = DataLoader(train_ds, batch_size=self.config.batch_size, shuffle=True)
        self.val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

        return self.train_dataloader, self.val_dataloader
    



In [21]:
# Instantiate the pipeline (this reads the YAML config and creates the
# artifact directories) and build the loaders; the returned
# (train_dataloader, val_dataloader) tuple is shown as the cell output.
data_transformation = DataTransformationTrainingPipeline()
data_transformation.get_ds()

[2024-02-09 14:13:21,714: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-02-09 14:13:21,716: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-09 14:13:21,717: INFO: common: created directory at: artifacts]
[2024-02-09 14:13:21,717: INFO: common: created directory at: artifacts/data_transformation]
Max length of source sentence: 37
Max length of target sentence: 33


(<torch.utils.data.dataloader.DataLoader at 0x11c61a450>,
 <torch.utils.data.dataloader.DataLoader at 0x169369150>)