In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Config_Data:
    """Immutable settings bundle built from the ``config_data`` section of the config file.

    Instances are created by ``ConfigurationManager.get_config`` and cannot be
    modified afterwards (``frozen=True``).
    """
    # Batch size passed to the training DataLoader.
    batch_size: int
    # Presumably the number of training epochs - not read by the visible code; TODO confirm.
    num_epochs: int
    # Presumably the optimizer learning rate - not read by the visible code; TODO confirm.
    lr: float
    # Sequence length handed to the bilingual dataset wrapper.
    seq_len: int
    # Presumably the model's embedding width - not read by the visible code; TODO confirm.
    d_model: int
    # Dataset identifier passed as the first argument to ``load_dataset``.
    datasource: str
    # Source-language key; used in the "<src>-<tgt>" dataset subset name and to index item['translation'].
    lang_src: str
    # Target-language key; used the same way as ``lang_src``.
    lang_tgt: str
    # Presumably where model checkpoints are stored - not read by the visible code; TODO confirm.
    model_folder: str
    # Presumably the checkpoint file name prefix - not read by the visible code; TODO confirm.
    model_basename: str
    # Presumably names a checkpoint to resume from - not read by the visible code; TODO confirm.
    preload: str
    # Tokenizer path template; ``str.format`` is applied with the language code as the only argument.
    tokenizer_file: str
    # Presumably an experiment-tracking label - not read by the visible code; TODO confirm.
    experiment_name: str

In [3]:
from Translate.constants import *
import os
from Translate.utils.common import *

In [4]:
class ConfigurationManager:
    """Reads the project's YAML files and exposes their contents as typed settings."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: path of the main configuration YAML file.
            params_filepath: path of the parameters YAML file. Its content is
                stored on ``self.params``; nothing in this class reads it.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is created as soon as the manager is constructed.
        create_directories([self.config.artifacts_root])

    def get_config(self) -> Config_Data:
        """Build a ``Config_Data`` from the ``config_data`` section of the config file.

        Returns:
            A ``Config_Data`` whose fields are copied one-to-one from the
            equally named attributes of ``self.config.config_data``.
        """
        section = self.config.config_data

        # Each setting is read from the section under the same name it has
        # on Config_Data, in declaration order.
        setting_names = (
            "batch_size",
            "num_epochs",
            "lr",
            "seq_len",
            "d_model",
            "datasource",
            "lang_src",
            "lang_tgt",
            "model_folder",
            "model_basename",
            "preload",
            "tokenizer_file",
            "experiment_name",
        )
        settings = {name: getattr(section, name) for name in setting_names}

        return Config_Data(**settings)

In [16]:
# NOTE(review): this cell is not idempotent - the path is relative, so every
# re-run moves the working directory one more level up. Run it exactly once
# after a kernel restart, before any cell that resolves relative paths.
os.chdir('../')
!pwd

/Users/nikhil0035/Documents/GitHub/Machine_Translation_using_Transformers


In [17]:
!pwd

/Users/nikhil0035/Documents/GitHub/Machine_Translation_using_Transformers


In [21]:
import os
from Translate import logger
from Translate.entity.config_entity import Config_Data
from Translate.components.dataset import BilingualDataset

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from torch.utils.data import Dataset, DataLoader, random_split


import warnings
from tqdm import tqdm
from pathlib import Path

class DataInjestion():
    """Builds the tokenizers and the train/validation dataloaders for a bilingual dataset.

    The dataset wrapper class is injected through ``data_class`` so it can be
    swapped (for example by a stub) without editing this class.
    """

    # The annotation is quoted so that defining this class does not require
    # Config_Data to be importable at definition time.
    def __init__(self,config: "Config_Data",data_class):
        """Store the settings and the dataset wrapper class.

        Args:
            config: settings object. The attributes read by this class are
                ``datasource``, ``lang_src``, ``lang_tgt``, ``seq_len``,
                ``batch_size`` and ``tokenizer_file``.
            data_class: callable invoked as
                ``data_class(ds, tokenizer_src, tokenizer_tgt, lang_src, lang_tgt, seq_len)``
                to wrap each raw split.
        """
        self.config = config
        self.BilingualDataset = data_class

    @staticmethod
    def get_all_sentences(ds, lang):
        """Yield ``item['translation'][lang]`` for every item of ``ds``."""
        for item in ds:
            yield item['translation'][lang]

    @staticmethod
    def get_or_build_tokenizer(config,ds, lang):
        """Load the tokenizer for ``lang`` from disk, or train and save it.

        Args:
            config: object whose ``tokenizer_file`` attribute is a path
                template formatted with ``lang``.
            ds: iterable of items shaped like ``{'translation': {lang: text}}``;
                only consumed when a tokenizer has to be trained.
            lang: language key, used both for the file name and to pick the
                sentences.

        Returns:
            The loaded or newly trained ``Tokenizer``.
        """
        tokenizer_path = Path(config.tokenizer_file.format(lang))
        if tokenizer_path.exists():
            return Tokenizer.from_file(str(tokenizer_path))

        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
        tokenizer.train_from_iterator(DataInjestion.get_all_sentences(ds, lang), trainer=trainer)

        # Saving fails when the target directory is missing, so create it first.
        tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(tokenizer_path))
        return tokenizer

    def _log_max_lengths(self, ds_raw, tokenizer_src, tokenizer_tgt):
        """Log the largest token count found in the source and target sentences."""
        lang_src = self.config.lang_src
        lang_tgt = self.config.lang_tgt

        max_len_src = 0
        max_len_tgt = 0
        for item in ds_raw:
            pair = item['translation']
            max_len_src = max(max_len_src, len(tokenizer_src.encode(pair[lang_src]).ids))
            max_len_tgt = max(max_len_tgt, len(tokenizer_tgt.encode(pair[lang_tgt]).ids))

        logger.info(f'Max length of source sentence: {max_len_src}')
        logger.info(f'Max length of target sentence: {max_len_tgt}')

    def get_ds(self):
        """Load the dataset and return dataloaders plus both tokenizers.

        The 'train' split of the configured dataset is divided 90% / 10% with
        ``random_split`` (no generator is passed, so the split differs between
        calls). Each part is wrapped with the dataset class given to the
        constructor.

        Returns:
            ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
            The training loader uses ``config.batch_size``; the validation
            loader uses a batch size of 1. Both loaders shuffle.
        """
        cfg = self.config
        ds_raw = load_dataset(cfg.datasource, f"{cfg.lang_src}-{cfg.lang_tgt}", split='train')

        tokenizer_src = self.get_or_build_tokenizer(cfg, ds_raw, cfg.lang_src)
        tokenizer_tgt = self.get_or_build_tokenizer(cfg, ds_raw, cfg.lang_tgt)

        train_ds_size = int(0.9 * len(ds_raw))
        val_ds_size = len(ds_raw) - train_ds_size
        train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

        # Use the class injected through the constructor rather than a
        # module-level name, so the ``data_class`` argument takes effect.
        dataset_cls = self.BilingualDataset
        train_ds = dataset_cls(train_ds_raw, tokenizer_src, tokenizer_tgt, cfg.lang_src, cfg.lang_tgt, cfg.seq_len)
        val_ds = dataset_cls(val_ds_raw, tokenizer_src, tokenizer_tgt, cfg.lang_src, cfg.lang_tgt, cfg.seq_len)

        self._log_max_lengths(ds_raw, tokenizer_src, tokenizer_tgt)

        train_dataloader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True)
        val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

        return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Load the YAML files from the default paths, then build the frozen
# Config_Data settings object from the config file's `config_data` section.
# Requires the ConfigurationManager cell to have been run first.
config_instance = ConfigurationManager()
config_obj = config_instance.get_config()

[2024-01-28 23:00:10,175: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-01-28 23:00:10,176: INFO: common: yaml file: params.yaml loaded successfully]
[2024-01-28 23:00:10,177: INFO: common: created directory at: artifacts]


In [22]:
# Hand the settings object and the dataset wrapper class to the ingestion helper.
datainjestion = DataInjestion(config=config_obj,data_class=BilingualDataset)

In [24]:
# Loads the dataset, builds or loads both tokenizers, and returns the two
# dataloaders; the maximum tokenized sentence lengths are written to the log.
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt= datainjestion.get_ds()

[2024-01-28 23:03:07,537: INFO: 3537205078: Max length of source sentence: 309]
[2024-01-28 23:03:07,540: INFO: 3537205078: Max length of target sentence: 274]


In [26]:
# Pull a single batch from the training loader as a smoke test.
# The loader shuffles, so a different batch is shown on every run.
batch = next(iter(train_dataloader))

# Print the batch or inspect its structure
# 'src_text' / 'tgt_text' hold the raw sentence strings of the batch (see the output below).
print(batch['src_text'])
print(batch['tgt_text'])

["'You know how fond I am of you, Alexis,' she replied when she had heard him out, 'and how ready I am to do anything for you; but I have kept silent because I knew I could be of no use to you and Anna Arkadyevna.' She pronounced the formal 'Anna Arkadyevna' with peculiar precision.", "'A decision, some decision, Alexis Alexandrovich!", "'So I shall: I am going the day after to-morrow, Agatha Mikhaylovna, only I must finish my business.'", "'No, please stay!", 'I hardly know whether I had slept or not after this musing; at any rate, I started wide awake on hearing a vague murmur, peculiar and lugubrious, which sounded, I thought, just above me.', 'When he came to him, he stood like one amazed, looking at him, turning him first on one side, then on the other; looked at the wound the bullet had made, which it seems was just in his breast, where it had made a hole, and no great quantity of blood had followed; but he had bled inwardly, for he was quite dead.', 'Without stopping the izvoshc

['— Tu sai, Aleksej — ella disse, dopo averlo ascoltato — come io ti voglia bene e come sia pronta a fare tutto per te; ma ho taciuto, perché sapevo che non posso essere utile a te e ad Anna Arkad’evna — ella disse, pronunciando con particolare sforzo “Anna Arkad’evna”. — Ti prego di non credere che io voglia biasimare.', '— Una decisione, una qualsiasi decisione, Aleksej Aleksandrovic.', '— Parto proprio domani, Agaf’ja Michajlovna. Bisogna risolvere la faccenda.', '— No, rimanete ancora, vi prego.', 'Non so se mi addormentassi o no; ma a un tratto sentii sopra alla testa un mormorio vago, strano e lugubre, che mi scosse.', 'Quando gli fu vicino rimase com’uomo sbalordito guardando il cadavere, voltandolo prima su un fianco, indi sull’altro, contemplando la ferita che la palla aveva fatto, che sembra lo avesse colpito esattamente nel petto, onde non si vide al di fuori gran copia di sangue, perchè diffuso tutto nell’interno. Raccolti l’arco e le frecce dell’ucciso, tornossene addietro