In [1]:
import os

In [2]:
# Display the kernel's current working directory (same value the %pwd magic returns).
os.getcwd()

'c:\\Users\\Devendra\\Projects\\LanguageTranslation\\research'

In [3]:
# Step up from the notebook folder to its parent so the relative paths used by
# later cells resolve from there. The guard makes this cell idempotent:
# re-running it no longer climbs one more level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")
# Echo the resulting directory, as the %cd magic does.
print(os.getcwd())


c:\Users\Devendra\Projects\LanguageTranslation


In [4]:
# Confirm the working directory after the directory change in the previous cell.
os.getcwd()

'c:\\Users\\Devendra\\Projects\\LanguageTranslation'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings for the data-transformation stage.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    has been constructed.
    """

    # Directory the transformation stage writes its output under.
    root_dir: Path
    # Location of the dataset that the stage loads from disk.
    data_path: Path
    # Value handed to ``AutoTokenizer.from_pretrained``.
    # NOTE(review): annotated as Path, but this may be a model identifier
    # string rather than a filesystem path -- confirm against config.yaml.
    tokenizer_name: Path


In [6]:
from LanguageTranslation.constants import *
from LanguageTranslation.utils.utils import read_yaml, create_directories


In [7]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main config YAML; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Path of the params YAML; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root is passed to create_directories up front,
        # before any stage-specific directory is requested.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` and then
        copies ``root_dir``, ``data_path`` and ``tokenizer_name`` from the
        ``data_transformation`` section of the loaded config.

        Returns:
            A ``DataTransformationConfig`` populated from that section.
        """
        stage_cfg = self.config.data_transformation
        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            data_path=stage_cfg.data_path,
            tokenizer_name=stage_cfg.tokenizer_name,
        )

In [8]:
import os
from LanguageTranslation.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


[2024-01-25 19:44:10,557: INFO: config: PyTorch version 2.1.2 available.]
[2024-01-25 19:44:10,557: INFO: config: TensorFlow version 2.13.0 available.]


In [9]:
class DataTransformation:
    """Tokenizes an English/Hindi translation dataset and saves the result.

    The tokenizer is loaded once at construction time from
    ``config.tokenizer_name`` and reused for every batch.
    """

    # Truncation limit applied to both the source and the target text.
    MAX_SEQ_LENGTH = 128

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and load the tokenizer it names.

        Args:
            config: Settings object providing ``tokenizer_name``,
                ``data_path`` and ``root_dir``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def preprocess_function(self, examples):
        """Tokenize one batch of translation pairs.

        Args:
            examples: Batch mapping whose ``"translation"`` entry is a
                sequence of dicts with ``"en"`` (source) and ``"hi"``
                (target) strings.

        Returns:
            The tokenizer's encoding of the English text, with the token ids
            of the Hindi text stored under ``"labels"``.
        """
        pairs = examples["translation"]
        inputs = [pair["en"] for pair in pairs]
        targets = [pair["hi"] for pair in pairs]

        # `text_target` tokenizes the targets in target mode and stores their
        # input ids under "labels". It replaces the deprecated
        # `as_target_tokenizer()` context manager.
        return self.tokenizer(
            inputs,
            text_target=targets,
            max_length=self.MAX_SEQ_LENGTH,
            truncation=True,
        )

    def convert(self):
        """Load the dataset, tokenize it in batches, and save it.

        Reads from ``config.data_path`` and writes to
        ``<config.root_dir>/iitb-english-hindi_dataset``.
        """
        raw_dataset = load_from_disk(self.config.data_path)
        tokenized_dataset = raw_dataset.map(self.preprocess_function, batched=True)

        output_dir = os.path.join(self.config.root_dir, "iitb-english-hindi_dataset")
        tokenized_dataset.save_to_disk(output_dir)


In [10]:
# Run the data-transformation stage end to end: build the configuration,
# construct the stage object, then tokenize and save the dataset.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception:
    # Record the failure with its traceback through the project logger, then
    # re-raise with a bare `raise` so the original traceback is kept intact.
    logger.exception("Data transformation stage failed")
    raise

[2024-01-25 19:44:12,122: INFO: utils: yaml file: config\config.yaml loaded successfully]
[2024-01-25 19:44:12,122: INFO: utils: yaml file: params.yaml loaded successfully]
[2024-01-25 19:44:12,136: INFO: utils: created directory at: artifacts]
[2024-01-25 19:44:12,136: INFO: utils: created directory at: artifacts/data_transformation]


Map: 100%|██████████| 1659083/1659083 [13:02<00:00, 2119.33 examples/s] 
Map: 100%|██████████| 520/520 [00:00<00:00, 3697.62 examples/s]
Map: 100%|██████████| 2507/2507 [00:00<00:00, 3418.38 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 1659083/1659083 [00:01<00:00, 1026554.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 520/520 [00:00<00:00, 49056.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2507/2507 [00:00<00:00, 156084.79 examples/s]
