In [1]:
import os


In [2]:
%pwd

'e:\\2025\\Project_Learning\\NLP_Text_Summarizer\\research'

In [3]:
# Step up one directory so that the relative paths used by the rest of the
# notebook resolve from the parent folder (the recorded %pwd outputs show the
# move from ...\research to the project root).
# NOTE(review): this is relative to the *current* working directory, so
# re-running this cell without restarting the kernel moves up another level.
os.chdir("../")

In [4]:
%pwd

'e:\\2025\\Project_Learning\\NLP_Text_Summarizer'

## Data Transformation Entity


In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """
    Immutable settings consumed by the data transformation stage.

    Attributes:
        root_dir: Directory under which this stage writes its artifacts.
        data_path: Location of the dataset that this stage reads.
        tokenizer_name: Identifier of the tokenizer to load. This is a
            model name taken from the configuration file, not a
            filesystem location, so it is typed as ``str`` rather than
            ``Path``.
    """
    root_dir: Path
    data_path: Path
    tokenizer_name: str

## Configuration Code (in the order listed in the README.md file)

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml,  create_directories

class ConfigurationManager:
    """
    Loads the project's YAML settings and hands out per-stage config objects.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this folder, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the settings object for the data transformation stage.

        The stage's root directory is created as a side effect before
        the object is returned.

        Returns:
            DataTransformationConfig: populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation
        stage_root = section.root_dir

        # The output folder must exist before the stage runs.
        create_directories([stage_root])

        return DataTransformationConfig(
            root_dir=Path(stage_root),
            data_path=Path(section.data_path),
            tokenizer_name=section.tokenizer_name,
        )

## Creating The Components 


In [7]:
import os
from transformers import AutoTokenizer
from datasets import load_from_disk
from textSummarizer.logging import logger
from pathlib import Path
from typing import Dict, Any, Union

# Assuming DataTransformationConfig is imported from .entity
# Example: from textSummarizer.entity import DataTransformationConfig

class DataTransformation:
    """
    Component that tokenizes the dataset and stores the tokenized copy
    for the model training stage.
    """

    # Token-length caps; longer sequences are truncated by the tokenizer.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    # The annotation is a forward reference: the entity class is defined
    # outside this block, and only its attributes are needed at runtime.
    def __init__(self, config: "DataTransformationConfig"):
        """
        Store the stage configuration and load its tokenizer.

        Args:
            config: Settings object providing ``data_path``, ``root_dir``
                and ``tokenizer_name``.
        """
        self.config = config

        # Load the tokenizer specified in the configuration
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_examples_to_features(self, example_batch: Dict[str, Union[list, Any]]):
        """
        Tokenize one batch of examples.

        Args:
            example_batch (Dict): Batch from the dataset; its
                ``'dialogue'`` and ``'summary'`` entries are tokenized.

        Returns:
            Dict: ``'input_ids'`` and ``'attention_mask'`` from the
            tokenized dialogues, and ``'labels'`` holding the token ids
            of the tokenized summaries.
        """
        # Source side: the dialogue text.
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # Target side: the summary text. ``text_target`` replaces the
        # deprecated ``as_target_tokenizer()`` context manager and makes
        # the tokenizer encode the text as labels.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids' : input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def convert(self):
        """
        Load the dataset from ``config.data_path``, tokenize it in
        batches, and save the result to ``<config.root_dir>/samsum_dataset``.
        """
        logger.info(f"Loading dataset from: {self.config.data_path}")
        # Load the dataset found at the configured data path
        dataset_samsum = load_from_disk(self.config.data_path)

        logger.info("Starting dataset tokenization...")
        # batched=True hands convert_examples_to_features a batch dict
        # instead of a single example.
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched=True)

        # Save the tokenized, processed dataset
        save_path = os.path.join(self.config.root_dir, "samsum_dataset")
        dataset_samsum_pt.save_to_disk(save_path)
        logger.info(f"Tokenized dataset saved to: {save_path}")

  from .autonotebook import tqdm as notebook_tqdm


## Creating the Pipeline


In [8]:
# Run the data transformation stage end to end: load the configuration,
# build the component, and tokenize and save the dataset.
try:
    # 1. Initialize Configuration Manager
    config = ConfigurationManager()

    # 2. Get the specific configuration for Data Transformation
    data_transformation_config = config.get_data_transformation_config()

    # 3. Initialize the Data Transformation Component
    data_transformation = DataTransformation(config=data_transformation_config)

    # 4. Execute the transformation method
    data_transformation.convert()

except Exception:
    # Record the failure with its traceback, then re-raise the same
    # exception. A bare ``raise`` keeps the original traceback intact.
    logger.exception("Data transformation stage failed")
    raise

[2025-10-28 15:57:15,040: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-28 15:57:15,056: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-28 15:57:15,060: INFO: common: created directory at: artifacts]
[2025-10-28 15:57:15,063: INFO: common: created directory at: artifacts/data_transformation]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[2025-10-28 15:57:23,837: INFO: 1902114724: Loading dataset from: artifacts\data_ingestion\samsum_dataset]
[2025-10-28 15:57:23,932: INFO: 1902114724: Starting dataset tokenization...]


Map: 100%|██████████| 14732/14732 [00:04<00:00, 3282.14 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 1032.76 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2442.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 85648.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 30560.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 37427.08 examples/s]

[2025-10-28 15:57:30,100: INFO: 1902114724: Tokenized dataset saved to: artifacts\data_transformation\samsum_dataset]



