In [29]:
import os

In [30]:
# os.chdir('Text-Summarizer')

In [31]:
%pwd

'd:\\Programs\\Projects\\GitHub_Do_Not_Delete\\Text-Summarizer'

In [32]:
#os.chdir('../')
%pwd

'd:\\Programs\\Projects\\GitHub_Do_Not_Delete\\Text-Summarizer'

In [33]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings bundle for the data-transformation stage.

    Attributes:
        root_dir: Directory under which this stage writes its output.
        data_path: Location of the dataset that the stage reads from disk.
        tokenizer_name: Identifier handed to ``AutoTokenizer.from_pretrained``.

    The annotations are not enforced at runtime; the fields hold whatever
    values the caller supplies.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [34]:
from TextSummarizer.constants import *
from TextSummarizer.utils.common import read_yaml,create_directories

In [35]:
class ConfigurationManager:
    """Reads the project configuration files and builds per-stage config objects."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Load both settings files and make sure the artifacts root exists.

        Args:
            config_filepath: File passed to ``read_yaml`` to populate ``self.config``.
            params_filepath: File passed to ``read_yaml`` to populate ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Reads the ``data_Transformation`` section of the loaded configuration,
        creates that section's ``root_dir`` and returns its values wrapped in a
        ``DataTransformationConfig``.
        """
        stage = self.config.data_Transformation

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            tokenizer_name=stage.tokenizer_name,
        )



In [36]:
import os 
from TextSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

In [37]:
class DataTransformation:
    """Tokenizes a dialogue/summary dataset and saves the encoded copy to disk.

    The dataset is read from ``config.data_path``, each batch is encoded with
    the tokenizer named by ``config.tokenizer_name``, and the result is written
    below ``config.root_dir``.
    """

    # Truncation limits (in tokens) applied to the source text and to the
    # target summary respectively.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 1024

    def __init__(self,config: "DataTransformationConfig"):
        """Store the stage config and load the tokenizer from local files only.

        Args:
            config: Stage settings; ``tokenizer_name``, ``data_path`` and
                ``root_dir`` are read from it.
        """
        self.config = config
        # local_files_only=True: the tokenizer must already be available
        # locally; no download is attempted.
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, local_files_only=True)

    def convert_examples_to_features(self,example_batch):
        """Encode one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with a ``'dialogue'`` entry (source texts)
                and a ``'summary'`` entry (target texts).

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            encoded dialogues, and ``'labels'`` holding the ``input_ids`` of
            the encoded summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # `text_target=` is the supported way to tokenize labels; it replaces
        # the deprecated `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    def create_dirs_files(self):
        """Mirror the source dataset's directory tree under ``root_dir``.

        Every directory found below ``config.data_path`` is recreated below the
        destination, and every file is created there as an empty placeholder
        (existing files are left untouched by ``touch``).

        The destination is ``root_dir`` joined with ``data_path`` minus its
        first two ``'/'``-separated components.
        """
        # NOTE(review): splitting on '/' assumes data_path is written with
        # forward slashes (as in the YAML config) - confirm for other sources.
        trailing_parts = str(self.config.data_path).split('/')[2:]
        relative_subdir = "".join(part + '/' for part in trailing_parts)

        dataset_loc = Path(self.config.data_path).resolve()
        desired_location = Path(self.config.root_dir).resolve().joinpath(relative_subdir)
        os.makedirs(desired_location, exist_ok=True)

        for root, _subdirs, files in os.walk(dataset_loc):
            # Re-root each visited directory under the destination.
            target_dir = desired_location / Path(root).relative_to(dataset_loc)
            target_dir.mkdir(parents=True, exist_ok=True)

            for file_name in files:
                (target_dir / file_name).touch(exist_ok=True)

    def convert(self):
        """Load the dataset, tokenize it and save the encoded copy.

        Steps: load from ``config.data_path``, print the column names, map
        ``convert_examples_to_features`` over the data in batches, prepare the
        output tree with ``create_dirs_files`` and save to
        ``<root_dir>/DataSet/samsum_dataset``.
        """
        dataset_path = Path(self.config.data_path)

        dataset_samsum = load_from_disk(f"file://{dataset_path}")
        print(dataset_samsum.column_names)
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)

        save_dir = Path(self.config.root_dir).joinpath("DataSet/samsum_dataset")

        # Create the destination tree before the dataset is written into it.
        self.create_dirs_files()

        dataset_samsum_pt.save_to_disk(str(save_dir))





In [38]:
# Run the data-transformation stage end to end. Exceptions propagate as-is so
# the notebook shows the original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
print("Transformation initialization Done")
data_transformation.convert()
print("Transformation Done")

[2026-01-05 00:13:22,825: INFO : common : yaml file: config\config.yaml loaded successfully]
[2026-01-05 00:13:22,857: INFO : common : yaml file: params.yaml loaded successfully]
[2026-01-05 00:13:22,861: INFO : common : created directories at: artifacts]
[2026-01-05 00:13:22,873: INFO : common : created directories at: artifacts/data_Transformation]
Transformation initialization Done
{'train': ['id', 'dialogue', 'summary'], 'test': ['id', 'dialogue', 'summary'], 'validation': ['id', 'dialogue', 'summary']}


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]

Transformation Done


In [39]:
%pwd
#os.chdir("GitHub_Do_Not_Delete/Text-Summarizer/")

'd:\\Programs\\Projects\\GitHub_Do_Not_Delete\\Text-Summarizer'

In [41]:
#os.chdir('Text-Summarizer')

In [None]:
# %pwd
