In [1]:
import os


In [2]:
# Show the kernel's working directory before it is changed.
%pwd

'e:\\2025\\Project_Learning\\NLP_Text_Summarizer\\research'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebook's "research" folder. Without the guard, re-running this cell climbs
# one directory higher each time and later relative paths stop resolving.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the directory change above.
%pwd

'e:\\2025\\Project_Learning\\NLP_Text_Summarizer'

## Model Trainer Entity
 

### After updating the `config.yaml` and `params.yaml` files


In [5]:
### Entity Creation
from dataclasses import dataclass
from pathlib import Path
from typing import List, Union # Add other imports used in other configs



# NOTE: `Union` is imported for flexibility in the types of save_steps/epochs if needed, though plain float/int is sufficient here.

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training stage.

    The first three fields locate artifacts and the base checkpoint; the
    remaining fields are training hyperparameters that the trainer forwards
    to ``transformers.TrainingArguments``.
    """

    # Directory that receives this stage's outputs.
    root_dir: Path
    # Location of the dataset that the trainer loads from disk.
    data_path: Path
    # Checkpoint identifier handed to ``from_pretrained``. Annotated as ``str``
    # (not ``Path``) because the value is passed through unwrapped and can be
    # a hub model id such as "google/pegasus-cnn_dailymail" rather than a
    # filesystem path.
    model_ckpt: str
    num_train_epochs: int
    warmup_steps: int
    # Also reused by the trainer as the evaluation batch size.
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    # Named ``evaluation_strategy`` here; the trainer passes it on as
    # ``eval_strategy``.
    evaluation_strategy: str
    eval_steps: int
    # Kept as ``float`` because the configuration manager coerces the YAML
    # value with ``float()``.
    save_steps: float
    gradient_accumulation_steps: int

## Configuration Manager


In [6]:
from pathlib import Path
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories
# Assuming ModelTrainerConfig is imported from textSummarizer.entity
from textSummarizer.entity import ModelTrainerConfig 

class ConfigurationManager:
    """
    Reads the project's configuration and parameter YAML files and turns
    the relevant sections into typed entity objects.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        # Parse the two YAML documents with the shared utility.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes beneath this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    # --- Model trainer stage ---

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """
        Assemble the ModelTrainerConfig entity from the ``model_trainer``
        section of the configuration and the ``TrainingArguments`` section
        of the parameters.

        Returns:
            ModelTrainerConfig: paths plus training hyperparameters.
        """
        # Paths and checkpoint name for this stage.
        stage_cfg = self.config.model_trainer

        # Training hyperparameters.
        hyperparams = self.params.TrainingArguments

        # The stage's own output directory must exist before training starts.
        create_directories([stage_cfg.root_dir])

        # Keyword names must line up with the ModelTrainerConfig fields.
        # 'evaluation_strategy' keeps the entity's field name even though the
        # Trainer argument it later feeds is called 'eval_strategy'.
        return ModelTrainerConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
            model_ckpt=stage_cfg.model_ckpt,
            num_train_epochs=int(hyperparams.num_train_epochs),
            warmup_steps=hyperparams.warmup_steps,
            per_device_train_batch_size=hyperparams.per_device_train_batch_size,
            weight_decay=hyperparams.weight_decay,
            logging_steps=hyperparams.logging_steps,
            evaluation_strategy=hyperparams.evaluation_strategy,
            eval_steps=hyperparams.eval_steps,
            save_steps=float(hyperparams.save_steps),
            gradient_accumulation_steps=hyperparams.gradient_accumulation_steps,
        )
        
# --- (Other configuration methods would follow here, e.g., get_data_ingestion_config) ---

## Create The Components


In [7]:
import torch
import os
from datasets import load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from pathlib import Path

# Assuming ModelTrainerConfig is imported from .entity
# Example: from textSummarizer.entity import ModelTrainerConfig

class ModelTrainer:
    """Fine-tunes the configured seq2seq checkpoint on a slice of the dataset.

    The dataset is loaded from ``config.data_path``; the trained model and
    the tokenizer are written beneath ``config.root_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the configuration object (paths and hyperparameters)."""
        self.config = config

    @staticmethod
    def _subset_size(total: int, fraction: float) -> int:
        """Return how many of ``total`` rows ``fraction`` selects.

        A non-empty split always yields at least one row, so a small dataset
        cannot silently turn into an empty training or evaluation set.
        """
        if total <= 0:
            return 0
        return min(total, max(1, int(fraction * total)))

    def train(self, subset_fraction: float = 0.005):
        """Fine-tune the checkpoint on a leading slice of the data and save it.

        Args:
            subset_fraction: Share of the "train" and "validation" splits to
                use, in the range (0, 1]. The default matches the value that
                used to be hard-coded; pass 1.0 to train on the full splits.

        Raises:
            ValueError: If ``subset_fraction`` is outside (0, 1].
        """
        # Local import: only needed for the Trainer keyword check below.
        import inspect

        # Fail fast, before the model is loaded.
        if not 0 < subset_fraction <= 1:
            raise ValueError(
                f"subset_fraction must be in the range (0, 1], got {subset_fraction!r}"
            )

        # 1. Device setup: use a CUDA GPU when available, otherwise the CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # 2. Load tokenizer, model and the collator that batches examples.
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # 3. Load the dataset from disk and keep only the first rows of each
        #    split so a run stays short.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        train_len = self._subset_size(len(dataset_samsum_pt["train"]), subset_fraction)
        eval_len = self._subset_size(len(dataset_samsum_pt["validation"]), subset_fraction)

        train_dataset_subset = dataset_samsum_pt["train"].select(range(train_len))
        eval_dataset_subset = dataset_samsum_pt["validation"].select(range(eval_len))

        # 4. Training arguments, all taken from the configuration entity.
        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            # The entity has no separate eval batch size, so reuse the train one.
            per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            # The entity field is 'evaluation_strategy'; the argument is 'eval_strategy'.
            eval_strategy=self.config.evaluation_strategy,
            eval_steps=self.config.eval_steps,
            # NOTE(review): params.yaml reportedly sets this to 1e6, i.e. in
            # effect no intermediate checkpoints - confirm against the file.
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
        )

        # 5. Build the Trainer. Newer transformers releases deprecate the
        #    `tokenizer` keyword in favour of `processing_class`, so pass
        #    whichever name the installed Trainer accepts.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            data_collator=seq2seq_data_collator,
            train_dataset=train_dataset_subset,
            eval_dataset=eval_dataset_subset,
            **{tokenizer_kwarg: tokenizer},
        )

        # 6. Run training.
        trainer.train()

        # 7. Persist the final model and tokenizer beneath the stage directory.
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))

  from .autonotebook import tqdm as notebook_tqdm


## Creating the Pipeline 

In [None]:
# Run the model-training stage end to end. Exceptions are left to propagate
# untouched, so the notebook shows the original traceback; the former
# `except Exception as e: raise e` wrapper only re-raised and added a frame.

# 1. Initialize the configuration manager.
config = ConfigurationManager()

# 2. Get the configuration for the model-training stage.
model_trainer_config = config.get_model_trainer_config()

# 3. Initialize the model trainer component.
model_trainer = ModelTrainer(config=model_trainer_config)

# 4. Run training.
model_trainer.train()

[2025-10-28 18:52:36,821: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-28 18:52:36,825: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-28 18:52:36,829: INFO: common: created directory at: artifacts]
[2025-10-28 18:52:36,829: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
