- Update config.yml
- Update params.yml
- Update entity
- Update the configuration manager in src config
- Update the components
- Update the pipeline
- Update main.py
- Update app.py

In [None]:
import os 
# IPython magic reporting the working directory before the move.
%pwd
# Climb three directory levels so later cells resolve paths from there.
# NOTE(review): the path is relative, so re-running this cell climbs another
# three levels; presumably the target is the project root -- verify.
os.chdir("../../../")
# Report the working directory again to confirm where the notebook now runs.
%pwd

In [None]:
# Update entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    exists. The annotations are declarative only: nothing here validates or
    converts the values passed to the constructor.
    """

    # --- locations and checkpoint -------------------------------------
    root_dir: Path
    data_path: Path
    # NOTE(review): handed to `from_pretrained` by ModelTrainer; may be a
    # model identifier rather than a filesystem path -- confirm.
    model_ckpt: Path

    # --- hyperparameters (forwarded to TrainingArguments by ModelTrainer)
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: int
    gradient_accumulation_steps: int

In [None]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories


# Update the configuration manager in src config
class ConfigurationManager:
    """Reads the config and params YAML files and builds stage configurations."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifact root directory.

        Args:
            config_filepath: Location of the configuration YAML; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Location of the parameters YAML; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifact_root = self.config.artifact_root
        create_directories([artifact_root])

    def get_trainer_config(self) -> TrainerConfig:
        """Assemble the ``TrainerConfig`` for the training stage.

        Paths and the checkpoint come from the ``trainer`` section of the
        config file; hyperparameters come from the ``TrainingArgs`` section
        of the params file. The trainer's ``root_dir`` is passed to
        ``create_directories`` before the config object is built.

        Returns:
            A ``TrainerConfig`` populated from the two loaded files.
        """
        trainer_section = self.config.trainer
        training_args = self.params.TrainingArgs

        create_directories([trainer_section.root_dir])

        # Hyperparameters are copied one-to-one, under the same names, from
        # the params file into the TrainerConfig.
        hyperparameter_names = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        return TrainerConfig(
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            **{name: getattr(training_args, name) for name in hyperparameter_names},
        )

In [None]:
# Build the manager from its default YAML paths and keep the resulting
# TrainerConfig in `x` for the cells that follow.
x = ConfigurationManager().get_trainer_config()

In [None]:
import torch
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, 
                            Trainer, TrainingArguments, DataCollatorForSeq2Seq)
from datasets import load_dataset, load_from_disk

In [None]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint with ``Trainer`` and saves the result."""

    def __init__(self, config: TrainerConfig):
        """Load the tokenizer and model named by ``config.model_ckpt``.

        Args:
            config: Training settings; paths and hyperparameters are read
                from it both here and in :meth:`train`.
        """
        self.config = config

        # Use the GPU when torch reports one, otherwise the CPU.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        checkpoint = self.config.model_ckpt
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        self.model = seq2seq_model.to(self.device)
        self.dataCollator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
        )

    def _build_training_arguments(self):
        """Translate the stored config into a ``TrainingArguments`` object."""
        cfg = self.config
        # NOTE(review): recent transformers releases rename
        # `evaluation_strategy` to `eval_strategy` -- confirm against the
        # version this project pins.
        return TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            save_steps=cfg.save_steps,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        )

    def _save_artifacts(self):
        """Write the model and tokenizer into sub-folders of ``root_dir``."""
        output_root = self.config.root_dir
        model_dir = os.path.join(output_root, "pegasus-samsum-model")
        tokenizer_dir = os.path.join(output_root, "pegasus-samsum-tokenizer")
        self.model.save_pretrained(model_dir)
        self.tokenizer.save_pretrained(tokenizer_dir)

    def train(self):
        """Run training on the dataset at ``config.data_path`` and save outputs.

        The dataset loaded from disk must provide ``"train"`` and
        ``"validation"`` splits. Nothing is returned; the model and tokenizer
        are written under ``config.root_dir`` once training finishes.
        """
        dataset = load_from_disk(self.config.data_path)
        trainer_args = self._build_training_arguments()

        # NOTE(review): newer transformers releases may expect
        # `processing_class` instead of `tokenizer` here -- confirm.
        trainer = Trainer(
            model=self.model,
            args=trainer_args,
            tokenizer=self.tokenizer,
            data_collator=self.dataCollator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
        )
        trainer.train()

        self._save_artifacts()

In [None]:
# Train with the TrainerConfig currently held in `x`. `train()` has no return
# statement, so `x` is None once this cell finishes.
x = ModelTrainer(x).train()

In [None]:
# Update the pipeline
# Run the training stage end to end: load the configuration, build the
# trainer, then train and save. Exceptions are deliberately left to
# propagate: a handler that only re-raises what it caught adds nothing, and
# `raise e` would merely add a redundant frame to the traceback.
config = ConfigurationManager()
data_config = config.get_trainer_config()
model_trainer = ModelTrainer(config=data_config)
model_trainer.train()