In [1]:
import os

In [2]:
# Move the working directory up one level so project-relative paths resolve.
# NOTE: this is relative to the *current* directory, so re-running the cell
# climbs one more level each time; run it once per kernel session.
os.chdir(os.pardir)

In [3]:
%pwd

'/Users/tien/Documents/mlProjects/textSummarization/textSummarizer'

In [4]:
#3. Write the entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
  """Immutable bundle of settings for the model-training stage.

  Instances are frozen, so fields cannot be reassigned after construction.
  The annotations are not enforced at runtime; values are stored as given.
  """
  # --- locations / checkpoint ---
  root_dir: Path    # stage output directory
  data_path: Path   # location of the dataset consumed by the trainer
  model_ckpt: Path  # checkpoint identifier handed to `from_pretrained`
                    # NOTE(review): annotated as Path, but this looks like a
                    # model-hub name string in practice -- confirm
  # --- training hyper-parameters ---
  num_train_epochs: int
  warmup_steps: int
  per_device_train_batch_size: int
  weight_decay: float
  logging_steps: int
  evaluation_strategy: str
  eval_steps: int
  save_steps: float
  gradient_accumulation_steps: int

In [5]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
  """Reads the project config/params files and builds per-stage config objects.

  Both files are loaded once in ``__init__`` through ``read_yaml`` and kept on
  the instance as ``self.config`` and ``self.params``; their entries are read
  with attribute access.
  """

  def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
    """Load both YAML files and create the top-level artifacts directory.

    Args:
      config_filepath: Path to the main configuration file.
      params_filepath: Path to the hyper-parameter file.
    """
    self.config = read_yaml(config_filepath)
    self.params = read_yaml(params_filepath)

    # Every stage writes beneath this directory, so create it up front.
    create_directories([self.config.artifacts_root])

  def get_model_trainer_config(self) -> ModelTrainerConfig:
    """Build the ``ModelTrainerConfig`` for the training stage.

    Paths and the checkpoint name come from the ``model_trainer`` section of
    the config file; hyper-parameters come from the ``TrainingArguments``
    section of the params file. The stage's ``root_dir`` is created as a
    side effect.

    Returns:
      A populated, frozen ``ModelTrainerConfig``.
    """
    config = self.config.model_trainer
    params = self.params.TrainingArguments

    create_directories([config.root_dir])

    return ModelTrainerConfig(
      root_dir=config.root_dir,
      data_path=config.data_path,
      model_ckpt=config.model_ckpt,
      num_train_epochs=params.num_train_epochs,
      warmup_steps=params.warmup_steps,
      per_device_train_batch_size=params.per_device_train_batch_size,
      weight_decay=params.weight_decay,
      logging_steps=params.logging_steps,
      evaluation_strategy=params.evaluation_strategy,
      # Fixed: this previously read `params.evaluation_strategy`, which put
      # the strategy string into the step-count field.
      # Assumes the params file defines `eval_steps` -- TODO confirm.
      eval_steps=params.eval_steps,
      save_steps=params.save_steps,
      gradient_accumulation_steps=params.gradient_accumulation_steps,
    )

In [7]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2023-07-13 22:18:06: INFO: instantiator: Created a temporary directory at /var/folders/t3/h3nhhgs91gx424npw04bv5xc0000gn/T/tmp2ahs810v]
[2023-07-13 22:18:06: INFO: instantiator: Writing /var/folders/t3/h3nhhgs91gx424npw04bv5xc0000gn/T/tmp2ahs810v/_remote_module_non_scriptable.py]


In [8]:
class ModelTrainer:
  """Fine-tunes a seq2seq checkpoint and saves the model and tokenizer.

  The checkpoint name, dataset location and output directory are taken from
  the ``ModelTrainerConfig`` supplied at construction time.
  """

  def __init__(self, config: ModelTrainerConfig):
    """Store the stage configuration for use by ``train``."""
    self.config = config

  def train(self):
    """Run one training pass and write the artefacts under ``config.root_dir``.

    Loads the tokenizer and model named by ``config.model_ckpt``, reads the
    dataset at ``config.data_path`` from disk, trains with a Hugging Face
    ``Trainer``, then saves the model to ``<root_dir>/pegasus-samsum-model``
    and the tokenizer to ``<root_dir>/tokenizer``.
    """
    cfg = self.config

    # Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
      device = "cuda"
    else:
      device = "cpu"

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
    model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)
    model = model.to(device)
    collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # Dataset previously written to disk; indexed by split name below.
    splits = load_from_disk(cfg.data_path)

    # NOTE(review): the hyper-parameters below are literals. The same-named
    # fields on the config (num_train_epochs, warmup_steps, eval_steps, ...)
    # are NOT consulted here; only `root_dir` is. Wire them through once the
    # config values are confirmed to be valid.
    training_kwargs = dict(
      output_dir=cfg.root_dir,
      num_train_epochs=1,
      warmup_steps=500,
      per_device_train_batch_size=1,
      per_device_eval_batch_size=1,
      weight_decay=0.01,
      logging_steps=10,
      evaluation_strategy='steps',
      eval_steps=500,
      save_steps=1e6,
      gradient_accumulation_steps=16,
    )
    trainer_args = TrainingArguments(**training_kwargs)

    # NOTE(review): training runs on the "test" split, presumably to keep the
    # run short -- confirm this is intended before using the saved model.
    trainer = Trainer(
      model=model,
      args=trainer_args,
      tokenizer=tokenizer,
      data_collator=collator,
      train_dataset=splits["test"],
      eval_dataset=splits["validation"],
    )
    trainer.train()

    # Persist the fine-tuned weights and the tokenizer side by side.
    model_dir = os.path.join(cfg.root_dir, "pegasus-samsum-model")
    model.save_pretrained(model_dir)
    tokenizer_dir = os.path.join(cfg.root_dir, "tokenizer")
    tokenizer.save_pretrained(tokenizer_dir)

In [9]:
# Drive the training stage end to end: load configuration, build the
# trainer, and run it. Any failure is propagated unchanged to the caller.
try:
  config = ConfigurationManager()
  model_trainer_config = config.get_model_trainer_config()
  model_trainer = ModelTrainer(config=model_trainer_config)
  model_trainer.train()
except Exception:
  raise

[2023-07-13 22:18:10: INFO: common: Read config/config.yaml successfully.]
[2023-07-13 22:18:10: INFO: common: Read params.yaml successfully.]
{'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/entbappy/Branching-tutorial/raw/master/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'STATUS_FILE': 'artifacts/data_validation/status.txt', 'ALL_REQUIRED_FILES': ['train', 'test', 'validation']}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_ingestion/samsum_dataset', 'tokenizer_name': 'google/pegasus-cnn_dailymail'}, 'model_trainer': {'root_dir': 'artifacts/model_trainer', 'data_path': 'artifacts/data_transformation/samsum_dataset', 'model_ckpt': 'google/pegasus-cnn_dailymail'}, 'model_evaluation': {'root_dir': 'artifacts/model_evaluation

  0%|          | 0/51 [00:00<?, ?it/s]You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 20%|█▉        | 10/51 [10:57<47:19, 69.25s/it]

{'loss': 3.3137, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}


 39%|███▉      | 20/51 [22:17<34:07, 66.04s/it]

{'loss': 3.1, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.39}


 59%|█████▉    | 30/51 [32:58<21:28, 61.38s/it]

{'loss': 3.0839, 'learning_rate': 3e-06, 'epoch': 0.59}


 78%|███████▊  | 40/51 [43:35<11:36, 63.28s/it]

{'loss': 2.9821, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.78}


 98%|█████████▊| 50/51 [54:01<00:56, 56.63s/it]

{'loss': 3.1031, 'learning_rate': 5e-06, 'epoch': 0.98}


100%|██████████| 51/51 [55:26<00:00, 65.23s/it]


{'train_runtime': 3326.5288, 'train_samples_per_second': 0.246, 'train_steps_per_second': 0.015, 'train_loss': 3.1165119339438045, 'epoch': 1.0}


In [10]:
!pip install --upgrade accelerate

Collecting accelerate
  Using cached accelerate-0.21.0-py3-none-any.whl (244 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
