In [2]:
import os

In [3]:
%pwd

'd:\\Text Summarizer\\Text-Summarizer-NLP\\research'

In [4]:
# Move from the notebook folder (research/) up to the project root so that the
# relative paths used by later cells resolve. The guard keeps the cell
# idempotent: re-running it does not climb one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [5]:
%pwd

'd:\\Text Summarizer\\Text-Summarizer-NLP'

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``:
    the first three fields come from the ``model_trainer`` section of the
    config file, the remaining ones from the ``TrainingArguments`` section of
    the params file.
    """
    # Directory created for this stage; used as the training output directory
    # and as the parent folder for the saved model and tokenizer.
    root_dir: Path
    # Location handed to ``load_from_disk`` to obtain the dataset.
    data_path: Path
    # Identifier handed to ``from_pretrained`` for both tokenizer and model.
    # NOTE(review): the run log shows a hub id ("google/pegasus-cnn_dailymail"),
    # so this may be a plain string rather than a filesystem path - confirm.
    model_ckpt: Path
    # Hyperparameters copied one-to-one from the params file.
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [8]:
from TextSummarizer.constants import *
from TextSummarizer.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Reads the project YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the hyperparameter YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the configuration for the model-training stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``ModelTrainerConfig`` combining the ``model_trainer`` section of
            the config file with the ``TrainingArguments`` section of the
            params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_ckpt=config.model_ckpt,
            num_train_epochs=params.num_train_epochs,
            warmup_steps=params.warmup_steps,
            per_device_train_batch_size=params.per_device_train_batch_size,
            weight_decay=params.weight_decay,
            logging_steps=params.logging_steps,
            evaluation_strategy=params.evaluation_strategy,
            # Read the step count itself; the strategy name belongs only in
            # ``evaluation_strategy`` above.
            eval_steps=params.eval_steps,
            save_steps=params.save_steps,
            gradient_accumulation_steps=params.gradient_accumulation_steps,
        )

        return model_trainer_config

In [10]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2024-04-07 18:31:47,681: INFO: config: PyTorch version 2.2.2 available.]


In [11]:
class ModelTrainer:
    """Fine-tunes a pretrained seq2seq checkpoint and saves the model and tokenizer."""

    def __init__(self, config: ModelTrainerConfig):
        """Store the stage configuration for later use by ``train``."""
        self.config = config

    def train(self):
        """Run one fine-tuning pass and write the model and tokenizer to disk.

        Loads the tokenizer and model from ``config.model_ckpt``, the dataset
        from ``config.data_path``, trains on the ``"train"`` split while
        evaluating on the ``"validation"`` split, then saves both artifacts
        under ``config.root_dir``.
        """
        cfg = self.config

        # Use the GPU when torch can see one, otherwise fall back to the CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)
        model_pegasus = model_pegasus.to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        dataset_samsum_pt = load_from_disk(cfg.data_path)

        # NOTE: these hyperparameters are literals; only ``root_dir`` is taken
        # from the configuration object here.
        training_kwargs = {
            "output_dir": cfg.root_dir,
            "num_train_epochs": 1,
            "warmup_steps": 500,
            "per_device_train_batch_size": 1,
            "per_device_eval_batch_size": 1,
            "weight_decay": 0.01,
            "logging_steps": 10,
            "evaluation_strategy": 'steps',
            "eval_steps": 500,
            "save_steps": 1e6,
            "gradient_accumulation_steps": 16,
        }
        trainer_args = TrainingArguments(**training_kwargs)

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )
        trainer.train()

        # Persist the fine-tuned weights and the tokenizer side by side.
        model_dir = os.path.join(cfg.root_dir, "pegasus-samsum-model")
        tokenizer_dir = os.path.join(cfg.root_dir, "tokenizer")
        model_pegasus.save_pretrained(model_dir)
        tokenizer.save_pretrained(tokenizer_dir)

In [12]:
# Run the model-training stage: load the configuration, build the trainer, train.
# Exceptions propagate unchanged, so the notebook shows the original traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name so the config object stays available.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-04-07 18:31:53,892: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-07 18:31:53,897: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-07 18:31:53,900: INFO: common: created directory at: artifacts]
[2024-04-07 18:31:53,902: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  1%|          | 10/920 [23:34<37:25:08, 148.03s/it]

{'loss': 3.1995, 'grad_norm': 20.154674530029297, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  2%|▏         | 20/920 [36:31<18:14:47, 72.99s/it] 

{'loss': 3.2262, 'grad_norm': 9.674358367919922, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  3%|▎         | 30/920 [46:32<15:00:30, 60.71s/it]

{'loss': 2.9225, 'grad_norm': 11.289918899536133, 'learning_rate': 3e-06, 'epoch': 0.03}


  4%|▍         | 40/920 [58:07<14:28:32, 59.22s/it]

{'loss': 2.8533, 'grad_norm': 14.776792526245117, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}


  5%|▌         | 50/920 [1:08:41<14:37:59, 60.55s/it]

{'loss': 2.6914, 'grad_norm': 16.207897186279297, 'learning_rate': 5e-06, 'epoch': 0.05}


  7%|▋         | 60/920 [1:20:12<16:56:45, 70.94s/it]

{'loss': 2.7283, 'grad_norm': 10.838278770446777, 'learning_rate': 6e-06, 'epoch': 0.07}


  8%|▊         | 70/920 [1:30:25<14:48:51, 62.74s/it]

{'loss': 2.6243, 'grad_norm': 10.833501815795898, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}


  9%|▊         | 80/920 [1:40:50<13:31:19, 57.95s/it]

{'loss': 2.4212, 'grad_norm': 12.532299041748047, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}


 10%|▉         | 90/920 [1:50:21<13:18:30, 57.72s/it]

{'loss': 2.4894, 'grad_norm': 7.339000225067139, 'learning_rate': 9e-06, 'epoch': 0.1}


 11%|█         | 100/920 [2:00:13<14:27:14, 63.46s/it]

{'loss': 2.4589, 'grad_norm': 6.9647064208984375, 'learning_rate': 1e-05, 'epoch': 0.11}


 12%|█▏        | 110/920 [2:11:32<13:56:04, 61.93s/it]

{'loss': 2.2184, 'grad_norm': 54.506431579589844, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}


 13%|█▎        | 120/920 [2:21:21<12:53:44, 58.03s/it]

{'loss': 2.1508, 'grad_norm': 6.699660778045654, 'learning_rate': 1.2e-05, 'epoch': 0.13}


 14%|█▍        | 130/920 [2:31:53<14:04:43, 64.16s/it]

{'loss': 2.1534, 'grad_norm': 6.4138031005859375, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}


 15%|█▌        | 140/920 [2:41:59<12:57:51, 59.84s/it]

{'loss': 2.0988, 'grad_norm': 8.225994110107422, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.15}


 16%|█▋        | 150/920 [2:52:09<12:41:19, 59.32s/it]

{'loss': 2.0106, 'grad_norm': 7.2247514724731445, 'learning_rate': 1.5e-05, 'epoch': 0.16}


 17%|█▋        | 160/920 [3:02:37<14:21:55, 68.05s/it]

{'loss': 1.9727, 'grad_norm': 14.101868629455566, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.17}


 18%|█▊        | 170/920 [3:13:42<14:04:55, 67.59s/it]

{'loss': 2.0016, 'grad_norm': 10.409403800964355, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.18}


 20%|█▉        | 180/920 [3:24:17<13:57:45, 67.93s/it]

{'loss': 2.0088, 'grad_norm': 7.399185657501221, 'learning_rate': 1.8e-05, 'epoch': 0.2}


 21%|██        | 190/920 [3:35:05<13:18:07, 65.60s/it]

{'loss': 1.8793, 'grad_norm': 4.903065204620361, 'learning_rate': 1.9e-05, 'epoch': 0.21}


 22%|██▏       | 200/920 [3:45:37<12:23:23, 61.95s/it]

{'loss': 1.9241, 'grad_norm': 5.045696258544922, 'learning_rate': 2e-05, 'epoch': 0.22}


 23%|██▎       | 210/920 [3:56:30<11:49:07, 59.93s/it]

{'loss': 1.9194, 'grad_norm': 13.5042085647583, 'learning_rate': 2.1e-05, 'epoch': 0.23}


 24%|██▍       | 220/920 [4:07:03<12:27:16, 64.05s/it]

{'loss': 1.7808, 'grad_norm': 5.980283260345459, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.24}


 25%|██▌       | 230/920 [4:17:30<12:34:31, 65.61s/it]

{'loss': 1.8758, 'grad_norm': 3.7059009075164795, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.25}


 26%|██▌       | 240/920 [4:27:58<11:14:36, 59.52s/it]

{'loss': 1.7891, 'grad_norm': 4.282290935516357, 'learning_rate': 2.4e-05, 'epoch': 0.26}


 27%|██▋       | 250/920 [4:38:23<11:26:49, 61.51s/it]

{'loss': 1.8703, 'grad_norm': 13.948225975036621, 'learning_rate': 2.5e-05, 'epoch': 0.27}


 28%|██▊       | 260/920 [4:48:38<11:35:33, 63.23s/it]

{'loss': 1.778, 'grad_norm': 5.277416229248047, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.28}


 29%|██▉       | 270/920 [4:58:52<10:57:37, 60.70s/it]

{'loss': 1.721, 'grad_norm': 6.2583208084106445, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.29}


 30%|███       | 280/920 [5:08:58<11:14:53, 63.27s/it]

{'loss': 1.7566, 'grad_norm': 5.574337959289551, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.3}


 32%|███▏      | 290/920 [5:19:51<11:26:10, 65.35s/it]

{'loss': 1.8628, 'grad_norm': 3.8642096519470215, 'learning_rate': 2.9e-05, 'epoch': 0.31}


 33%|███▎      | 300/920 [5:29:58<10:55:37, 63.45s/it]

{'loss': 1.6846, 'grad_norm': 5.25118350982666, 'learning_rate': 3e-05, 'epoch': 0.33}


 34%|███▎      | 310/920 [5:40:51<11:41:31, 69.00s/it]

{'loss': 1.8514, 'grad_norm': 4.676946640014648, 'learning_rate': 3.1e-05, 'epoch': 0.34}


 35%|███▍      | 320/920 [5:51:10<10:43:49, 64.38s/it]

{'loss': 1.8889, 'grad_norm': 8.73365592956543, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.35}


 36%|███▌      | 330/920 [6:01:32<9:47:43, 59.77s/it] 

{'loss': 1.8081, 'grad_norm': 11.167832374572754, 'learning_rate': 3.3e-05, 'epoch': 0.36}


 37%|███▋      | 340/920 [6:11:49<9:57:08, 61.77s/it] 

{'loss': 1.7291, 'grad_norm': 3.853654384613037, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.37}


 38%|███▊      | 350/920 [6:23:23<10:47:16, 68.13s/it]

{'loss': 1.724, 'grad_norm': 3.6391093730926514, 'learning_rate': 3.5e-05, 'epoch': 0.38}


 39%|███▉      | 360/920 [6:34:22<9:29:50, 61.05s/it] 

{'loss': 1.6644, 'grad_norm': 4.258339881896973, 'learning_rate': 3.6e-05, 'epoch': 0.39}


 40%|████      | 370/920 [6:44:20<8:56:38, 58.54s/it]

{'loss': 1.6922, 'grad_norm': 5.415938377380371, 'learning_rate': 3.7e-05, 'epoch': 0.4}


 41%|████▏     | 380/920 [6:54:52<9:13:19, 61.48s/it] 

{'loss': 1.72, 'grad_norm': 3.7504234313964844, 'learning_rate': 3.8e-05, 'epoch': 0.41}


 42%|████▏     | 390/920 [7:06:18<9:11:55, 62.48s/it] 

{'loss': 1.7028, 'grad_norm': 3.528146982192993, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.42}


 43%|████▎     | 400/920 [7:16:53<8:43:06, 60.36s/it]

{'loss': 1.7136, 'grad_norm': 5.69550895690918, 'learning_rate': 4e-05, 'epoch': 0.43}


 45%|████▍     | 410/920 [7:27:48<10:02:06, 70.84s/it]

{'loss': 1.7222, 'grad_norm': 6.137825965881348, 'learning_rate': 4.1e-05, 'epoch': 0.45}


 46%|████▌     | 420/920 [7:38:26<8:40:57, 62.51s/it] 

{'loss': 1.6425, 'grad_norm': 6.684006690979004, 'learning_rate': 4.2e-05, 'epoch': 0.46}


 47%|████▋     | 430/920 [7:48:49<8:26:58, 62.08s/it]

{'loss': 1.7803, 'grad_norm': 6.388526916503906, 'learning_rate': 4.3e-05, 'epoch': 0.47}


 48%|████▊     | 440/920 [7:59:59<8:32:18, 64.04s/it] 

{'loss': 1.7315, 'grad_norm': 5.907268047332764, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.48}


 49%|████▉     | 450/920 [8:09:52<7:50:17, 60.04s/it]

{'loss': 1.6482, 'grad_norm': 7.0299553871154785, 'learning_rate': 4.5e-05, 'epoch': 0.49}


 50%|█████     | 460/920 [8:20:13<8:25:38, 65.95s/it]

{'loss': 1.705, 'grad_norm': 3.692817211151123, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.5}


 51%|█████     | 470/920 [8:30:17<7:24:57, 59.33s/it]

{'loss': 1.7178, 'grad_norm': 4.825588226318359, 'learning_rate': 4.7e-05, 'epoch': 0.51}


 52%|█████▏    | 480/920 [8:41:23<7:51:44, 64.33s/it]

{'loss': 1.6305, 'grad_norm': 5.463203430175781, 'learning_rate': 4.8e-05, 'epoch': 0.52}


 53%|█████▎    | 490/920 [8:51:29<7:42:35, 64.55s/it]

{'loss': 1.6501, 'grad_norm': 4.539587020874023, 'learning_rate': 4.9e-05, 'epoch': 0.53}


 54%|█████▍    | 500/920 [9:01:57<7:18:09, 62.59s/it]

{'loss': 1.656, 'grad_norm': 4.157235622406006, 'learning_rate': 5e-05, 'epoch': 0.54}


                                                     
 54%|█████▍    | 500/920 [9:14:52<7:18:09, 62.59s/it]

{'eval_loss': 1.486262321472168, 'eval_runtime': 774.428, 'eval_samples_per_second': 1.056, 'eval_steps_per_second': 1.056, 'epoch': 0.54}


 55%|█████▌    | 510/920 [9:26:52<9:32:39, 83.80s/it]  

{'loss': 1.6997, 'grad_norm': 4.323009967803955, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.55}


 57%|█████▋    | 520/920 [9:37:02<7:10:47, 64.62s/it]

{'loss': 1.671, 'grad_norm': 3.6052086353302, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.56}


 58%|█████▊    | 530/920 [9:47:28<6:30:04, 60.01s/it]

{'loss': 1.6912, 'grad_norm': 4.302635192871094, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.58}


 59%|█████▊    | 540/920 [9:58:02<6:34:29, 62.29s/it]

{'loss': 1.5904, 'grad_norm': 4.070862293243408, 'learning_rate': 4.523809523809524e-05, 'epoch': 0.59}


 60%|█████▉    | 550/920 [10:08:33<6:40:57, 65.02s/it]

{'loss': 1.7086, 'grad_norm': 3.7986061573028564, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.6}


 61%|██████    | 560/920 [10:19:03<6:19:34, 63.26s/it]

{'loss': 1.7216, 'grad_norm': 8.71263313293457, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.61}


 62%|██████▏   | 570/920 [10:29:35<6:00:11, 61.75s/it]

{'loss': 1.6956, 'grad_norm': 5.589451789855957, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.62}


 63%|██████▎   | 580/920 [10:40:49<6:39:30, 70.50s/it]

{'loss': 1.6256, 'grad_norm': 3.5073533058166504, 'learning_rate': 4.047619047619048e-05, 'epoch': 0.63}


 64%|██████▍   | 590/920 [10:51:29<5:44:08, 62.57s/it]

{'loss': 1.5481, 'grad_norm': 3.2693939208984375, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.64}


 65%|██████▌   | 600/920 [11:01:51<5:46:37, 64.99s/it]

{'loss': 1.6658, 'grad_norm': 3.7586865425109863, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.65}


 66%|██████▋   | 610/920 [11:12:02<5:10:13, 60.04s/it]

{'loss': 1.568, 'grad_norm': 3.892430067062378, 'learning_rate': 3.690476190476191e-05, 'epoch': 0.66}


 67%|██████▋   | 620/920 [11:22:35<5:14:34, 62.92s/it]

{'loss': 1.6635, 'grad_norm': 3.379603862762451, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.67}


 68%|██████▊   | 630/920 [11:33:37<5:11:24, 64.43s/it]

{'loss': 1.6288, 'grad_norm': 3.6838059425354004, 'learning_rate': 3.4523809523809526e-05, 'epoch': 0.68}


 70%|██████▉   | 640/920 [11:43:32<4:36:54, 59.34s/it]

{'loss': 1.6211, 'grad_norm': 3.537419319152832, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.7}


 71%|███████   | 650/920 [11:54:11<4:47:34, 63.91s/it]

{'loss': 1.5024, 'grad_norm': 3.156414270401001, 'learning_rate': 3.2142857142857144e-05, 'epoch': 0.71}


 72%|███████▏  | 660/920 [12:03:44<4:04:17, 56.37s/it]

{'loss': 1.5731, 'grad_norm': 2.809880495071411, 'learning_rate': 3.095238095238095e-05, 'epoch': 0.72}


 73%|███████▎  | 670/920 [12:15:00<4:28:13, 64.37s/it]

{'loss': 1.5763, 'grad_norm': 3.4254395961761475, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.73}


 74%|███████▍  | 680/920 [12:26:24<4:28:58, 67.24s/it]

{'loss': 1.6017, 'grad_norm': 4.391582012176514, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.74}


 75%|███████▌  | 690/920 [12:36:13<3:33:48, 55.78s/it]

{'loss': 1.5539, 'grad_norm': 3.7878761291503906, 'learning_rate': 2.7380952380952383e-05, 'epoch': 0.75}


 76%|███████▌  | 700/920 [12:46:35<3:59:04, 65.20s/it]

{'loss': 1.6379, 'grad_norm': 3.5953280925750732, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.76}


 77%|███████▋  | 710/920 [12:58:31<3:36:56, 61.98s/it]

{'loss': 1.6035, 'grad_norm': 3.6266727447509766, 'learning_rate': 2.5e-05, 'epoch': 0.77}


 78%|███████▊  | 720/920 [13:09:53<4:01:24, 72.42s/it]

{'loss': 1.5657, 'grad_norm': 3.198047637939453, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.78}


 79%|███████▉  | 730/920 [13:20:14<3:21:13, 63.55s/it]

{'loss': 1.5386, 'grad_norm': 3.3967933654785156, 'learning_rate': 2.261904761904762e-05, 'epoch': 0.79}


 80%|████████  | 740/920 [13:31:25<3:10:14, 63.41s/it]

{'loss': 1.6786, 'grad_norm': 4.703410625457764, 'learning_rate': 2.1428571428571428e-05, 'epoch': 0.8}


 82%|████████▏ | 750/920 [13:41:18<2:53:55, 61.39s/it]

{'loss': 1.5703, 'grad_norm': 3.620673179626465, 'learning_rate': 2.023809523809524e-05, 'epoch': 0.81}


 83%|████████▎ | 760/920 [13:51:33<2:34:58, 58.11s/it]

{'loss': 1.6089, 'grad_norm': 19.45999526977539, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.83}


 84%|████████▎ | 770/920 [14:01:38<2:32:23, 60.96s/it]

{'loss': 1.5515, 'grad_norm': 3.368812084197998, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.84}


 85%|████████▍ | 780/920 [14:12:05<2:20:28, 60.20s/it]

{'loss': 1.5696, 'grad_norm': 3.889497756958008, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.85}


 86%|████████▌ | 790/920 [14:22:53<2:21:05, 65.12s/it]

{'loss': 1.5772, 'grad_norm': 4.26412296295166, 'learning_rate': 1.5476190476190476e-05, 'epoch': 0.86}


 87%|████████▋ | 800/920 [14:33:10<2:02:46, 61.39s/it]

{'loss': 1.6628, 'grad_norm': 3.489530324935913, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.87}


 88%|████████▊ | 810/920 [14:44:25<1:52:50, 61.55s/it]

{'loss': 1.5535, 'grad_norm': 2.585451126098633, 'learning_rate': 1.3095238095238096e-05, 'epoch': 0.88}


 89%|████████▉ | 820/920 [14:54:25<1:42:54, 61.75s/it]

{'loss': 1.6034, 'grad_norm': 4.5574188232421875, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.89}


 90%|█████████ | 830/920 [15:04:58<1:40:21, 66.91s/it]

{'loss': 1.6841, 'grad_norm': 3.7561933994293213, 'learning_rate': 1.0714285714285714e-05, 'epoch': 0.9}


 91%|█████████▏| 840/920 [15:15:39<1:20:49, 60.62s/it]

{'loss': 1.5413, 'grad_norm': 4.230932235717773, 'learning_rate': 9.523809523809523e-06, 'epoch': 0.91}


 92%|█████████▏| 850/920 [15:30:08<1:57:43, 100.91s/it]

{'loss': 1.5523, 'grad_norm': 6.4766364097595215, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.92}


 93%|█████████▎| 860/920 [15:44:42<1:51:50, 111.85s/it]

{'loss': 1.5592, 'grad_norm': 3.2418441772460938, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.93}


 95%|█████████▍| 870/920 [16:05:32<1:38:18, 117.97s/it]

{'loss': 1.5835, 'grad_norm': 4.349340438842773, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.94}


 96%|█████████▌| 880/920 [16:19:02<53:16, 79.90s/it]   

{'loss': 1.54, 'grad_norm': 3.3458216190338135, 'learning_rate': 4.7619047619047615e-06, 'epoch': 0.96}


 97%|█████████▋| 890/920 [16:30:52<36:37, 73.24s/it]

{'loss': 1.5387, 'grad_norm': 7.7944793701171875, 'learning_rate': 3.5714285714285714e-06, 'epoch': 0.97}


 98%|█████████▊| 900/920 [16:44:17<22:32, 67.64s/it] 

{'loss': 1.6044, 'grad_norm': 3.7522785663604736, 'learning_rate': 2.3809523809523808e-06, 'epoch': 0.98}


 99%|█████████▉| 910/920 [17:00:01<15:47, 94.71s/it] 

{'loss': 1.5198, 'grad_norm': 4.371145725250244, 'learning_rate': 1.1904761904761904e-06, 'epoch': 0.99}


100%|██████████| 920/920 [17:11:26<00:00, 67.27s/it]
Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


{'loss': 1.6044, 'grad_norm': 3.682349443435669, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 61886.2252, 'train_samples_per_second': 0.238, 'train_steps_per_second': 0.015, 'train_loss': 1.823976505321005, 'epoch': 1.0}
