In [1]:
import os

In [4]:
%pwd

'c:\\Users\\mehak\\OneDrive\\NLP_Project\\Text-Summarizer---NLP'

In [3]:
# Move up to the project root so that relative paths (config/, params.yaml,
# artifacts/) resolve. A bare relative chdir climbs one more level every time
# the cell is re-run, so only move when the root marker is not already visible.
# NOTE(review): assumes params.yaml lives at the project root — confirm.
if not os.path.exists("params.yaml"):
    os.chdir('../')

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    The first three fields locate things on disk or name the checkpoint; the
    remaining fields are training hyperparameters. Instances are frozen, so
    attributes cannot be reassigned after construction.
    """

    # --- locations ---------------------------------------------------------
    root_dir: Path    # output directory of the training stage
    data_path: Path   # dataset location handed to the trainer
    model: Path       # checkpoint identifier/path to fine-tune

    # --- training hyperparameters -----------------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int
    learning_rate: float

In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the config and params YAML files and builds stage configurations."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the config YAML passed to ``read_yaml``.
            params_filepath: Path of the params YAML passed to ``read_yaml``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer's root directory and return its ``ModelTrainerConfig``.

        Location fields are read from the ``model_trainer`` section of the
        config file; hyperparameters are read from the ``TrainingArguments``
        section of the params file.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.TrainingArguments
        create_directories([trainer_section.root_dir])

        location_fields = ("root_dir", "data_path", "model")
        hyperparameter_fields = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
            "learning_rate",
        )

        # Same attribute reads, in the same order, as spelling out each keyword.
        field_values = {name: getattr(trainer_section, name) for name in location_fields}
        field_values.update(
            (name, getattr(training_params, name)) for name in hyperparameter_fields
        )

        return ModelTrainerConfig(**field_values)

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

  from .autonotebook import tqdm as notebook_tqdm


[2025-02-04 11:05:12,492: INFO: config: PyTorch version 2.4.1 available.]


In [9]:
from transformers import AdamW

In [None]:
#device = torch.cuda.is_available()

In [11]:
#device

False

In [12]:
class ModelTrainer:
    """Fine-tunes the seq2seq checkpoint named in the config, training only its last layers.

    Every weight is frozen except the final ``TRAINABLE_LAYERS`` layers of the
    encoder and of the decoder, which are optimised with AdamW at ``LAYER_LR``.
    """

    # Number of trailing encoder layers and trailing decoder layers left trainable.
    TRAINABLE_LAYERS = 2
    # Learning rate the custom optimizer applies to the unfrozen layers.
    LAYER_LR = 3e-5

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration; nothing is loaded until ``train``."""
        self.config = config

    def train(self):
        """Load model, tokenizer and dataset, fine-tune, then save the results.

        The dataset loaded from ``config.data_path`` must provide ``"train"``
        and ``"validation"`` splits. After training, the model is written to
        ``<root_dir>/pegasus-samsum-model`` and the tokenizer to
        ``<root_dir>/tokenizer``.

        Errors raised by the underlying libraries (missing checkpoint, missing
        dataset, invalid training arguments, ...) propagate to the caller.
        """
        cfg = self.config

        tokenizer = AutoTokenizer.from_pretrained(cfg.model)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(cfg.model)

        # Freeze everything, then re-enable gradients only for the last few
        # encoder/decoder layers. The same parameter objects are collected
        # once so that the optimizer sees exactly the set that was unfrozen.
        # NOTE: relies on the model exposing `.model.encoder.layers` and
        # `.model.decoder.layers`.
        model_pegasus.requires_grad_(False)
        trainable_params = []
        for stack in (model_pegasus.model.encoder, model_pegasus.model.decoder):
            for param in stack.layers[-self.TRAINABLE_LAYERS:].parameters():
                param.requires_grad = True
                trainable_params.append(param)

        dataset = load_from_disk(cfg.data_path)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        trainer_args = TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            # Taken from the config instead of hard-coded literals. Coerced with
            # float() because values written like `1e6` may arrive from YAML as
            # strings.
            save_steps=float(cfg.save_steps),
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            # Recorded in the arguments only: an optimizer is supplied
            # explicitly below, so LAYER_LR is the rate actually used.
            learning_rate=float(cfg.learning_rate),
        )

        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        optimizer = torch.optim.AdamW(
            trainable_params,
            lr=self.LAYER_LR,
            weight_decay=cfg.weight_decay,
        )

        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
            # (optimizer, None): Trainer builds the LR scheduler itself.
            optimizers=(optimizer, None),
        )

        trainer.train()

        # Save the fine-tuned model and its tokenizer under the stage's root dir.
        model_pegasus.save_pretrained(os.path.join(cfg.root_dir, "pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(cfg.root_dir, "tokenizer"))

In [13]:
# Run the training stage end to end: load configuration, build the trainer,
# fine-tune. No try/except here: catching an exception only to re-raise it
# adds nothing, so failures surface with their original traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-02-04 11:05:32,279: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-04 11:05:32,290: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-04 11:05:32,296: INFO: common: created directory at: artifacts]
[2025-02-04 11:05:32,299: INFO: common: created directory at: artifacts/model_trainer]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  1%|▏         | 10/690 [27:49<25:56:20, 137.32s/it]

{'loss': 3.1692, 'grad_norm': 54.798912048339844, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.04}


  3%|▎         | 20/690 [44:10<22:40:37, 121.85s/it]

{'loss': 3.0284, 'grad_norm': 45.83332824707031, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.09}


  4%|▍         | 30/690 [1:06:10<35:43:06, 194.83s/it]

{'loss': 3.1191, 'grad_norm': 46.864227294921875, 'learning_rate': 1.8e-06, 'epoch': 0.13}


  6%|▌         | 40/690 [1:39:28<35:54:49, 198.91s/it]

{'loss': 3.023, 'grad_norm': 51.96515655517578, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.17}


  7%|▋         | 50/690 [2:08:39<30:14:33, 170.11s/it]

{'loss': 3.0195, 'grad_norm': 49.2462043762207, 'learning_rate': 3e-06, 'epoch': 0.22}


  9%|▊         | 60/690 [3:26:01<31:53:23, 182.23s/it]  

{'loss': 3.0041, 'grad_norm': 45.92800521850586, 'learning_rate': 3.6e-06, 'epoch': 0.26}


 10%|█         | 70/690 [3:37:53<13:12:44, 76.72s/it] 

{'loss': 2.9773, 'grad_norm': 44.335018157958984, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.3}


 12%|█▏        | 80/690 [4:01:32<24:44:36, 146.03s/it]

{'loss': 2.9305, 'grad_norm': 44.184329986572266, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.35}


 13%|█▎        | 90/690 [4:28:19<26:42:59, 160.30s/it]

{'loss': 2.9664, 'grad_norm': 40.07671356201172, 'learning_rate': 5.4e-06, 'epoch': 0.39}


 14%|█▍        | 100/690 [4:45:57<17:11:22, 104.89s/it]

{'loss': 2.9985, 'grad_norm': 45.71270751953125, 'learning_rate': 6e-06, 'epoch': 0.43}


 16%|█▌        | 110/690 [5:07:28<24:01:41, 149.14s/it]

{'loss': 3.0005, 'grad_norm': 39.997108459472656, 'learning_rate': 6.6e-06, 'epoch': 0.48}


 17%|█▋        | 120/690 [5:24:54<15:16:29, 96.47s/it] 

{'loss': 2.9654, 'grad_norm': 42.51305389404297, 'learning_rate': 7.2e-06, 'epoch': 0.52}


 19%|█▉        | 130/690 [5:50:31<24:32:12, 157.74s/it]

{'loss': 2.9162, 'grad_norm': 45.5532112121582, 'learning_rate': 7.8e-06, 'epoch': 0.56}


 20%|██        | 140/690 [6:17:15<25:13:47, 165.14s/it]

{'loss': 3.0022, 'grad_norm': 32.12028884887695, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.61}


 22%|██▏       | 150/690 [6:42:29<17:03:10, 113.69s/it]

{'loss': 2.8539, 'grad_norm': 32.148197174072266, 'learning_rate': 9e-06, 'epoch': 0.65}


 23%|██▎       | 160/690 [6:57:30<13:01:56, 88.52s/it] 

{'loss': 2.8208, 'grad_norm': 30.895307540893555, 'learning_rate': 9.600000000000001e-06, 'epoch': 0.7}


 25%|██▍       | 170/690 [7:13:10<14:09:34, 98.03s/it]

{'loss': 2.7471, 'grad_norm': 36.40605926513672, 'learning_rate': 1.02e-05, 'epoch': 0.74}


 26%|██▌       | 180/690 [7:28:13<13:30:59, 95.41s/it]

{'loss': 2.8072, 'grad_norm': 36.155181884765625, 'learning_rate': 1.08e-05, 'epoch': 0.78}


 28%|██▊       | 190/690 [7:42:29<11:59:27, 86.34s/it]

{'loss': 2.6513, 'grad_norm': 30.261356353759766, 'learning_rate': 1.1400000000000001e-05, 'epoch': 0.83}


 29%|██▉       | 200/690 [7:56:58<12:02:30, 88.47s/it]

{'loss': 2.6736, 'grad_norm': 40.970069885253906, 'learning_rate': 1.2e-05, 'epoch': 0.87}


 30%|███       | 210/690 [8:11:39<11:58:16, 89.78s/it]

{'loss': 2.6263, 'grad_norm': 28.737869262695312, 'learning_rate': 1.26e-05, 'epoch': 0.91}


 32%|███▏      | 220/690 [8:26:21<11:52:05, 90.90s/it]

{'loss': 2.5887, 'grad_norm': 27.367626190185547, 'learning_rate': 1.32e-05, 'epoch': 0.96}


 33%|███▎      | 230/690 [8:50:09<11:42:36, 91.65s/it] 

{'loss': 2.5395, 'grad_norm': 24.992799758911133, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.0}


 35%|███▍      | 240/690 [9:01:40<8:55:16, 71.37s/it] 

{'loss': 2.5209, 'grad_norm': 33.42159652709961, 'learning_rate': 1.44e-05, 'epoch': 1.04}


 36%|███▌      | 250/690 [9:13:00<8:39:20, 70.82s/it]

{'loss': 2.4914, 'grad_norm': 23.269376754760742, 'learning_rate': 1.5e-05, 'epoch': 1.09}


 38%|███▊      | 260/690 [9:25:00<8:22:35, 70.13s/it]

{'loss': 2.5324, 'grad_norm': 26.336008071899414, 'learning_rate': 1.56e-05, 'epoch': 1.13}


 39%|███▉      | 270/690 [9:37:42<9:32:27, 81.78s/it]

{'loss': 2.4852, 'grad_norm': 24.361303329467773, 'learning_rate': 1.62e-05, 'epoch': 1.17}


 41%|████      | 280/690 [9:52:50<10:01:27, 88.02s/it]

{'loss': 2.4527, 'grad_norm': 30.39527702331543, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.22}


 42%|████▏     | 290/690 [10:08:01<10:22:45, 93.41s/it]

{'loss': 2.4027, 'grad_norm': 19.92495346069336, 'learning_rate': 1.74e-05, 'epoch': 1.26}


 43%|████▎     | 300/690 [10:22:01<8:46:07, 80.94s/it] 

{'loss': 2.5025, 'grad_norm': 19.067960739135742, 'learning_rate': 1.8e-05, 'epoch': 1.3}


 45%|████▍     | 310/690 [10:37:00<9:30:49, 90.13s/it]

{'loss': 2.3615, 'grad_norm': 18.748706817626953, 'learning_rate': 1.86e-05, 'epoch': 1.35}


 46%|████▋     | 320/690 [11:45:38<24:16:43, 236.23s/it]  

{'loss': 2.3629, 'grad_norm': 21.79363441467285, 'learning_rate': 1.9200000000000003e-05, 'epoch': 1.39}


 48%|████▊     | 330/690 [11:56:45<6:57:01, 69.50s/it]  

{'loss': 2.3715, 'grad_norm': 19.677841186523438, 'learning_rate': 1.98e-05, 'epoch': 1.43}


 49%|████▉     | 340/690 [12:08:19<6:51:39, 70.57s/it]

{'loss': 2.288, 'grad_norm': 25.769872665405273, 'learning_rate': 2.04e-05, 'epoch': 1.48}


 51%|█████     | 350/690 [12:19:42<6:44:48, 71.44s/it]

{'loss': 2.3007, 'grad_norm': 18.25079345703125, 'learning_rate': 2.1e-05, 'epoch': 1.52}


 52%|█████▏    | 360/690 [12:32:26<6:39:03, 72.56s/it]

{'loss': 2.2821, 'grad_norm': 20.840227127075195, 'learning_rate': 2.16e-05, 'epoch': 1.56}


 54%|█████▎    | 370/690 [12:44:06<6:00:37, 67.62s/it]

{'loss': 2.2682, 'grad_norm': 122.87893676757812, 'learning_rate': 2.22e-05, 'epoch': 1.61}


 55%|█████▌    | 380/690 [12:55:03<5:40:24, 65.89s/it]

{'loss': 2.2433, 'grad_norm': 19.380781173706055, 'learning_rate': 2.2800000000000002e-05, 'epoch': 1.65}


 57%|█████▋    | 390/690 [13:05:36<5:12:33, 62.51s/it]

{'loss': 2.325, 'grad_norm': 35.104103088378906, 'learning_rate': 2.3400000000000003e-05, 'epoch': 1.69}


 58%|█████▊    | 400/690 [13:16:03<5:03:26, 62.78s/it]

{'loss': 2.2856, 'grad_norm': 27.29322624206543, 'learning_rate': 2.4e-05, 'epoch': 1.74}


 59%|█████▉    | 410/690 [13:26:53<4:55:01, 63.22s/it]

{'loss': 2.2438, 'grad_norm': 15.272923469543457, 'learning_rate': 2.4599999999999998e-05, 'epoch': 1.78}


 61%|██████    | 420/690 [13:37:54<5:01:25, 66.98s/it]

{'loss': 2.1757, 'grad_norm': 13.432772636413574, 'learning_rate': 2.52e-05, 'epoch': 1.82}


 62%|██████▏   | 430/690 [13:48:41<4:29:26, 62.18s/it]

{'loss': 2.1508, 'grad_norm': 19.99643325805664, 'learning_rate': 2.58e-05, 'epoch': 1.87}


 64%|██████▍   | 440/690 [13:59:28<4:36:04, 66.26s/it]

{'loss': 2.1506, 'grad_norm': 14.815589904785156, 'learning_rate': 2.64e-05, 'epoch': 1.91}


 65%|██████▌   | 450/690 [14:09:56<4:09:29, 62.37s/it]

{'loss': 2.1381, 'grad_norm': 17.810791015625, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.95}


 67%|██████▋   | 460/690 [14:19:38<3:44:06, 58.46s/it]

{'loss': 2.1008, 'grad_norm': 13.77190113067627, 'learning_rate': 2.7600000000000003e-05, 'epoch': 2.0}


 68%|██████▊   | 470/690 [14:29:57<3:29:17, 57.08s/it]

{'loss': 2.1548, 'grad_norm': 17.559022903442383, 'learning_rate': 2.8199999999999998e-05, 'epoch': 2.04}


 70%|██████▉   | 480/690 [14:40:21<3:43:58, 63.99s/it]

{'loss': 2.1155, 'grad_norm': 14.225872039794922, 'learning_rate': 2.88e-05, 'epoch': 2.09}


 71%|███████   | 490/690 [14:51:11<3:32:26, 63.73s/it]

{'loss': 2.1069, 'grad_norm': 16.647958755493164, 'learning_rate': 2.94e-05, 'epoch': 2.13}


 72%|███████▏  | 500/690 [15:01:58<3:29:10, 66.05s/it]

{'loss': 2.1241, 'grad_norm': 17.302061080932617, 'learning_rate': 3e-05, 'epoch': 2.17}


                                                      
 72%|███████▏  | 500/690 [15:10:10<3:29:10, 66.05s/it]

{'eval_loss': 1.7819159030914307, 'eval_runtime': 492.5138, 'eval_samples_per_second': 1.661, 'eval_steps_per_second': 0.416, 'epoch': 2.17}


 74%|███████▍  | 510/690 [15:20:50<3:31:40, 70.56s/it]  

{'loss': 2.1251, 'grad_norm': 17.829133987426758, 'learning_rate': 2.8421052631578946e-05, 'epoch': 2.22}


 75%|███████▌  | 520/690 [15:31:15<2:55:26, 61.92s/it]

{'loss': 2.087, 'grad_norm': 17.286632537841797, 'learning_rate': 2.6842105263157896e-05, 'epoch': 2.26}


 77%|███████▋  | 530/690 [15:41:27<2:52:01, 64.51s/it]

{'loss': 2.0961, 'grad_norm': 14.258685111999512, 'learning_rate': 2.526315789473684e-05, 'epoch': 2.3}


 78%|███████▊  | 540/690 [15:52:07<2:30:08, 60.06s/it]

{'loss': 2.0998, 'grad_norm': 14.123178482055664, 'learning_rate': 2.368421052631579e-05, 'epoch': 2.35}


 80%|███████▉  | 550/690 [16:03:04<2:27:07, 63.06s/it]

{'loss': 2.0891, 'grad_norm': 18.787899017333984, 'learning_rate': 2.2105263157894736e-05, 'epoch': 2.39}


 81%|████████  | 560/690 [16:12:47<2:08:49, 59.46s/it]

{'loss': 1.9568, 'grad_norm': 12.423715591430664, 'learning_rate': 2.0526315789473685e-05, 'epoch': 2.43}


 83%|████████▎ | 570/690 [16:23:53<2:09:27, 64.73s/it]

{'loss': 2.0526, 'grad_norm': 24.086139678955078, 'learning_rate': 1.894736842105263e-05, 'epoch': 2.48}


 84%|████████▍ | 580/690 [16:35:08<2:02:47, 66.98s/it]

{'loss': 2.0476, 'grad_norm': 22.288957595825195, 'learning_rate': 1.736842105263158e-05, 'epoch': 2.52}


 86%|████████▌ | 590/690 [16:44:53<1:40:35, 60.36s/it]

{'loss': 1.9993, 'grad_norm': 15.788033485412598, 'learning_rate': 1.5789473684210526e-05, 'epoch': 2.56}


 87%|████████▋ | 600/690 [16:55:36<1:35:08, 63.42s/it]

{'loss': 2.1155, 'grad_norm': 15.590600967407227, 'learning_rate': 1.4210526315789473e-05, 'epoch': 2.61}


 88%|████████▊ | 610/690 [17:06:42<1:26:25, 64.82s/it]

{'loss': 2.0441, 'grad_norm': 12.765673637390137, 'learning_rate': 1.263157894736842e-05, 'epoch': 2.65}


 90%|████████▉ | 620/690 [17:17:39<1:17:19, 66.28s/it]

{'loss': 1.9814, 'grad_norm': 13.159260749816895, 'learning_rate': 1.1052631578947368e-05, 'epoch': 2.69}


 91%|█████████▏| 630/690 [17:28:18<1:06:22, 66.38s/it]

{'loss': 2.0933, 'grad_norm': 16.337514877319336, 'learning_rate': 9.473684210526315e-06, 'epoch': 2.74}


 93%|█████████▎| 640/690 [17:38:41<51:13, 61.46s/it]  

{'loss': 2.0434, 'grad_norm': 12.050564765930176, 'learning_rate': 7.894736842105263e-06, 'epoch': 2.78}


 94%|█████████▍| 650/690 [17:49:40<42:13, 63.34s/it]

{'loss': 2.0485, 'grad_norm': 13.533679962158203, 'learning_rate': 6.31578947368421e-06, 'epoch': 2.82}


 96%|█████████▌| 660/690 [18:00:09<30:30, 61.03s/it]

{'loss': 2.0756, 'grad_norm': 43.373294830322266, 'learning_rate': 4.736842105263158e-06, 'epoch': 2.87}


 97%|█████████▋| 670/690 [18:10:51<21:33, 64.69s/it]

{'loss': 2.0593, 'grad_norm': 44.632659912109375, 'learning_rate': 3.157894736842105e-06, 'epoch': 2.91}


 99%|█████████▊| 680/690 [18:21:56<10:50, 65.06s/it]

{'loss': 1.9913, 'grad_norm': 13.256025314331055, 'learning_rate': 1.5789473684210526e-06, 'epoch': 2.95}




{'loss': 1.9793, 'grad_norm': 14.96561336517334, 'learning_rate': 0.0, 'epoch': 3.0}


100%|██████████| 690/690 [18:32:25<00:00, 96.73s/it]


{'train_runtime': 66745.8018, 'train_samples_per_second': 0.662, 'train_steps_per_second': 0.01, 'train_loss': 2.425387382507324, 'epoch': 3.0}
