- Update config.yaml
- Update parameters.yaml
- Update entity
- Update the configuration manager in src config
- Update the components
- Update the pipeline
- Update main.py
- Update app.py

In [1]:
import os 
# Show the directory the notebook kernel starts in.
%pwd
# Climb three directory levels; the second %pwd confirms the new working
# directory. NOTE(review): presumably this lands on the project root so the
# relative config paths and package imports in later cells resolve -- confirm.
os.chdir("../../../")
%pwd

'd:\\Project\\Text-Summarizer\\textSummarizer'

In [2]:
# Update entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvalConfig:
    """Immutable settings consumed by the model-evaluation stage.

    NOTE(review): the fields are annotated as ``Path``, but
    ``ConfigurationManager.get_model_eval_config`` passes the values read from
    the YAML configuration through without converting them -- confirm whether
    callers rely on real ``Path`` objects.
    """

    # Directory for this stage's outputs; created by the configuration manager.
    root_dir: Path
    # Location of the saved dataset, passed to ``load_from_disk``.
    data_path: Path
    # Location of the model, passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    model_path: Path
    # Location of the tokenizer, passed to ``AutoTokenizer.from_pretrained``.
    tokenizer_path: Path
    # Destination of the CSV file that receives the ROUGE scores.
    metric_file_name: Path

In [3]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories


# Update the configuration manager in src config
class ConfigurationManager:
    """Read the project's YAML settings and build per-stage config objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifact root exists.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifact_root = self.config.artifact_root
        create_directories([artifact_root])

    def get_model_eval_config(self) -> ModelEvalConfig:
        """Return the settings of the ``model_evaluation`` config section.

        The section's ``root_dir`` is created (if needed) before the
        ``ModelEvalConfig`` is assembled.
        """
        section = self.config.model_evaluation
        create_directories([section.root_dir])

        # Copy each setting over under the same name, in declaration order.
        field_names = (
            "root_dir",
            "data_path",
            "model_path",
            "tokenizer_path",
            "metric_file_name",
        )
        settings = {name: getattr(section, name) for name in field_names}
        return ModelEvalConfig(**settings)

In [4]:
# Load the YAML configuration (the constructor also creates the artifact root).
x = ConfigurationManager()
# Rebind ``x`` to the model-evaluation settings; a later cell passes this
# same ``x`` to ``ModelEvaluation``.
x = x.get_model_eval_config()

[2025-04-07 13:47:48,863 - INFO - (common)]: YAML file config\config.yaml loaded successfully.
[2025-04-07 13:47:48,865 - INFO - (common)]: YAML file parameters.yaml loaded successfully.
[2025-04-07 13:47:48,866 - INFO - (common)]: Directory artifacts created or already exists.
[2025-04-07 13:47:48,867 - INFO - (common)]: Directory artifacts/model_evaluation created or already exists.


In [5]:
import torch
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, 
                            Trainer, TrainingArguments, DataCollatorForSeq2Seq)
from tqdm import tqdm
from datasets import load_dataset, load_from_disk
from evaluate import load
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-07 13:47:53,079 - INFO - (config)]: PyTorch version 2.4.1+cu124 available.


In [None]:
class ModelEvaluation:
    """Evaluate a seq2seq summarization model with ROUGE.

    The tokenizer and model are loaded once at construction time.
    ``evaluate`` then generates summaries for a small slice of the test split
    and writes the resulting scores to ``config.metric_file_name``.
    """

    def __init__(self, config: "ModelEvalConfig"):
        """Load the tokenizer and model named in ``config`` and pick a device.

        Args:
            config: Evaluation settings. ``tokenizer_path`` and ``model_path``
                are handed to the corresponding ``from_pretrained`` loaders.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(config.model_path)
        # Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield successive ``batch_size``-long slices of ``list_of_elements``.

        Splitting the data this way lets a whole batch be processed at once.
        The final slice is shorter when the length is not a multiple of
        ``batch_size``.
        """
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                batch_size=2,
                                column_text="article",
                                column_summary="highlights"):
        """Generate a summary for every row of ``dataset`` and score them all.

        Args:
            dataset: Object indexable by column name that returns a sliceable
                sequence of strings for each column.
            metric: Object exposing ``compute(predictions=..., references=...)``.
            model: Model exposing ``generate``; its inputs are moved to
                ``self.device``.
            tokenizer: Callable that encodes a batch of texts and offers
                ``decode`` for the generated token ids.
            batch_size: Number of texts summarized per ``generate`` call.
            column_text: Name of the column holding the texts to summarize.
            column_summary: Name of the column holding reference summaries.

        Returns:
            Whatever ``metric.compute`` returns for the collected predictions
            and references.
        """
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        all_predictions = []
        all_references = []

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                            padding="max_length", return_tensors="pt")

            summaries = model.generate(input_ids=inputs["input_ids"].to(self.device),
                            attention_mask=inputs["attention_mask"].to(self.device),
                            length_penalty=0.8, num_beams=8, max_length=128)

            # PEGASUS checkpoints mark line breaks in generated text with the
            # literal token "<n>"; turn it into a space before scoring.
            # (Replacing the empty string "" instead would insert a space
            # between every single character and wreck the ROUGE scores.)
            decoded_summaries = [
                tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=True).replace("<n>", " ")
                for s in summaries
            ]

            all_predictions.extend(decoded_summaries)
            all_references.extend(target_batch)

        # Score every prediction against its reference in a single call.
        return metric.compute(predictions=all_predictions, references=all_references)

    def evaluate(self):
        """Score the first ten test examples with ROUGE and save them as CSV.

        The dataset is read from ``config.data_path`` and the scores
        (``rouge1``, ``rouge2``, ``rougeL``, ``rougeLsum``) are written as a
        single row to ``config.metric_file_name``.
        """
        tokenizer = self.tokenizer
        model = self.model.to(self.device)

        # Load the dataset
        dataset_samsum = load_from_disk(self.config.data_path)

        # Load the metric
        rouge_metric = load("rouge")

        score = self.calculate_metric_on_test_ds(dataset_samsum['test'][0:10], rouge_metric,
                                    model, tokenizer,
                                    batch_size=2,
                                    column_text="dialogue",
                                    column_summary="summary")

        rouge_scores = {key: score[key] for key in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}

        # ``index=False`` keeps the "pegasus" row label out of the CSV file.
        df = pd.DataFrame(rouge_scores, index=["pegasus"])
        df.to_csv(self.config.metric_file_name, index=False)

In [7]:
# ``x`` still holds the ModelEvalConfig built earlier; constructing the
# evaluator loads the tokenizer and the model.
x = ModelEvaluation(x)
# ``evaluate`` has no return statement, so ``x`` is ``None`` afterwards; the
# scores are written to the configured metric file instead.
x = x.evaluate()

100%|██████████| 5/5 [01:31<00:00, 18.39s/it]

[2025-04-07 13:49:34,771 - INFO - (rouge_scorer)]: Using default tokenizer.





In [None]:
# Update the pipeline
try:
    # Build the evaluation settings from the YAML configuration, then load
    # the model and run the ROUGE evaluation.
    config = ConfigurationManager()
    data_config = config.get_model_eval_config()
    model_evaluator = ModelEvaluation(config=data_config)
    model_evaluator.evaluate()
except Exception:
    # A bare ``raise`` re-raises the active exception as-is, without the
    # extra traceback entry that ``raise e`` would add.
    raise