In [1]:
import os
# Step one level up from the notebook's directory so that the relative paths
# used further down (e.g. config/config.yaml) resolve from the project root.
# NOTE(review): this is relative to the *current* working directory, so
# re-running this cell without restarting the kernel moves up one more level.
os.chdir("../")
# Display the resulting working directory as a sanity check.
%pwd

'/home/migue/data-science/portfolio/text-summarizer'

## Entity

In [2]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable set of locations used by the model-evaluation stage.

    NOTE(review): ConfigurationManager forwards the values read from the YAML
    config unchanged, so at runtime these may be plain strings rather than
    ``Path`` objects -- confirm before relying on ``Path`` methods.
    """

    # Working directory of the evaluation stage (created by ConfigurationManager).
    root_dir: Path
    # Dataset saved on disk; opened with ``load_from_disk`` and its "test" split is scored.
    data_path: Path
    # Location passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    model_path: Path
    # Location passed to ``AutoTokenizer.from_pretrained``.
    tokenizer_path: Path
    # Destination of the CSV file holding the ROUGE scores.
    metric_file_name: Path

In [3]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the configuration of the model-evaluation stage.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            ModelEvaluationConfig: Values copied from the ``model_evaluation``
            section of the loaded configuration.
        """
        section = self.config.model_evaluation
        create_directories([section.root_dir])

        # Same fields, in the same order, as declared on ModelEvaluationConfig.
        field_names = (
            "root_dir",
            "data_path",
            "model_path",
            "tokenizer_path",
            "metric_file_name",
        )
        values = {name: getattr(section, name) for name in field_names}
        return ModelEvaluationConfig(**values)

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk, load_metric
import torch
import pandas as pd
from tqdm.notebook import tqdm

[2024-09-02 21:30:39,417: INFO: config: PyTorch version 2.3.1 available.]


In [6]:
class ModelEvaluation:
    """Scores a fine-tuned seq2seq summarization model with ROUGE on the test split."""

    def __init__(self, config: "ModelEvaluationConfig"):
        """Keep a reference to the evaluation configuration.

        Args:
            config (ModelEvaluationConfig): Locations of the dataset, model,
                tokenizer and output metrics file.
        """
        self.config = config

    def generate_batch_sized_chuncks(self, list_of_elements: list, batch_size: int):
        """Yield successive batch-sized slices of a sequence.

        The last slice is shorter when ``len(list_of_elements)`` is not a
        multiple of ``batch_size``.

        Args:
            list_of_elements (list): Sequence to split into batches.
            batch_size (int): Maximum number of elements per batch.

        Yields:
            list: Consecutive slices of ``list_of_elements``.
        """
        for start in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[start : start + batch_size]

    def calculate_test_metric(
        self,
        dataset,
        metric,
        model,
        tokenizer,
        batch_size: int = 16,
        device: str = "cuda",
        column_text: str = "article",
        column_summary: str = "highlights",
    ) -> dict:
        """Generate summaries batch by batch and score them against the references.

        Args:
            dataset: Object indexable by column name; ``dataset[column_text]``
                and ``dataset[column_summary]`` must be sliceable sequences of
                equal length.
            metric: Metric object exposing ``add_batch(predictions=...,
                references=...)`` and ``compute()``.
            model: Model exposing ``generate``; it must already live on
                ``device``.
            tokenizer: Callable tokenizer that also exposes ``decode``.
            batch_size (int, optional): Number of texts per generation batch.
                Defaults to 16.
            device (str, optional): Device the tokenized inputs are moved to.
                Must match the device of ``model``. Defaults to "cuda".
            column_text (str, optional): Column holding the input texts.
                Defaults to "article".
            column_summary (str, optional): Column holding the reference
                summaries. Defaults to "highlights".

        Returns:
            dict: Whatever ``metric.compute()`` returns after all batches were
            added (for ROUGE, a mapping keyed by ROUGE variant).
        """
        article_batches = list(
            self.generate_batch_sized_chuncks(dataset[column_text], batch_size)
        )
        target_batches = list(
            self.generate_batch_sized_chuncks(dataset[column_summary], batch_size)
        )

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)
        ):

            inputs = tokenizer(
                article_batch,
                max_length=1024,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            summaries = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                length_penalty=0.8,
                num_beams=8,
                max_length=128,  # avoid long sequences
            )

            decoded_summaries = [
                tokenizer.decode(
                    s,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                )
                for s in summaries
            ]

            # Pegasus marks line breaks with the literal token "<n>", which
            # survives decoding; turn it into a space before scoring.
            # (Replacing the empty string instead would insert a space between
            # every character and wreck the ROUGE scores.)
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        return metric.compute()

    def evaluate(self):
        """Run the full evaluation and write the ROUGE scores to a CSV file.

        Loads the tokenizer, model and dataset from the configured locations,
        scores the "test" split (columns "dialogue" / "summary") and stores the
        mid F-measure of each ROUGE variant in ``config.metric_file_name``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(
            self.config.model_path
        ).to(device)

        dataset_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        # NOTE(review): `load_metric` is deprecated in recent `datasets`
        # releases in favour of the separate `evaluate` package; migrating
        # would add a dependency and change the shape of the returned scores.
        rouge_metric = load_metric("rouge")
        score = self.calculate_test_metric(
            dataset_pt["test"],
            rouge_metric,
            model_pegasus,
            tokenizer,
            batch_size=8,
            # Inputs must go to the same device as the model; without this the
            # "cuda" default is used even on CPU-only machines.
            device=device,
            column_text="dialogue",
            column_summary="summary",
        )

        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

        df = pd.DataFrame(rouge_dict, index=["pegasus"])
        # index=False: only the score columns are written, not the row label.
        df.to_csv(self.config.metric_file_name, index=False)

## Pipeline

In [7]:
# Build the stage configuration and run the evaluation end to end.
# Exceptions propagate unchanged, so no try/except wrapper is needed.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
# The evaluator gets its own name instead of overwriting the config object.
model_evaluation = ModelEvaluation(config=model_evaluation_config)
model_evaluation.evaluate()

[2024-09-02 21:30:40,620: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-09-02 21:30:40,621: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-02 21:30:40,622: INFO: common: created a directory at: artifacts]
[2024-09-02 21:30:40,622: INFO: common: created a directory at: artifacts/model_evaluation]


  rouge_metric = load_metric("rouge")


  0%|          | 0/103 [00:00<?, ?it/s]

[2024-09-02 21:33:57,309: INFO: rouge_scorer: Using default tokenizer.]
