In [1]:
!pip install -U pip
!pip install -U huggingface-hub==0.27.1 transformers==4.46.1 datasets==3.1.0  torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 bitsandbytes==0.45.0 accelerate==1.0.1

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting huggingface-hub==0.27.1
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.46.1
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting torch==2.3.1
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.18.1
  Downloading torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecti

In [None]:
# Install Rouge-Score library for evaluating text generation quality
!pip install rouge-score

# Upgrade NLTK to the latest version
!pip install --upgrade nltk

import nltk
# Download WordNet data from NLTK
nltk.download('wordnet')

# Download Open Multilingual Wordnet (omw) for extended WordNet support
nltk.download('omw-1.4')

# Download NLTK data for Tokenization
nltk.download('punkt')

# Download tokenization data for tokenizing in specific formats
nltk.download('punkt_tab')

# Install BERT-Score library for evaluating text similarity using BERT embeddings
!pip install bert-score

# Install Inflect library for generating plural forms, singular forms, and more
!pip install inflect

# Install gspread and oauth2client for Google Sheets API access
!pip install gspread oauth2client


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e06d2e362fbb1b0ef701ccb7acb35fcf634d1e313bd0a7566f78e8dcd82603c2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:

import gc
import torch

import transformers
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import os
import re
import inflect
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score as bert_score
import pandas as pd
import numpy as np
import pickle
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# Do not truncate long cell contents (e.g. full prompts/summaries) when displaying DataFrames
pd.set_option('display.max_colwidth', None)

In [5]:
# Evaluation split, read straight from the Hugging Face Hub as a single parquet shard
df = pd.read_parquet(
    path="hf://datasets/Muadil/all_unique_cleaned_openai_summarize_comparisons_test/data/train-00000-of-00001.parquet",
)

In [None]:
def initialize_google_sheets():
    """
    Connect to Google Sheets and return the first tab of the target spreadsheet.

    Authenticates through gspread with a service-account JSON key file, opens the
    spreadsheet by its URL and fetches the worksheet at index 0.

    Returns:
        The gspread worksheet object for the first tab of the spreadsheet.
    """
    # Both locations are placeholders that must point at real resources
    key_file = "/kaggle/input/your_google_cloud_service_account.json"
    sheet_url = "https://docs.google.com/spreadsheets/d/your_file_id"

    # Authenticate, open the document, and hand back its first tab
    gs_client = gspread.service_account(filename=key_file)
    return gs_client.open_by_url(sheet_url).get_worksheet(0)


def sheets_to_df(worksheet):
    """
    Build a pandas DataFrame from every record of a Google Sheets worksheet.

    Args:
        worksheet: Worksheet object exposing ``get_all_records()``, which yields
            the sheet content as a list of dictionaries (one per row).

    Returns:
        pd.DataFrame: The worksheet records as a DataFrame.
    """
    # get_all_records() -> list of dicts, which DataFrame consumes directly
    return pd.DataFrame(worksheet.get_all_records())


In [None]:
def load_data_to_the_sheet(worksheet, sheet_df):
    """
    Overwrite a Google Sheets worksheet with the contents of a DataFrame.

    NaN, inf and -inf values are replaced with None first. The worksheet is then
    cleared, the DataFrame's column names are appended as the header row, and all
    data rows are appended after it.

    Args:
        worksheet: Worksheet object exposing ``clear``, ``append_row`` and ``append_rows``.
        sheet_df (pd.DataFrame): The data to upload.

    Returns:
        None: The worksheet is updated as a side effect.
    """
    # Swap non-finite floats for None before upload
    non_finite = [float("nan"), float("inf"), float("-inf")]
    cleaned_df = sheet_df.replace(non_finite, None)

    # Header row followed by the data rows, both as plain Python lists
    header_row = cleaned_df.columns.tolist()
    body_rows = cleaned_df.values.tolist()

    # Wipe the sheet, then write header and data
    worksheet.clear()
    worksheet.append_row(header_row)
    worksheet.append_rows(body_rows)
    print("Data successfully updated.")


def save_the_output(sheet_df, model_name, prediction, row_index):
    """
    Store a model's prediction in the given row of the DataFrame, without overwriting.

    The model name is normalised into a column name ("-" and "/" become "_"). If
    that column does not exist yet it is created together with a companion score
    column named ``Score_<k>``, where ``k = (column count after adding the model
    column - 2) // 2``. The prediction is written only when the target cell is
    currently empty (None, NaN/NA, or an empty string), so values already present
    in the sheet are preserved.

    Args:
        sheet_df (pd.DataFrame): The DataFrame to update in place.
        model_name (str): The name of the model making the prediction.
        prediction (any type): The model's prediction to be stored.
        row_index (int): Index label of the row where the prediction should be saved.

    Returns:
        None: This function updates the DataFrame in place and does not return a value.
    """
    # Replace special characters in model name to make it suitable as a column name
    model_name = model_name.replace("-", "_").replace("/", "_")

    # Add the model column (and its score column) the first time this model is seen
    if model_name not in sheet_df.columns.tolist():
        print(f"{model_name} column initialized")
        sheet_df[model_name] = None

        # Score column index is derived from the current number of columns
        score_col = f"Score_{(len(sheet_df.columns) - 2) // 2}"
        sheet_df[score_col] = None

    # Decide whether the target cell is empty. pd.isna is used instead of a
    # membership test against [np.nan, ...]: NaN never compares equal to itself,
    # so `in` only matched NaN by object identity and missed other NaN objects.
    current = sheet_df.loc[row_index, model_name]
    is_missing = pd.api.types.is_scalar(current) and pd.isna(current)
    is_blank_text = isinstance(current, str) and current == ""

    if is_missing or is_blank_text:
        sheet_df.loc[row_index, model_name] = prediction


def get_inference(model, tokenizer, text, max_token_length, model_name, device):
    """
    Generates a summary for a given text using a model and tokenizer.

    The text is wrapped in a fixed prompt that ends with "Summary:", tokenized
    (truncated to ``max_token_length`` tokens), and passed to ``model.generate``
    with up to ``max_token_length`` new tokens. The first returned sequence is
    decoded with special tokens skipped.

    If ``model_name`` contains "instruct" (case-insensitive) and the decoded text
    contains "Summary:", only the segment following the prompt's own final
    "Summary:" marker is returned (stripped, and ending at the next marker if the
    model emits another one). The marker is located by counting its occurrences
    in the decoded prompt, so a document that itself contains "Summary:" no
    longer causes a slice of the document to be returned. For other models the
    full decoded text is returned unchanged.

    NOTE(review): because the whole prompt is truncated to ``max_token_length``,
    a long document can push the trailing "Summary:" cue out of the prompt.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        text (str): The input text that needs to be summarized.
        max_token_length (int): Maximum prompt length in tokens; also used as
            the maximum number of newly generated tokens.
        model_name (str): The name of the model, used to check if it's an instruction-based model.
        device: Device the tokenized inputs are moved to.

    Returns:
        str: The predicted summary generated by the model.
    """
    marker = "Summary:"

    # Format the input text for summarization
    prompt = f"System: I want you to summarize this text\nDocument: {text}\nSummary:"

    # Tokenize the prompt and move it to the requested device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_token_length).to(device)

    # Generate the output (summary) using the model
    output = model.generate(**inputs, max_new_tokens=max_token_length, pad_token_id=tokenizer.eos_token_id, eos_token_id=tokenizer.eos_token_id)

    # Decode the first returned sequence
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)

    # Instruction-tuned models: keep only the text after the prompt's last marker
    if "instruct" in model_name.lower() and marker in prediction:
        # Count markers in the prompt as the model actually saw it (post-truncation)
        prompt_seen = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
        segments = prediction.split(marker)
        # At least segment 1 (original behaviour); never index past the end
        segment_index = min(max(prompt_seen.count(marker), 1), len(segments) - 1)
        prediction = segments[segment_index].strip()

    return prediction


In [None]:
def calculate_summary_metrics(references, predictions):
    """
    Score generated summaries against references with ROUGE, METEOR and BERTScore.

    ROUGE-1/2/L F-measures and METEOR are computed per (reference, prediction)
    pair and averaged; BERTScore is computed over the whole batch and its mean
    F1 is reported.

    Args:
        references (list): A list of reference summaries.
        predictions (list): A list of summaries generated by the model.

    Returns:
        dict: Average scores keyed by 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR'
        and 'BERTScore'.

    Raises:
        ValueError: If references and predictions differ in length.
    """
    # Guard clause: the two lists are paired element by element
    if len(predictions) != len(references):
        raise ValueError("References and predictions must have the same length!")

    def _average(values):
        # Plain arithmetic mean of a list of floats
        return sum(values) / len(values)

    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

    # One list of F-measures per ROUGE variant, plus one list for METEOR
    rouge_f = {variant: [] for variant in rouge_variants}
    meteor_values = []

    for reference, prediction in zip(references, predictions):
        # ROUGE: score(target, prediction)
        pair_rouge = scorer.score(reference, prediction)
        for variant in rouge_variants:
            rouge_f[variant].append(pair_rouge[variant].fmeasure)

        # METEOR works on pre-tokenized input: (reference tokens, hypothesis tokens)
        reference_tokens = nltk.word_tokenize(reference)
        prediction_tokens = nltk.word_tokenize(prediction)
        meteor_values.append(single_meteor_score(reference_tokens, prediction_tokens))

    # BERTScore over the whole batch; only the F1 component is reported
    _, _, f1_values = bert_score(predictions, references, lang="en", rescale_with_baseline=True)
    bert_f1 = f1_values.mean().item()

    return {
        'ROUGE-1': _average(rouge_f['rouge1']),
        'ROUGE-2': _average(rouge_f['rouge2']),
        'ROUGE-L': _average(rouge_f['rougeL']),
        'METEOR': _average(meteor_values),
        'BERTScore': bert_f1,
    }




def add_metrics_to_dataframe(metrics_df, model_name, references, predictions):
    """
    Compute summary metrics for one model and append them as a new row.

    Args:
        metrics_df (pd.DataFrame): The existing DataFrame of evaluation metrics.
        model_name (str): The name of the model, stored under "Model Name".
        references (list): A list of reference summaries.
        predictions (list): A list of summaries generated by the model.

    Returns:
        pd.DataFrame: A new DataFrame consisting of ``metrics_df`` plus one row
        for this model (the index is renumbered).
    """
    # Metric scores plus the identifying model name form the new row
    metric_row = {
        **calculate_summary_metrics(references, predictions),
        "Model Name": model_name,
    }

    # Stack the single-row frame under the existing metrics
    return pd.concat([metrics_df, pd.DataFrame([metric_row])], ignore_index=True)





def summarize_and_save_metrics_AutoModelForSeq2SeqLM(models, tokenizer_names, texts, references, metrics_df=None, device="cpu", max_token_length=512):
    """
    Summarizes texts with each model, mirrors the predictions to Google Sheets,
    and records evaluation metrics in a DataFrame.

    For every (model, tokenizer) pair the function loads the model, reads the
    Google Sheet into a DataFrame, generates one summary per text, stores each
    summary in the sheet DataFrame, writes the DataFrame back to the sheet, and
    appends the model's ROUGE/METEOR/BERTScore averages to ``metrics_df``.

    Note: despite the function name, models are loaded with
    ``AutoModelForCausalLM`` and placed via ``device_map="auto"``.

    Args:
        models (list): A list of model names to perform summarization.
        tokenizer_names (list): A list of tokenizer names, index-aligned with ``models``.
        texts (list): A list of texts to summarize.
        references (list): A list of reference summaries (should have the same length as texts).
        metrics_df (pd.DataFrame, optional): An existing DataFrame to store evaluation metrics.
                                             If None (or not a DataFrame), a new one is created.
        device (str): The device the tokenized inputs are moved to ('cpu' or 'cuda').
        max_token_length (int): Maximum prompt length in tokens; also passed on as the
                                maximum number of generated tokens.

    Returns:
        pd.DataFrame: The updated metrics DataFrame.

    Raises:
        ValueError: If ``texts``/``references`` or ``models``/``tokenizer_names``
            differ in length.
    """
    # If metrics_df is None (or not a DataFrame), create an empty DataFrame
    if metrics_df is None or not isinstance(metrics_df, pd.DataFrame):
        metrics_df = pd.DataFrame(columns=["Model Name", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERTScore"])
        print("A new metrics_df has been created.")

    # Check if texts and references have the same length
    if len(texts) != len(references):
        raise ValueError("texts and references must have the same length.")

    # zip() would silently skip unmatched entries, so reject misaligned lists up front
    if len(models) != len(tokenizer_names):
        raise ValueError("models and tokenizer_names must have the same length.")

    # For quantization (optional configuration)
    # quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_quant_type="nf4")

    for model_name, tokenizer_name in zip(models, tokenizer_names):
        # Load the model and tokenizer
        # model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            # quantization_config=quantization_config,
            device_map="auto"  # Automatically set device if GPU is available
        )
        # device_map is a model-loading option; it is not passed to the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # Reuse the EOS token for padding
        tokenizer.pad_token = tokenizer.eos_token
        print(f"{model_name} loaded, generating outputs.")

        # Read the current sheet content
        worksheet = initialize_google_sheets()
        sheet_df = sheets_to_df(worksheet)

        # Generate predictions; the position of each text is used as the sheet row label
        predictions = []
        for row, text in enumerate(texts):
            prediction = get_inference(model, tokenizer, text, max_token_length, model_name, device)
            save_the_output(sheet_df, model_name, prediction, row)
            predictions.append(prediction)

        # Write back to Google Sheets
        load_data_to_the_sheet(worksheet, sheet_df)

        # Add evaluation metrics to the DataFrame
        metrics_df = add_metrics_to_dataframe(metrics_df, model_name, references, predictions)
        print(f"Evaluation metrics for model '{model_name}' have been saved.")

        # Release the model before loading the next one
        del model
        del tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    return metrics_df





# Models to evaluate
models = [
    "Muadil/Llama-3.2-1B-Instruct_sum_DPO_140k_1_20ep"
]

# Tokenizers, index-aligned with `models`
tokenizer_names = [
    "Muadil/Llama-3.2-1B-Instruct_sum_DPO_140k_1_20ep"
]

# Authentication with Hugging Face using the token stored in Kaggle secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# Retrieve the Hugging Face token from the secrets (never hard-coded in the notebook)
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face using the token
from huggingface_hub import login
login(token=secret_value_0)

# Number of leading dataset rows to evaluate. Slicing once guarantees that the
# prompts and their reference summaries come from exactly the same rows.
NUM_EVAL_SAMPLES = 1000
eval_subset = df.iloc[:NUM_EVAL_SAMPLES]

# Create an empty DataFrame to store performance metrics
metrics_df = pd.DataFrame(columns=["Model Name", "ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR", "BERTScore"])

# Run the evaluation and collect the metrics
metrics_df = summarize_and_save_metrics_AutoModelForSeq2SeqLM(
    models=models,
    tokenizer_names=tokenizer_names,
    texts=list(eval_subset["prompt"]),  # Texts to summarize
    references=list(eval_subset["chosen"]),  # Reference summaries
    metrics_df=metrics_df,
    device="cuda",  # Use GPU
)

from IPython.display import FileLink

# Save the metrics as a Pickle file in the working directory
file_name = "output.pkl"
metrics_df.to_pickle(file_name)

# Create a download link
display(FileLink(file_name))

In [None]:
# Display the collected per-model metrics table as the cell output
metrics_df