## Library Imports

In [200]:
from datasets import load_dataset
import random
import time
import json
import utils
import mercury as mr
import openai
import os
import pandas as pd
pd.set_option('display.float_format', '{:.10f}'.format)
from google.cloud.exceptions import NotFound
import os
from tqdm import tqdm
import time

#Vertex AI libraries
import vertexai
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part
from vertexai.preview.tuning import sft
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples, PointwiseMetric, PairwiseMetric

#OpenAI library
from openai import OpenAI
client = OpenAI()

## Loading Dataset From HuggingFace

In [201]:
# Load the "knkarthick/dialogsum" dataset via Hugging Face `datasets`.
# Later cells read its "train", "validation" and "test" splits.
dataset = load_dataset("knkarthick/dialogsum")

## Creating different sized tuning Datasets

In [202]:
# Full-size splits, materialised as plain lists of dicts.
# (The numeric suffixes look like the expected split sizes - not asserted here.)
train12460 = dataset["train"].to_list()
valid500 = dataset["validation"].to_list()
test1500 = dataset["test"].to_list()

# Prefix every *test* dialogue with the summarization instruction, in place.
# Only the test split is modified; the train/validation records keep the raw dialogue.
base_instruction = "Summarize the following dialogue: "
for example in test1500:
    example["dialogue"] = base_instruction + example["dialogue"]

# Truncated subsets for quick experiments. The test slices share the (already
# prefixed) dict objects with test1500.
train2000 = train12460[:2000]
test10 = test1500[:10]
test100 = test1500[:100]
test250 = test1500[:250]

## Data Formatting for Tuning API

In [None]:
# Prepare data for Gemini 1.5 tuning.
# `utils.format_tuning_dataset` is a project helper; presumably it writes
# "<name>.jsonl" files (the upload cell references "dialogsum_*_inst.jsonl") -
# TODO confirm in utils.
#
# Variant 1: every example carries the zero-shot summarization instruction.
base_instruction="Summarize the following dialogue: "
utils.format_tuning_dataset(train2000, valid500, base_instruction, "dialogsum_train2000_inst","dialogsum_valid500_inst")
utils.format_tuning_dataset(train12460, valid500, base_instruction, "dialogsum_train12460_inst","dialogsum_valid500_inst")

# Variant 2: same data with an empty instruction.
# NOTE: this leaves `base_instruction` set to "" once the cell has run.
base_instruction=""
utils.format_tuning_dataset(train2000, valid500, base_instruction, "dialogsum_train2000_no_inst","dialogsum_valid500_no_inst")
utils.format_tuning_dataset(train12460, valid500, base_instruction, "dialogsum_train12460_no_inst","dialogsum_valid500_no_inst")

In [None]:
# Push the instruction-variant JSONL files to storage with the project helper
# `utils.delete_and_upload` (judging by its name it replaces any existing copy -
# TODO confirm in utils). Only the "*_inst" files are uploaded here; the
# "*_no_inst" files are not.
utils.delete_and_upload("dialogsum_train12460_inst.jsonl")
utils.delete_and_upload("dialogsum_train2000_inst.jsonl")
utils.delete_and_upload("dialogsum_valid500_inst.jsonl")

## Submit Tuning Job

In [None]:
# Submit two Gemini tuning jobs on the same base model: one on the 2,000-example
# training file and one on the full 12,460-example file, both validated against
# the same 500-example validation file.
# The last argument is the job label passed to the project helper `utils.tune_gemini`
# (presumably the tuned-model display name - TODO confirm in utils).
# NOTE: `model` is a plain string here; the prediction cell later rebinds the
# same name to a GenerativeModel instance.
model="gemini-1.5-flash-001"
utils.tune_gemini("gs://mchrestkha-sample-data/dialogsum/dialogsum_train2000_inst.jsonl", "gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl", model, "dialogsum_2000_inst")
# Label fixed from "dialogsum_124600_inst" so it matches the 12,460-example training file.
utils.tune_gemini("gs://mchrestkha-sample-data/dialogsum/dialogsum_train12460_inst.jsonl", "gs://mchrestkha-sample-data/dialogsum/dialogsum_valid500_inst.jsonl", model, "dialogsum_12460_inst")

## OpenAI Tuning

In [None]:
# Prepare data for OpenAI tuning.
# Each record becomes a three-turn chat: the instruction as the system message,
# the dialogue as the user message, and the reference summary as the assistant reply.
system_prompt = "Summarize the following dialogue: "

# Splits used for this fine-tuning run
train = train2000
valid = valid500


def _as_chat_example(record, instruction):
    """Wrap one dialogue/summary record as a system/user/assistant conversation."""
    return {
        'messages': [
            {"role": "system", "content": instruction},
            {"role": "user", "content": record["dialogue"]},
            {"role": "assistant", "content": record["summary"]},
        ]
    }


train_messages = [_as_chat_example(record, system_prompt) for record in train]
validation_messages = [_as_chat_example(record, system_prompt) for record in valid]

# Cell output: sizes of both message lists plus one sample training example
len(train_messages), len(validation_messages), train_messages[2]

In [None]:
# Save the chat-formatted examples locally with the project helper `utils.dicts_to_jsonl`.
# The upload cell opens "openai_dialogsum_train2000.jsonl" / "openai_dialogsum_valid500.jsonl",
# so the helper presumably appends ".jsonl" to the given name - TODO confirm in utils.
# The meaning of the third argument (False) is not visible here - see utils.
utils.dicts_to_jsonl(train_messages, "openai_dialogsum_train2000", False)
utils.dicts_to_jsonl(validation_messages, "openai_dialogsum_valid500", False)

In [None]:
# Register & upload the fine-tuning files to OpenAI storage.
def _upload_fine_tune_file(path):
    """Upload the file at `path` to OpenAI with purpose "fine-tune".

    The file is opened inside a `with` block so the handle is closed as soon as
    the upload call returns (the original left both handles open).

    Returns the object returned by `client.files.create`.
    """
    with open(path, "rb") as handle:
        return client.files.create(file=handle, purpose="fine-tune")


# Keep the returned file objects instead of discarding them.
openai_train_file = _upload_fine_tune_file("openai_dialogsum_train2000.jsonl")
openai_valid_file = _upload_fine_tune_file("openai_dialogsum_valid500.jsonl")

# Cell output: the two file IDs (training, validation) that a fine-tuning job
# takes as `training_file` / `validation_file`.
openai_train_file.id, openai_valid_file.id

In [None]:
# Submit the OpenAI fine-tuning job.
# The two file IDs are hard-coded (presumably the IDs OpenAI returned when the
# JSONL files were uploaded); they must be updated whenever the files are re-uploaded.
fine_tune_request = {
    "training_file": "file-KxJuvj5sQ3kLQoI7f6X8S9PE",
    "validation_file": "file-QGvOkG9PtiZJ7y0L1JmNbzDE",
    "model": "gpt-4o-mini-2024-07-18",
}
client.fine_tuning.jobs.create(**fine_tune_request)

## Running Predictions on Test Data
### For 250 test examples, generating predictions from all four models takes about 14 minutes

In [203]:
# Generate a summary for every test row with four models (base Gemini, tuned
# Gemini, base OpenAI, tuned OpenAI) and collect them into `df_final`.
# A row whose call fails is recorded as the placeholder "Blocked" so that all
# four response lists stay aligned with `test`.

OPENAI_BASE_MODEL = "gpt-4o-mini-2024-07-18"
OPENAI_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:personal::A3WwHRrJ"
BLOCKED_PLACEHOLDER = "Blocked"


def _gemini_summary(gemini_model, prompt):
    """Return the text Gemini generates for `prompt`, or "Blocked".

    ValueError / AttributeError are treated as "no usable text in the response"
    (e.g. reading `.text` on a blocked response), as in the original cell.
    Other exceptions (network, quota) still propagate.
    """
    try:
        return gemini_model.generate_content(contents=prompt).text
    except (ValueError, AttributeError):
        return BLOCKED_PLACEHOLDER


def _openai_summary(openai_client, model_name, prompt):
    """Return the chat-completion text from `model_name` for `prompt`, or "Blocked".

    The original caught only (ValueError, AttributeError), which the OpenAI SDK
    does not raise for API failures, so a single rate-limit or content-filter
    error aborted the whole loop. `openai.OpenAIError` is caught as well and the
    error is logged, so failures are visible rather than silently swallowed.
    """
    try:
        completion = openai_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content
    except (openai.OpenAIError, ValueError, AttributeError) as err:
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f"OpenAI call to {model_name} failed: {err!r}")
        return BLOCKED_PLACEHOLDER


gemini_text = []
openai_text = []
gemini_tuned_text = []
openai_tuned_text = []

# Tuned Gemini model is resolved from its tuning job; base model by name.
tuning_job = sft.SupervisedTuningJob("projects/642508009780/locations/us-central1/tuningJobs/2137747369456828416")
tuned_model = GenerativeModel(tuning_job.tuned_model_endpoint_name)
model = GenerativeModel("gemini-1.5-flash-001")
client = OpenAI()

# Pick the evaluation subset (alternatives: test10, test1500)
#test=test10
#test=test1500
test=test250

for row in tqdm(test, desc="Processing", unit="row"):
    prompt = row["dialogue"]  # already carries the instruction prefix added when the test split was built
    gemini_text.append(_gemini_summary(model, prompt))
    gemini_tuned_text.append(_gemini_summary(tuned_model, prompt))
    openai_text.append(_openai_summary(client, OPENAI_BASE_MODEL, prompt))
    openai_tuned_text.append(_openai_summary(client, OPENAI_TUNED_MODEL, prompt))

# Build the final DataFrame: the test rows plus one response column per model
df_final = pd.DataFrame(test)
df_final["gemini_response"] = gemini_text
df_final["openai_response"] = openai_text
df_final["gemini_tuned_response"] = gemini_tuned_text
df_final["openai_tuned_response"] = openai_tuned_text

Processing: 100%|██████████| 250/250 [13:44<00:00,  3.30s/row]


In [207]:
# Duplicate the reference summary under a "response"-style column name so the
# evaluation cell can score the references themselves like any other model
# (it is passed as the response column in one of the run_eval calls).
df_final["summary_response"]=df_final["summary"]
# NOTE: this is an alias, not a copy - both names refer to the same DataFrame.
df_test_predictions_final=df_final
# Persist the predictions to CSV in the working directory.
df_test_predictions_final.to_csv('df_test_predictions_final.csv', index=False) 

## Running Computation-Based & Model-Based Pointwise Evals

In [210]:
# Define a custom pointwise (single-response) summarization-quality metric.
# The template has two placeholders, {prompt} and {response}; run_eval renames
# its input columns to 'prompt' and 'response' to match them.
# NOTE(review): in the Rating Rubric below, levels 2 and 1 have identical
# descriptions, and no level covers a summary that is under 50 words but not
# grounded - confirm the intended wording for level 2.
# NOTE(review): the criterion says "less than 50 words" while levels 3-1 say
# "more than 50 words", so a response of exactly 50 words is unspecified.
pointwise_custom_summary_metric_prompt = """
# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models.
We will provide you with the user input and an AI-generated response.
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step-by-step explanations for your rating, and only choose ratings from the Rating Rubric. **Explicitly include the word count of the response as the first step in your explanation**, and ensure it aligns with the criteria.

# Evaluation
## Metric Definition
You will be assessing summarization quality, which measures the overall ability to summarize text.  The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.

## Criteria
Less than 50 words: The response contains less than 50 words.  Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, " "))`
Groundedness: The response contains information included only in the context. The response does not reference any outside information.
Observer Perspective: The response is written from an observer perspective.

## Rating Rubric
5: (Very good). The summary is less than 50 words, is grounded and is written as an observer.
4: (Good). The summary is less than 50 words and is grounded.  
3: (Ok). The summary is more than 50 words but mostly grounded
2: (Bad). The summary is more than 50 words and not grounded.
1: (Very bad). The summary is more than 50 words and not grounded.

## Evaluation Steps
STEP 1: Assess the response in aspects of word count, groundedness, and observer perspective according to the criteria.  **Use the provided formula to determine the EXACT word count**
STEP 2: Score based on the rubric.

# User Inputs and AI-generated Response
## User Inputs

### Prompt
{prompt}

## AI-generated Response
{response}

"""

# Wrap the template in a PointwiseMetric so it can be listed in an EvalTask's
# `metrics` (run_eval does this).
pointwise_custom_summary_metric = PointwiseMetric(
  metric="custom_point_summary_metric",
  metric_prompt_template=pointwise_custom_summary_metric_prompt,
)

In [None]:
def run_eval(dataset, col_prompt,col_response,col_reference):
    """Score one response column of `dataset` and return its summary metrics.

    The three named columns are selected and renamed to 'prompt', 'response'
    and 'reference', then evaluated with three metrics: "rouge_l_sum", the
    built-in pointwise summarization-quality template, and the notebook-level
    `pointwise_custom_summary_metric`.

    Args:
        dataset: DataFrame containing the three columns.
        col_prompt: name of the column holding the model input.
        col_response: name of the column holding the model output to score.
        col_reference: name of the column holding the reference summary.

    Returns:
        A one-row DataFrame of the run's summary metrics, with a leading
        'model' column whose value is `col_response`.
    """
    column_aliases = {
        col_prompt: 'prompt',
        col_response: 'response',
        col_reference: 'reference',
    }
    eval_frame = dataset[[col_prompt, col_response, col_reference]].rename(columns=column_aliases)

    task = EvalTask(
        dataset=eval_frame,
        metrics=[
            "rouge_l_sum",
            MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY,
            pointwise_custom_summary_metric,
        ],
    )
    summary = task.evaluate().summary_metrics

    # Label the single row with the response column, then expose it as 'model'
    labelled = pd.DataFrame(summary, index=[col_response])
    return labelled.rename_axis('model').reset_index()

# Evaluate each response column against the reference summaries, in this order.
# "summary_response" is the reference itself, scored like any other model.
response_columns = [
    "gemini_response",
    "gemini_tuned_response",
    "summary_response",
    "openai_response",
    "openai_tuned_response",
]
results = [
    run_eval(df_final, "dialogue", response_column, "summary")
    for response_column in response_columns
]

# Stack the one-row summaries into a single comparison table
combined_comp_point_eval_result = pd.concat(results, ignore_index=True)

Computing metrics with a total of 750 Vertex online evaluation service requests.


100%|██████████| 750/750 [50:03<00:00,  4.00s/it]


All 750 metric requests are successfully computed.
Evaluation Took:3003.383097610007 seconds
Computing metrics with a total of 750 Vertex online evaluation service requests.


100%|██████████| 750/750 [50:00<00:00,  4.00s/it]


All 750 metric requests are successfully computed.
Evaluation Took:3000.8083779989975 seconds
Computing metrics with a total of 750 Vertex online evaluation service requests.


100%|██████████| 750/750 [50:01<00:00,  4.00s/it] 


All 750 metric requests are successfully computed.
Evaluation Took:3001.101887780009 seconds
Computing metrics with a total of 750 Vertex online evaluation service requests.


 64%|██████▍   | 483/750 [32:12<19:18,  4.34s/it] 

In [None]:
# Display the combined pointwise results table (one row per evaluated response column).
combined_comp_point_eval_result
# Optional: uncomment to persist the table to CSV.
#combined_comp_point_eval_result.to_csv('combined_comp_point_eval_result.csv', index=False) 

## Running Pairwise (AutoSxS) Model Evals

In [None]:
# Define a custom pairwise (side-by-side) summarization-quality metric.
# The template has three placeholders: {prompt}, {baseline_model_response}
# (shown to the judge as Response A) and {response} (shown as Response B).
# Fix: STEP 2 previously told the judge to assess "Response A" while analysing
# Response B; it now refers to Response B.
pairwise_custom_summary_metric_prompt = """
# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by two AI models. We will provide you with the user input and a pair of AI-generated responses (Response A and Response B).
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will first judge responses individually, following the Rating Rubric and Evaluation Steps.
Then you will give step-by-step explanations for your judgement, compare results to declare the winner based on the Rating Rubric and Evaluation Steps.
# Evaluation
## Metric Definition
You will be assessing summarization quality, which measures the overall ability to summarize text.  The context to be summarized are provided in the user prompt. The response should be shorter than the text in the context. The response should not contain information that is not present in the context.

## Criteria
Less than 50 words: The response contains less than 50 words.  Use the following formula to count the words in the response: `=COUNTA(SPLIT(response, " "))`
Groundedness: The response contains information included only in the context. The response does not reference any outside information.
Observer Perspective: The response is written from an observer perspective.

## Rating Rubric
"A": Response A summarizes the given context as per the criteria better than response B.
"SAME": Response A and B summarizes the given context equally well as per the criteria.
"B": Response B summarizes the given context as per the criteria better than response A.

## Evaluation Steps
STEP 1: Analyze Response A based on the summarization quality criteria: Determine how well Response A fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.
STEP 2: Analyze Response B based on the summarization quality criteria: Determine how well Response B fulfills the user requirements, is less than 50 words, is grounded and is written as an observer, and provide assessment according to the criterion.
STEP 3: Compare the overall performance of Response A and Response B based on your analyses and assessment.
STEP 4: Output your preference of "A", "SAME" or "B" to the pairwise_choice field according to the Rating Rubric.
STEP 5: Output your assessment reasoning in the explanation field.


# User Inputs and AI-generated Responses
## User Inputs

### Prompt
{prompt}

## AI-generated Responses
### Response A
{baseline_model_response}

### Response B
{response}

"""

# Wrap the template in a PairwiseMetric so it can be listed in an EvalTask's `metrics`.
pairwise_custom_summary_metric = PairwiseMetric(
  metric="custom_pairwise_summary_metric",
  metric_prompt_template=pairwise_custom_summary_metric_prompt,
)

In [None]:
# Pairwise evaluation input: the tuned Gemini output is the candidate
# ('response') and the tuned OpenAI output is the baseline
# ('baseline_model_response'); both are judged against the same prompt.
pairwise_columns = {
    'dialogue': 'prompt',
    'gemini_tuned_response': 'response',
    'openai_tuned_response': 'baseline_model_response',
}
eval_dataset_pair = df_final[list(pairwise_columns)].rename(columns=pairwise_columns)

In [None]:
# Run the pairwise evaluation on `eval_dataset_pair` with two judge metrics:
# the built-in pairwise summarization-quality template and the custom
# pairwise metric defined in this notebook.
eval_task = EvalTask(
    dataset=eval_dataset_pair, 
    metrics=[MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY, pairwise_custom_summary_metric],
    )
# Keep the full result object; the next cell reads its `summary_metrics`.
eval_result = eval_task.evaluate()

In [None]:
# Keep only the aggregate metrics from the pairwise run.
combined_pair_eval_result=eval_result.summary_metrics
# NOTE(review): run_eval wraps `summary_metrics` in pd.DataFrame before using it,
# so it is presumably not a DataFrame itself - the commented-out `.to_csv` call
# below would likely need the same wrapping. Confirm before enabling.
#combined_pair_eval_result.to_csv('combined_pair_eval_result.csv', index=False) 