In [1]:
import pandas as pd
import google.generativeai as genai
import json

# Read the fine-tuned model responses (and their prompts) into a DataFrame.
df = pd.read_csv("../data/FineTunedResponses/responses_to_fine_tuned.csv")

# Parse the local env.json file into a dict; a later cell reads the API key from it.
with open('../env.json', 'r') as env_file:
    env_vars = json.loads(env_file.read())


In [6]:
from google.api_core import retry
import os

# Publish the key loaded from env.json through the process environment, then
# hand that same value to the Gemini SDK.
os.environ["GOOGLE_API_KEY"] = env_vars["GOOGLE_API_KEY"]
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

model = genai.GenerativeModel("gemini-2.0-flash")


def _is_transient_error(e):
    """Return True when the error carries a `code` attribute equal to 429 or 503."""
    return hasattr(e, 'code') and e.code in {429, 503}


# Replace generate_content on this model instance with a retrying wrapper, so
# calls that fail with one of the codes above are attempted again.
model.generate_content = retry.Retry(predicate=_is_transient_error)(model.generate_content)

In [33]:
# Rubric text placed at the top of every grading request sent to Gemini.
# It asks for four 1-5 scores (relevance, accuracy, completeness, clarity)
# as a JSON object, followed by a short free-text justification.
# The sample output further down shows replies that wrap the JSON in a
# ```json fence, so the reply is not guaranteed to be bare JSON.
rubric_instructions = '''
You are an expert evaluator for AI-generated content.
Rate the following response according to this rubric:

- Relevance (1-5): How well the response addresses the prompt.
- Accuracy (1-5): Is the response factual and correct?
- Completeness (1-5): Are all required parts of the task addressed?
- Clarity (1-5): Is the language coherent and well-structured?

Provide the scores as a JSON object like this:
{
  "relevance": 5,
  "accuracy": 4,
  "completeness": 5,
  "clarity": 4
}

Then briefly justify your scores.
'''

In [8]:
# Builds one grading request from the rubric plus the given texts and returns Gemini's reply
def evaluate_with_gemini(prompt, response, expected):
    """Ask the Gemini model to grade a single response against the rubric.

    Args:
        prompt: Text inserted under the "Prompt:" heading.
        response: Text inserted under the "Model Response:" heading.
        expected: Text inserted under the "Expected Answer:" heading.

    Returns:
        The ``.text`` of the model's reply to the assembled request.
    """
    grading_request = f"""
{rubric_instructions}

Prompt:
{prompt}

Expected Answer:
{expected}

Model Response:
{response}
"""
    return model.generate_content(grading_request).text

In [None]:
# Work on an independent copy of the first three rows
df_sample = df.head(3).copy()


def _grade_row(row):
    """Send one row's prompt/response pair to the Gemini grader."""
    # NOTE(review): the fine-tuned prompt is passed in the "expected answer"
    # slot — confirm this is the intended reference text.
    return evaluate_with_gemini(
        row["original_prompt"],
        row["response_to_fine_tuned"],
        row["fine_tuned_prompt"],
    )


# Row-wise apply; .loc stores the resulting Series as the "evaluation" column
df_sample.loc[:, "evaluation"] = df_sample.apply(_grade_row, axis=1)

In [19]:
# Show only the columns needed to sanity-check the evaluations
preview_columns = ["original_prompt", "response_to_fine_tuned", "evaluation"]
print(df_sample[preview_columns])

                                     original_prompt  \
0   One-pot vegetarian pasta recipes for busy nights   
1  We have the following blog content... what is ...   
2  how o sort element using merge sort technique ...   

                              response_to_fine_tuned  \
0  Okay, let's analyze the strengths and weakness...   
1  Okay, let's break down the likely user intent ...   
2  ```java\r\npublic class MergeSort {\r\n\r\n   ...   

                                          evaluation  
0  {\n  "relevance": 5,\n  "accuracy": 4,\n  "com...  
1  ```json\n{\n  "relevance": 5,\n  "accuracy": 4...  
2  ```json\n{\n  "relevance": 5,\n  "accuracy": 5...  


In [31]:
from chromadb import Documents, EmbeddingFunction, Embeddings
import google.generativeai as genai
from google.api_core import retry
from google.api_core import exceptions

def _is_retriable(e):
    """Return True for transient API errors that are worth retrying.

    Matches ServiceUnavailable / ResourceExhausted exceptions, as well as any
    error object exposing a `code` attribute equal to 429 or 503.
    """
    return isinstance(e, (exceptions.ServiceUnavailable, exceptions.ResourceExhausted)) or \
           (hasattr(e, 'code') and e.code in {429, 503})


# An embedding function that uses the Gemini embedding model to convert text into embeddings.
# It embeds documents or queries depending on whether document_mode is True or False.
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Embed text with the Gemini "models/embedding-001" model.

    Attributes:
        document_mode: When True, texts are embedded with the
            "retrieval_document" task type; otherwise "retrieval_query".
    """

    document_mode = True  # Specify whether to generate embeddings for documents or queries

    # Kept on the class so existing `GeminiEmbeddingFunction.is_retriable(err)` calls still work.
    is_retriable = staticmethod(_is_retriable)

    def __init__(self, document_mode=None):
        """Create the embedding function.

        Args:
            document_mode: Optional per-instance override of the class-level
                `document_mode`. Left as None, the class attribute applies.
        """
        # NOTE(review): defining __init__ presumably also silences the warning
        # echoed in the cell outputs when this class is instantiated — confirm.
        if document_mode is not None:
            self.document_mode = document_mode

    @staticmethod
    @retry.Retry(predicate=_is_retriable)
    def _embed_one(text, task_type):
        """Embed a single text, retrying only this request on transient errors."""
        response = genai.embed_content(
            model="models/embedding-001",
            content=text,
            task_type=task_type
        )
        # Keep just the embedding values from the response
        return response['embedding']

    def __call__(self, input: Documents) -> Embeddings:
        """Embed one string or a list of strings.

        Args:
            input: A single string or a list of strings.

        Returns:
            A list with one embedding per input text, in input order.
        """
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"

        # Handle single string or list of strings
        if isinstance(input, str):
            input = [input]

        # Retrying per request (rather than around this whole method) means a
        # transient failure on one text does not re-embed the texts before it.
        return [self._embed_one(text, embedding_task) for text in input]

In [43]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from chromadb import Documents, Embeddings

# Grades a piece of text by finding the rubric description whose embedding is closest to it
class RubricGrader:
    """Nearest-rubric grader built on Gemini embeddings.

    The four rubric descriptions are embedded once at construction time; each
    submission is then embedded and assigned the grade whose description has
    the highest cosine similarity to it.
    """

    def __init__(self):
        self.embedding_function = GeminiEmbeddingFunction()
        # Grade label -> description of what a submission at that level looks like.
        # TODO: open design questions — dict vs. list, per-category rubrics vs. a
        # single rubric, and how to build a rubric per model so the model with
        # the top-graded response can be picked.
        self.rubric_prompts = {
            "Excellent": "The submission is well-organized, addresses the prompt thoroughly, and provides in-depth analysis with robust supporting evidence.",
            "Good": "The submission addresses most aspects of the prompt with clear points but may lack depth in analysis.",
            "Average": "The submission responds to the prompt in a basic manner, though the analysis and organization are limited.",
            "Poor": "The submission is incomplete, lacks coherence, and does not satisfactorily address the prompt."
        }
        # Embed the descriptions once so grading only has to embed the submission
        self.rubric_embeddings = self._compute_rubric_embeddings()

    def _compute_rubric_embeddings(self):
        """Return a dict mapping each grade label to the embedding of its description."""
        rubric_texts = list(self.rubric_prompts.values())
        # One call to the embedding function covers every description
        vectors = self.embedding_function(rubric_texts)
        return dict(zip(self.rubric_prompts.keys(), vectors))

    def grade_submission(self, submission_prompt: str):
        """Grade one text against the rubric.

        Args:
            submission_prompt: The text to grade.

        Returns:
            A `(grade, similarity)` tuple for the best-matching rubric level.
            Only similarities strictly above -1 can win; if none does, the
            result is `(None, -1)`.
        """
        submission_embedding = self.embedding_function([submission_prompt])[0]

        # Similarity of the submission to every rubric level, in rubric order
        scored_grades = [
            (grade, cosine_similarity([submission_embedding], [rubric_embedding])[0][0])
            for grade, rubric_embedding in self.rubric_embeddings.items()
        ]

        best_grade, highest_similarity = None, -1
        for grade, sim in scored_grades:
            # Strict comparison: on ties the earlier rubric level is kept
            if sim > highest_similarity:
                best_grade, highest_similarity = grade, sim

        return best_grade, highest_similarity

# Instantiate the grader; RubricGrader.__init__ embeds the rubric descriptions up front
grader = RubricGrader()

# Below is a small example if needed

# # Example submissions to go through the grader (Have to get it so we can go through all the responses, and then select the top ones)
# example_prompts = [
#     "The paper offers an insightful discussion with robust evidence, clearly meeting and exceeding the criteria.",
#     "The response is decent and clearly structured, though some points need more elaboration.",
#     "A superficial attempt with limited analysis and organization.",
#     "There is minimal content provided, lacking coherence, focus, and clear arguments."
# ]

# # Process and display grading for each example
# for prompt in example_prompts:
#     grade, similarity = grader.grade_submission(prompt)
#     print(f"Submission Prompt: {prompt}")
#     print(f"Assigned Grade: {grade} (Similarity Score: {similarity:.2f})")
#     print("-" * 80)

  self.embedding_function = GeminiEmbeddingFunction()


### The cell below evaluates prompts using a rubric-based grading system with embeddings generated by Gemini. The system works by:

1. **Rubric Definition**
   - Defines 4 grade levels: Excellent, Good, Average, and Poor
   - Each grade has a detailed description of what constitutes that level of quality
   - These descriptions serve as reference points for grading

2. **Embedding Generation**
   - Uses Google's Gemini model to convert both:
     - The rubric descriptions into numerical vectors (embeddings)
     - Each prompt into its own embedding

3. **Similarity Scoring**
   - Compares each prompt's embedding to the rubric embeddings using cosine similarity
   - Cosine similarity can range from -1 (opposite direction) to 1 (identical direction); higher scores mean a closer match
   - The grade with the highest similarity score is assigned to the prompt

4. **Batch Processing**
   - Processes prompts in batches of 10 to manage memory and API usage
   - Includes error handling for invalid or empty prompts
   - Saves progress periodically to prevent data loss

### Understanding the Results:

The output includes:
- Grade distribution showing how many prompts fall into each category
- Average similarity scores for each grade level
- Sample graded prompts with their scores
- Statistics about processed and skipped prompts

#### Interpreting Similarity Scores
- Higher scores (closer to 1.0) indicate stronger matches with the rubric criteria
- Scores typically fall between 0.6 and 0.8 (Can be much better with more training)
- Small differences in scores can be meaningful due to the high-dimensional nature of embeddings

#### Limitations/What needs improvement
- The system relies on semantic similarity, which may not capture all aspects of quality
- Very short or very long prompts may be graded differently due to noise in the text (Preprocessing hopefully fixes but no guarantee)
- The quality of grading depends on how well the rubric descriptions differentiate between grades; the current descriptions should work reasonably well but may still need further tuning
- We need a way to apply a similar strategy to all models for future comparison, without it taking too long
   - The end goal is grading that is good enough to show which qualities each model is best at, so the right model can clearly be chosen

In [42]:
import pandas as pd
from tqdm import tqdm
import time
import numpy as np

# Read the first 100 rows of the cleaned ShareGPT prompts
df = pd.read_csv('../data/ShareGPT/separated_prompts_clean.csv').head(100)

# Initialize the grader
grader = RubricGrader()

# One dict per successfully graded prompt
results = []
# Skips and failures are counted separately so the final report can tell them apart
skipped_count = 0
error_count = 0

# Batch parameters
BATCH_SIZE = 10
SLEEP_TIME = .25  # seconds
SAVE_EVERY_N_BATCHES = 10  # how often an interim CSV is written
RESULT_COLUMNS = ['original_id', 'prompt', 'grade', 'similarity_score']

# Process prompts in batches
batch_starts = range(0, len(df), BATCH_SIZE)
for batch_number, start_idx in enumerate(tqdm(batch_starts, desc="Processing batches"), start=1):
    batch_df = df.iloc[start_idx:start_idx + BATCH_SIZE]
    batch_results = []

    # Process each prompt in the current batch
    for _, row in batch_df.iterrows():
        prompt = row['prompt']

        # Skip missing (NaN/None), non-string, and empty/whitespace-only prompts
        if not isinstance(prompt, str) or not prompt.strip():
            print(f"Skipping prompt {row['original_id']}: Invalid prompt value")
            skipped_count += 1
            continue

        try:
            grade, similarity = grader.grade_submission(prompt)
            batch_results.append({
                'original_id': row['original_id'],
                'prompt': prompt,
                'grade': grade,
                'similarity_score': similarity
            })
        except Exception as e:
            # Best effort: report the failure and keep going with the next prompt
            error_count += 1
            print(f"Error processing prompt {row['original_id']}: {str(e)}")

    # Add batch results to main results list
    results.extend(batch_results)

    # Save interim results every SAVE_EVERY_N_BATCHES batches. Counting batches
    # (not graded rows) keeps the checkpoint firing even when some prompts were
    # skipped or failed; nothing is written while there are no results yet.
    if batch_number % SAVE_EVERY_N_BATCHES == 0 and results:
        interim_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
        interim_df.to_csv(f'prompt_grading_results_interim_{len(results)}.csv', index=False)

    # Sleep between batches to avoid rate limits
    time.sleep(SLEEP_TIME)

# Convert all results to a DataFrame; explicit columns keep the summary below
# working even when no prompt could be graded
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Display summary statistics
print("\nGrade Distribution:")
print(results_df['grade'].value_counts())

print("\nAverage Similarity Score by Grade:")
print(results_df.groupby('grade')['similarity_score'].mean())

# Save final results
results_df.to_csv('prompt_grading_results_final.csv', index=False)
print("\nFinal results saved to prompt_grading_results_final.csv")

# Display sample of results (never ask for more rows than exist)
print("\nSample of graded prompts:")
print(results_df.sample(min(5, len(results_df))))

# Print statistics about processed, skipped, and failed prompts
print("\nTotal prompts processed:", len(results))
print("Total prompts in dataset:", len(df))
print("Skipped prompts:", skipped_count)
print("Failed prompts:", error_count)

  self.embedding_function = GeminiEmbeddingFunction()
Processing batches: 100%|██████████| 10/10 [00:20<00:00,  2.01s/it]


Grade Distribution:
grade
Poor         60
Good         16
Excellent    14
Average      10
Name: count, dtype: int64

Average Similarity Score by Grade:
grade
Average      0.709245
Excellent    0.687306
Good         0.754996
Poor         0.737981
Name: similarity_score, dtype: float32

Final results saved to prompt_grading_results_final.csv

Sample of graded prompts:
    original_id                                             prompt      grade  \
94        89757  Now, you need to act both as an interviewer an...  Excellent   
35        16382  give me a terraform module that will run sever...       Poor   
26        54555   Could you write cucumber scenarios for such app?       Good   
60       175882       Create a good improv comedy script on crypto       Poor   
6         73273  I have a complex task for you\r\n\r\nSeparate ...       Poor   

    similarity_score  
94          0.658958  
35          0.671703  
26          0.779992  
60          0.784713  
6           0.802017  

Tota




## Analyzing responses for best attributes in certain categories

In [46]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class ResponseEvaluator:
    """Score a response on four categories by nearest rubric-level embedding.

    Each category (Relevance, Accuracy, Completeness, Clarity) has five level
    descriptions (5 = best, 1 = worst). All descriptions are embedded once at
    construction time; a response receives, per category, the level whose
    description embedding has the highest cosine similarity to the response.
    """

    def __init__(self):
        self.embedding_function = GeminiEmbeddingFunction()

        # category -> {level: description of a response at that level}
        self.category_rubrics = {
            'Relevance': {
                5: "The response directly and fully addresses all aspects of the prompt with perfect alignment",
                4: "The response addresses the main aspects of the prompt with good alignment",
                3: "The response somewhat addresses the prompt but may have some tangential content",
                2: "The response only partially addresses the prompt with significant deviation",
                1: "The response barely addresses or misses the point of the prompt entirely"
            },
            'Accuracy': {
                5: "The response is completely factual with precise, verifiable information",
                4: "The response is mostly accurate with minor imprecisions",
                3: "The response has a mix of accurate and questionable information",
                2: "The response contains several inaccuracies",
                1: "The response is largely incorrect or misleading"
            },
            'Completeness': {
                5: "The response comprehensively covers all required elements with additional valuable insights",
                4: "The response covers all required elements thoroughly",
                3: "The response covers most required elements with some gaps",
                2: "The response misses several required elements",
                1: "The response is significantly incomplete"
            },
            'Clarity': {
                5: "The response is exceptionally well-structured, clear, and easy to understand",
                4: "The response is well-organized and clearly expressed",
                3: "The response is generally clear but could be better organized",
                2: "The response is somewhat unclear or poorly structured",
                1: "The response is confusing and poorly organized"
            }
        }

        # Embed every rubric description up front
        self.category_embeddings = self._compute_rubric_embeddings()

    def _compute_rubric_embeddings(self):
        """Return `{category: {level: embedding}}` for every rubric description."""
        # Each description is embedded with its own call, category by category
        return {
            category: {
                level: self.embedding_function([description])[0]
                for level, description in levels.items()
            }
            for category, levels in self.category_rubrics.items()
        }

    def evaluate_response(self, response: str):
        """Score one response against every rubric category.

        Args:
            response: The response text to evaluate.

        Returns:
            A dict with:
              - 'scores': category -> best-matching level
              - 'similarities': category -> similarity of that best level
              - 'overall_score': mean of the category levels
              - 'strongest_category' / 'weakest_category': the categories whose
                best-level similarity is highest / lowest
        """
        response_embedding = self.embedding_function([response])[0]

        scores = {}
        similarities = {}
        for category in self.category_embeddings:
            # Similarity of the response to each level description of this category
            level_scores = {
                level: cosine_similarity([response_embedding], [level_emb])[0][0]
                for level, level_emb in self.category_embeddings[category].items()
            }

            # First maximum wins, so ties go to the level listed first
            top_level = max(level_scores, key=level_scores.get)
            scores[category] = top_level
            similarities[category] = level_scores[top_level]

        overall_score = np.mean(list(scores.values()))
        strongest_category = max(similarities, key=similarities.get)
        weakest_category = min(similarities, key=similarities.get)

        return {
            'scores': scores,
            'similarities': similarities,
            'overall_score': overall_score,
            'strongest_category': strongest_category,
            'weakest_category': weakest_category
        }

def evaluate_responses(responses: list, prompts: list = None, evaluator=None):
    """
    Evaluate multiple responses and return the best one

    Args:
        responses: Response texts to evaluate; must contain at least one item.
        prompts: Optional prompts, positionally aligned with `responses`.
            When given, it must provide at least one prompt per response.
        evaluator: Optional object exposing `evaluate_response(response)`.
            Pass an existing evaluator to reuse it across calls; when omitted,
            a new ResponseEvaluator is created for this call.

    Returns:
        A dict with:
          - 'best_response': the result with the highest overall score
          - 'all_results': every result, sorted by overall score (descending);
            each result also carries its 'response' and 'prompt'
          - 'summary': the average overall score and per-category average scores

    Raises:
        ValueError: If `responses` is empty, or `prompts` is given but shorter
            than `responses`.
    """
    responses = list(responses)

    # Validate before doing any evaluation work, so bad input fails immediately
    # with a clear message instead of midway through the loop.
    if not responses:
        raise ValueError("evaluate_responses requires at least one response")
    if prompts and len(prompts) < len(responses):
        raise ValueError(
            f"prompts has {len(prompts)} items but responses has {len(responses)}"
        )

    if evaluator is None:
        evaluator = ResponseEvaluator()

    results = []
    for i, response in enumerate(responses):
        result = evaluator.evaluate_response(response)
        result['response'] = response
        result['prompt'] = prompts[i] if prompts else None
        results.append(result)

    # Sort by overall score (stable: equal scores keep their input order)
    results.sort(key=lambda x: x['overall_score'], reverse=True)

    return {
        'best_response': results[0],
        'all_results': results,
        'summary': {
            'average_score': np.mean([r['overall_score'] for r in results]),
            'score_distribution': {
                category: np.mean([r['scores'][category] for r in results])
                for category in results[0]['scores'].keys()
            }
        }
    }


In [51]:
import pandas as pd
import time

from collections import defaultdict, deque

# Read the CSV file
df = pd.read_csv('../data/FineTunedResponses/responses_to_fine_tuned.csv').head(100)

# Define batch parameters
BATCH_SIZE = 20
SLEEP_TIME = .2  # seconds between batches


def _result_to_row(result):
    """Flatten one evaluation result (already tagged with its original_id) into a CSV row."""
    return {
        'original_id': result['original_id'],
        'original_prompt': result['prompt'],
        'response': result['response'],
        'overall_score': result['overall_score'],
        **{f'score_{category}': score for category, score in result['scores'].items()},
        'strongest_category': result['strongest_category'],
        'weakest_category': result['weakest_category']
    }


# Initialize empty list for all results
all_batch_results = []
total_batches = (len(df) + BATCH_SIZE - 1)//BATCH_SIZE

# Process in batches
for start_idx in range(0, len(df), BATCH_SIZE):
    batch_number = start_idx//BATCH_SIZE + 1
    print(f"\nProcessing batch {batch_number} of {total_batches}")

    # Get current batch
    batch_df = df.iloc[start_idx:start_idx + BATCH_SIZE]

    # Prepare ids, responses and prompts for this batch
    batch_ids = batch_df['original_id'].tolist()
    batch_responses = batch_df['response_to_fine_tuned'].tolist()
    batch_prompts = batch_df['original_prompt'].tolist()

    try:
        # Evaluate the batch
        batch_results = evaluate_responses(batch_responses, batch_prompts)

        # evaluate_responses returns its results sorted by score, so result
        # positions no longer line up with batch_df rows. Re-attach each id by
        # matching the (prompt, response) pair the result carries. A queue per
        # pair keeps duplicate rows distinct: the sort is stable, so duplicates
        # come back in their original order.
        ids_by_pair = defaultdict(deque)
        for original_id, prompt, response in zip(batch_ids, batch_prompts, batch_responses):
            ids_by_pair[(prompt, response)].append(original_id)
        for result in batch_results['all_results']:
            result['original_id'] = ids_by_pair[(result['prompt'], result['response'])].popleft()

        all_batch_results.append(batch_results)

        # Print batch summary
        print(f"Batch best response score: {batch_results['best_response']['overall_score']:.2f}")
        print(f"Batch average score: {batch_results['summary']['average_score']:.2f}")

        # Save intermediate results
        batch_results_df = pd.DataFrame([_result_to_row(result) for result in batch_results['all_results']])
        batch_results_df.to_csv(f'response_evaluation_results_batch_{batch_number}.csv', index=False)

    except Exception as e:
        # Best effort: report the failed batch and continue with the next one
        print(f"Error processing batch {batch_number}: {str(e)}")

    # Sleep between batches
    time.sleep(SLEEP_TIME)

# Combine all results
all_results = []
for batch_result in all_batch_results:
    all_results.extend(batch_result['all_results'])

# Without any successful batch there is nothing to summarize; fail with a clear message
if not all_results:
    raise RuntimeError("No batches were evaluated successfully; nothing to summarize.")

# Calculate overall statistics
overall_results = {
    'best_response': max(all_results, key=lambda x: x['overall_score']),
    'all_results': all_results,
    'summary': {
        'average_score': sum(r['overall_score'] for r in all_results) / len(all_results),
        'score_distribution': {
            category: sum(r['scores'][category] for r in all_results) / len(all_results)
            for category in all_results[0]['scores'].keys()
        }
    }
}

# Print final results
print("\n=== Final Results ===")
print("\nBest Overall Response:")
print(f"Overall Score: {overall_results['best_response']['overall_score']:.2f}")
print("\nCategory Scores:")
for category, score in overall_results['best_response']['scores'].items():
    print(f"{category}: {score}")

print("\nOverall Summary:")
print(f"Average Score: {overall_results['summary']['average_score']:.2f}")
print("\nCategory Averages:")
for category, score in overall_results['summary']['score_distribution'].items():
    print(f"{category}: {score:.2f}")

# Save final combined results; each result already carries its own id and prompt,
# so no positional lookup into df is needed
final_results_df = pd.DataFrame([_result_to_row(result) for result in all_results])

final_results_df.to_csv('response_evaluation_results_final.csv', index=False)
print("\nFinal results saved to response_evaluation_results_final.csv")

  self.embedding_function = GeminiEmbeddingFunction()



Best Response:
Original Prompt: One-pot vegetarian pasta recipes for busy nights

Text: Okay, let's analyze the strengths and weaknesses of these two one-pot pasta recipes:

**General Considerations for Both Recipes:**

*   **Strengths (Shared):**
    *   **Ease of Preparation:**  Both are designed for simplicity, minimizing dishes and cooking steps. They are ideal for busy weeknights.
    *   **Customizable:**  The ingredients listed are broad, allowing for flexibility based on personal preferences and what's on hand.
    *   **Budget-Friendly:** One-pot meals often utilize...

Overall Score: 5.00

Category Scores:
Relevance: 5
Accuracy: 5
Completeness: 5
Clarity: 5

Strongest in: Relevance
Weakest in: Accuracy

Overall Summary:
Average Score: 4.12

Category Averages:
Relevance: 3.40
Accuracy: 4.60
Completeness: 3.60
Clarity: 4.90

Detailed Results for All Responses:

Response 1:
Original Prompt: One-pot vegetarian pasta recipes for busy nights
Response: Okay, let's analyze the stren