# Overview
- This notebook allows a user to explore evaluations of tested prompts.

# Setup

In [54]:
import pandas as pd
import os
import json
from glob import glob
from base import DATA_PATH

EVAL_DIR = '../data/evaluations'
files = glob(os.path.join(EVAL_DIR, '*.parquet'))
dfs = [pd.read_parquet(file) for file in files]
merged = pd.concat(dfs, ignore_index=True)

print(merged.shape)
cols = ['example_no','prompt_version','llm_bleu_score','llm_rouge_score','llm_f1']
example_1 = merged[merged.example_no=='example']
# example_2 = merged[merged.example_no=='example-1']
example_1[cols].sort_values(by='llm_f1')

(11, 24)


Unnamed: 0,example_no,prompt_version,llm_bleu_score,llm_rouge_score,llm_f1
1,example,7,0.449965,"{'rouge-1': {'f': 0.6071428521444515, 'p': 0.6...",0.75
3,example,6,0.432629,"{'rouge-1': {'f': 0.6140350827192983, 'p': 0.6...",0.75
4,example,4,0.306604,"{'rouge-1': {'f': 0.591304342826465, 'p': 0.58...",0.75
5,example,5,0.317677,"{'rouge-1': {'f': 0.5614035037719299, 'p': 0.5...",0.75
7,example,1,0.453636,"{'rouge-1': {'f': 0.5607476585728012, 'p': 0.6...",0.75
9,example,3,0.364264,"{'rouge-1': {'f': 0.5840707914605686, 'p': 0.5...",0.75
10,example,2,0.389698,"{'rouge-1': {'f': 0.5794392473578479, 'p': 0.6...",0.75
8,example,8,0.383626,"{'rouge-1': {'f': 0.5739130384786391, 'p': 0.5...",0.823529


# Review
- Taking a look at the LLM response with the highest F1 score.
## Expected output

In [45]:
example='example'
output_path = (f'{DATA_PATH}/output-{example}.txt')
with open(output_path, 'r') as file:
    expected_output = json.load(file)

In [56]:
expected_output

{'changes': [{'type': 'Query',
   'field': 'getWeather',
   'change': "Renamed input parameter 'location' to 'city'",
   'breaking': True,
   'release_note': 'The input parameter for `getWeather` has been renamed from `location` to `city`. This is a breaking change, so make sure to update any queries that use `location` to `city`.'},
  {'type': 'Weather',
   'field': 'visibility',
   'change': "Added new Int field 'visibility'",
   'breaking': False,
   'release_note': "We've added a new `visibility` field to the `Weather` type. You can now get visibility information in your weather queries without modifying existing ones. This is a non-breaking change."}],
 'release_notes': {'summary': 'This release introduces a breaking change with the renaming of the `location` parameter to `city` in the `getWeather` query, and a non-breaking enhancement with the addition of a new `visibility` field in the `Weather` type.'}}

## LLM output

In [55]:
json.loads(example_1[example_1.prompt_version==8].llm_change_report.values[0])

{'changes': [{'type': 'Query',
   'field': 'getWeather',
   'change': "Parameter name changed from 'location' to 'city'",
   'breaking': True,
   'release_note': 'The parameter name in `getWeather` query has been changed from `location` to `city`. This is a breaking change - existing queries using the `location` parameter will need to be updated to use `city` instead.'},
  {'type': 'Weather',
   'field': 'visibility',
   'change': "Added new Int field 'visibility'",
   'breaking': False,
   'release_note': 'A new nullable field `visibility` has been added to the `Weather` type. This is a non-breaking change that provides additional weather information.'}],
 'release_notes': {'summary': 'This release includes one breaking change to the `getWeather` query parameter (renamed from `location` to `city`) and adds a new non-breaking `visibility` field to the `Weather` type. Clients must update their existing queries that use the `getWeather` query to use the new parameter name `city`. The new

# Claude review
- How does self model evaluation differ, what does Claude believe is the best?

Explanation of scoring:
- All responses captured the core changes correctly (parameter rename and new visibility field)
- Minor variations in wording were overlooked as requested
- Small deductions were made for:
  * Slight differences in release note phrasing
  * Varying levels of detail in the summaries
  * Minor differences in how the changes were described
- Response 8 got a slightly lower score due to having a more concise summary that missed some details present in the expected output
- None had major errors or missing information, hence the generally high scores
- Perfect matches weren't required for full credit, as requested to be lenient with minor differences

In [67]:
from claude import create_message

prompt = f"""You are a Developer that is reviewing code changes to schemas. Given the expected output:
{expected_output}

Score each of the below responses out of 100% based on how well they match the expected output.
Be lenient with minor differences in punctuation, word order, capitalization, and spacing.
{'response: '.join(example_1.llm_change_report.values)}

Return your response in JSON format with the following structure:
{{
    "accuracy": 78%,
    "accuracy": 52%,
}}
"""
response = create_message(prompt, 4096)

Debug: Sending prompt to Claude (length: 9071): You are a Developer that is reviewing code changes to schemas. Given the expected output:
{'changes': [{'type': 'Query', 'field': 'getWeather', 'change': "Renamed input parameter 'location' to 'city'"...
Debug: Successfully received response from Claude.


In [80]:
def extract_json(res):
    s = res.find("{")
    e = res.rfind("}") + 1
    return json.loads(res[s:e])

res_dict = extract_json(response[0].text)
res_dict.values()

dict_values(['95%', '92%', '90%', '93%', '94%', '91%', '92%', '88%'])

In [84]:
example_1['llm_self_eval'] = res_dict.values()
cols = ['example_no','prompt_version','llm_bleu_score','llm_rouge_score','llm_f1','llm_self_eval']
example_1[cols].sort_values(by='llm_self_eval')

Unnamed: 0,example_no,prompt_version,llm_bleu_score,llm_rouge_score,llm_f1,llm_self_eval
10,example,2,0.389698,"{'rouge-1': {'f': 0.5794392473578479, 'p': 0.6...",0.75,88%
4,example,4,0.306604,"{'rouge-1': {'f': 0.591304342826465, 'p': 0.58...",0.75,90%
8,example,8,0.383626,"{'rouge-1': {'f': 0.5739130384786391, 'p': 0.5...",0.823529,91%
3,example,6,0.432629,"{'rouge-1': {'f': 0.6140350827192983, 'p': 0.6...",0.75,92%
9,example,3,0.364264,"{'rouge-1': {'f': 0.5840707914605686, 'p': 0.5...",0.75,92%
5,example,5,0.317677,"{'rouge-1': {'f': 0.5614035037719299, 'p': 0.5...",0.75,93%
7,example,1,0.453636,"{'rouge-1': {'f': 0.5607476585728012, 'p': 0.6...",0.75,94%
1,example,7,0.449965,"{'rouge-1': {'f': 0.6071428521444515, 'p': 0.6...",0.75,95%


## Expected output

In [101]:
expected_output

{'changes': [{'type': 'Query',
   'field': 'getWeather',
   'change': "Renamed input parameter 'location' to 'city'",
   'breaking': True,
   'release_note': 'The input parameter for `getWeather` has been renamed from `location` to `city`. This is a breaking change, so make sure to update any queries that use `location` to `city`.'},
  {'type': 'Weather',
   'field': 'visibility',
   'change': "Added new Int field 'visibility'",
   'breaking': False,
   'release_note': "We've added a new `visibility` field to the `Weather` type. You can now get visibility information in your weather queries without modifying existing ones. This is a non-breaking change."}],
 'release_notes': {'summary': 'This release introduces a breaking change with the renaming of the `location` parameter to `city` in the `getWeather` query, and a non-breaking enhancement with the addition of a new `visibility` field in the `Weather` type.'}}

## LLM output

In [100]:
json.loads(example_1[example_1.prompt_version==7].llm_change_report.values[0])

{'changes': [{'type': 'Query',
   'field': 'getWeather',
   'change': "Parameter 'location' renamed to 'city'",
   'breaking': True,
   'release_note': 'The parameter name for `getWeather` query has been changed from `location` to `city`. This is a breaking change and requires clients to update their queries to use the new parameter name.'},
  {'type': 'Weather',
   'field': 'visibility',
   'change': "Added new Int field 'visibility'",
   'breaking': False,
   'release_note': 'A new field `visibility` has been added to the `Weather` type. This optional field provides visibility information. This is a non-breaking change.'}],
 'release_notes': {'summary': 'This release includes one breaking change to the `getWeather` query, where the input parameter has been renamed from `location` to `city`. Additionally, a non-breaking change adds a new `visibility` field to the `Weather` type. Clients must update their queries to use the new parameter name `city`, but can optionally start using the 