In [8]:
from schema_parser import SchemaParser
from schema_diff import SchemaDiff, json
from prompt import get_prompt
from claude import create_message
from json_tools import safe_load_json
from eval import evaluate_response, f1_results, ab_testing, rate_coherence, expert_rubric
from base import DATA_PATH

# End-to-end comparison of two schema change reports for one example:
# one produced by the Python SchemaDiff and one produced by the LLM.
# Both are printed, scored against the expected output, and then passed
# through the A/B, coherence, and rubric helpers from `eval`.

examples = ["example", "example-1"]
# Just test one example for now
example = examples[0]

# Parse both versions of the schema for the selected example.
schema_v1_parser = SchemaParser(f'{DATA_PATH}/schema-1-{example}.txt')
schema_v1 = schema_v1_parser.parse()

schema_v2_parser = SchemaParser(f'{DATA_PATH}/schema-2-{example}.txt')
schema_v2 = schema_v2_parser.parse()

# The expected change report is stored as JSON (despite the .txt suffix).
# Decode it explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
output_path = f'{DATA_PATH}/output-{example}.txt'
with open(output_path, 'r', encoding='utf-8') as file:
    expected_output = json.load(file)

# Change report computed by the Python diff implementation.
schema_diff = SchemaDiff(schema_v1, schema_v2)
python_change_report = schema_diff.detect_changes()
print(json.dumps(python_change_report, indent=2))

# Change report produced by the LLM from a prompt built on the same schemas.
prompt = get_prompt(schema_v1, schema_v2)
# 2048 is passed as the second positional argument to create_message
# (presumably a max-token limit - TODO confirm against claude.create_message).
response = create_message(prompt, 2048)
# NOTE(review): assumes the first element of `response` carries the JSON
# text in `.text`; safe_load_json's behaviour on malformed JSON is not
# visible here - confirm it cannot hand back a value the scorers reject.
llm_change_report = safe_load_json(response[0].text)
print(json.dumps(llm_change_report, indent=2))

# Model eval
# Note the argument order differs between the two scorers:
# evaluate_response(expected, actual) vs f1_results(actual, expected).
print("LLM Scoring:")
evaluate_response(expected_output, llm_change_report)
f1_results(llm_change_report, expected_output)

print("Python Method Scoring:")
evaluate_response(expected_output, python_change_report)
f1_results(python_change_report, expected_output)

# DUMMY DATA - below should be replaced with actual user feedback/testing
# Running A/B testing
llm_pref, python_pref = ab_testing(llm_change_report, python_change_report)
print(f"LLM Preference: {llm_pref:.2f}%")
print(f"Python Preference: {python_pref:.2f}%")

# Applying Likert scale to responses
llm_coherence_score, llm_ratings = rate_coherence(llm_change_report)
python_coherence_score, python_ratings = rate_coherence(python_change_report)

print(f"LLM Coherence Score: {llm_coherence_score}")
print(f"Python Coherence Score: {python_coherence_score}")
print("LLM Detailed Ratings:", llm_ratings)
print("Python Detailed Ratings:", python_ratings)

# Applying expert rubric to responses
llm_rubric_score, llm_rubric_details = expert_rubric(llm_change_report)
python_rubric_score, python_rubric_details = expert_rubric(python_change_report)

print(f"LLM Rubric Score: {llm_rubric_score}")
print(f"Python Rubric Score: {python_rubric_score}")
print("LLM Rubric Details:", llm_rubric_details)
print("Python Rubric Details:", python_rubric_details)


{
  "changes": [
    {
      "type": "Query",
      "field": "getWeather",
      "change": "Renamed input parameter 'location' to 'city'",
      "breaking": true,
      "release_note": "The input parameter for `getWeather` has been renamed from `location` to `city`. This is a breaking change, so make sure to update any queries that use `location` to `city`."
    },
    {
      "type": "Weather",
      "field": "visibility",
      "change": "Added new scalar field 'visibility'",
      "breaking": false,
      "release_note": "We've added a new `visibility` field to the `Weather` type. You can now get additional information without modifying existing queries. This is a non-breaking change."
    }
  ],
  "release_notes": {
    "summary": "This release introduces breaking changes including: Renamed input parameter 'location' to 'city'; non-breaking enhancements: Added new scalar field 'visibility'"
  }
}
{
  "changes": [
    {
      "type": "Query",
      "field": "getWeather",
      "chan

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
