# Are different GPT3.5 versions different?
Do different versions of GPT3.5 produce different outputs? This notebook runs two GPT3.5 snapshots on the same prompts and measures how often the properties extracted from their responses agree.

## Generate completions

In [None]:
import subprocess

import pandas as pd

In [None]:
from evals.locations import REPO_DIR, EXP_DIR

In [None]:
# The two GPT-3.5 snapshots under comparison (dates below are MM/DD/YY).
# Newest model as of 04/12/24.
MODELA = "gpt-3.5-turbo-0125"
# Oldest model as of 04/12/24; will be deprecated in July 2024.
MODELB = "gpt-3.5-turbo-0613"

In [None]:
# Name of the study; results are read back from EXP_DIR / STUDY_NAME.
STUDY_NAME = "comparing_gpt35s_across_versions"

# Tasks on which both models are run.
TASKS = [
    "number_triplets",
    "wikipedia",
    "writing_stories",
    "self_referential",
]

# Forwarded to the generation script as `limit=`.
LIMIT = 100

Run the object-level script once for every (task, model) pair to generate the completions.

In [None]:
# Generate object-level completions for every (task, model) pair.
#
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled correctly, and `cwd=` replaces
# the former `cd ... &&` prefix. Every pair is still attempted even if an
# earlier one fails; failures are collected and reported together at the end
# instead of being silently ignored.
failed_generation_runs = []
for task in TASKS:
    for model in [MODELA, MODELB]:
        command = [
            "python3",
            f"{REPO_DIR}/evals/run_object_level.py",
            f"study_name={STUDY_NAME}",
            f"task={task}",
            f"language_model={model}",
            f"limit={LIMIT}",
            "task.set=val",
        ]
        result = subprocess.run(command, cwd=REPO_DIR)
        if result.returncode != 0:
            failed_generation_runs.append((task, model, result.returncode))

if failed_generation_runs:
    # Stop here: the analysis below expects results from both models for every task.
    raise RuntimeError(
        f"run_object_level.py failed for (task, model, exit code): {failed_generation_runs}"
    )

We also want to extract some response properties we care about

In [None]:
# Properties to extract from each response. Each name is passed to the
# extraction script as `response_property=` and is later used as a column
# prefix when comparing the two models.
RESPONSE_PROPERTIES = [
    "identity",
    "sentiment",
]

In [None]:
# Run property extraction for every result folder of the study.
#
# The directory listing is materialised and sorted up front so the iteration
# order is deterministic and is not affected by anything written to disk
# while the subprocesses run. The command is passed as an argument list
# (no shell) with `cwd=` replacing the former `cd ... &&` prefix. Failures
# are collected and reported together instead of being silently ignored.
failed_extraction_runs = []
for folder in sorted((EXP_DIR / STUDY_NAME).iterdir()):
    if not folder.is_dir():
        continue
    for response_property in RESPONSE_PROPERTIES:
        command = [
            "python3",
            f"{REPO_DIR}/evals/run_property_extraction.py",
            f"response_property={response_property}",
            f"dir={folder}",
        ]
        result = subprocess.run(command, cwd=REPO_DIR)
        if result.returncode != 0:
            failed_extraction_runs.append((str(folder), response_property, result.returncode))

if failed_extraction_runs:
    # Stop here: the analysis below reads the extracted property columns.
    raise RuntimeError(
        f"run_property_extraction.py failed for (folder, property, exit code): {failed_extraction_runs}"
    )

## Analysis
How similar are the object-level behaviors of the two models?

In [None]:
from evals.analysis.loading_data import load_dfs_with_filter
from evals.utils import get_maybe_nested_from_dict

In [None]:
# Load the result dataframes of this study. The analysis below treats `dfs`
# as a mapping from run config to dataframe.
# NOTE(review): presumably `conditions={}` applies no filter and
# `exclude_noncompliant=False` keeps non-compliant responses; confirm
# against load_dfs_with_filter.
dfs = load_dfs_with_filter(
    EXP_DIR/STUDY_NAME,
    conditions={},
    exclude_noncompliant=False,
)

In [None]:
def filter_by_dataset(dfs, dataset):
    """Select the entries of `dfs` that belong to one task.

    Args:
        dfs: mapping from a run config to that run's dataframe.
        dataset: task name to keep. It is compared for equality with the
            value `get_maybe_nested_from_dict` returns for the key path
            ('task', 'name') of each config.

    Returns:
        A new dict holding only the matching (config, dataframe) pairs;
        `dfs` itself is left unchanged.
    """
    selected = {}
    for config, df in dfs.items():
        task_name = get_maybe_nested_from_dict(config, ('task', 'name'))
        if task_name == dataset:
            selected[config] = df
    return selected

In [None]:
from evals.analysis.string_cleaning import clean_string


# Compare the two models task by task: join their results on the shared
# 'string' column and check, for every extracted response property, whether
# the cleaned values of both models agree.
MISMATCH_SAMPLE_SIZE = 10  # maximum number of disagreeing rows shown per task
MISMATCH_SAMPLE_SEED = 0   # fixed seed so the displayed sample is the same on every run

for task in TASKS:
    dfs_task = filter_by_dataset(dfs, task)
    assert len(dfs_task) == 2, (
        f"expected results from exactly 2 models for task {task!r}, found {len(dfs_task)}"
    )
    tdfs = list(dfs_task.values())
    models = [c['language_model']['model'] for c in dfs_task.keys()]
    # Columns present in both frames get a per-model suffix, e.g. 'identity_<model name>'.
    joint_df = tdfs[0].merge(tdfs[1], on='string', suffixes=('_'+models[0], '_'+models[1]))

    # One boolean '<property>_match' column per response property.
    for response_property in RESPONSE_PROPERTIES:
        cleaned_first = joint_df[f"{response_property}_{models[0]}"].apply(clean_string)
        cleaned_second = joint_df[f"{response_property}_{models[1]}"].apply(clean_string)
        joint_df[f"{response_property}_match"] = cleaned_first == cleaned_second

    for response_property in RESPONSE_PROPERTIES:
        print(f"Property: {response_property}")
        print(f"Task: {task}")
        # value_counts(normalize=True) reports fractions in [0, 1], not percentages.
        print("Match rate (fraction of rows):")
        display(joint_df[f"{response_property}_match"].value_counts(normalize=True))

    # Show a few concrete examples where the two models disagree on 'identity'.
    print(f"Task: {task}")
    print("Sample of rows where 'identity' differs between the two models:")
    cols = ['string'] + [f"{response_property}_{model}" for response_property in RESPONSE_PROPERTIES for model in models]
    mismatched = joint_df[~joint_df['identity_match']]
    display(
        mismatched[cols].sample(
            min(MISMATCH_SAMPLE_SIZE, len(mismatched)),
            random_state=MISMATCH_SAMPLE_SEED,
        )
    )