In [None]:
from discovery_child_development import config, PROJECT_DIR
from discovery_child_development.getters import taxonomy
from discovery_child_development.utils import jsonl_utils as jsonl
from discovery_child_development.utils import taxonomy_labelling_utils as tlu
from discovery_child_development.utils.openai_utils import client

import pandas as pd
import random
import tiktoken
import wandb

# Model whose prices and tokenizer are looked up below; "gpt-4" is the other model priced in get_model_cost
MODEL = "gpt-3.5-turbo-1106" # "gpt-4"

def get_model_cost(model):
    """Return the hard-coded (input, output) token prices for a model.

    Args:
        model: Model name; one of "gpt-3.5-turbo-1106" or "gpt-4".

    Returns:
        Tuple ``(input_cost, output_cost)``.
        NOTE(review): presumably USD per 1K tokens - confirm against the pricing page.

    Raises:
        ValueError: If no price is recorded for ``model``.
    """
    # based on https://openai.com/pricing
    model_costs = {
        "gpt-3.5-turbo-1106": (0.001, 0.002),
        "gpt-4": (0.03, 0.06),
    }
    if model not in model_costs:
        # Fail with a clear message instead of an UnboundLocalError on the return line
        raise ValueError(
            f"No pricing recorded for model {model!r}; known models: {sorted(model_costs)}"
        )
    return model_costs[model]

# Prices for MODEL as returned by get_model_cost
# NOTE(review): presumably USD per 1K tokens - confirm against the pricing page
MODEL_INPUT_COST, MODEL_OUTPUT_COST = get_model_cost(MODEL)
# Random seed taken from the project config
SEED = config["seed"]

# Seed Python's built-in `random` module
random.seed(SEED)

# Tokenizer for MODEL; used as the default encoding of num_tokens_from_string
encoding = tiktoken.encoding_for_model(MODEL)

# Path to the labelled JSONL file (not referenced in the cells visible here)
LABELS_PATH = PROJECT_DIR / "inputs/data/labelling/taxonomy/output/training_validation_data_patents_openalex_LABELLED.jsonl"
# Directory where prompt.txt is written before being logged as a W&B artifact
PROMPT_OUT_PATH = PROJECT_DIR / "inputs/data/labelling/taxonomy/output"

In [None]:
# these functions came from: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

def num_tokens_from_string(string: str, encoding=encoding):
    """Count how many tokens `string` is split into by `encoding`.

    Args:
        string: Text to tokenize.
        encoding: Object exposing ``encode(str)``; defaults to the module-level
            encoding that was in scope when this function was defined.

    Returns:
        Number of tokens in the encoded text.
    """
    token_ids = encoding.encode(string)
    return len(token_ids)

def num_tokens_from_messages(messages, model=MODEL):
    """Return the number of tokens used by a list of messages.

    Args:
        messages: Iterable of mappings (e.g. ``{"role": ..., "content": ...}``);
            every value of every mapping is passed to the encoder.
        model: Model name used to pick the encoding and the per-message overhead.
            Unpinned "gpt-3.5-turbo*" / "gpt-4*" names are counted as their
            "-0613" snapshot after printing a warning.

    Returns:
        Total token count, including per-message overhead and the 3 tokens
        that prime the reply.

    Raises:
        NotImplementedError: If `model` matches none of the known model families.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Snapshots that share the same per-message accounting
    pinned_models = frozenset(
        [
            "gpt-3.5-turbo-0613",
            "gpt-3.5-turbo-16k-0613",
            "gpt-4-0314",
            "gpt-4-32k-0314",
            "gpt-4-0613",
            "gpt-4-32k-0613",
        ]
    )

    if model in pinned_models:
        tokens_per_message, tokens_per_name = 3, 1
    elif model == "gpt-3.5-turbo-0301":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_message = 4
        # if there's a name, the role is omitted
        tokens_per_name = -1
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    total = 0
    for message in messages:
        total += tokens_per_message
        for field, field_value in message.items():
            total += len(encoding.encode(field_value))
            if field == "name":
                total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [None]:
# Load data that has already been labelled using Prodigy, keeping only the columns of interest.
# 'accept' holds the human-chosen labels and 'model_output' the model's labels
# (they are compared as human vs. LLM output further down).
human_labels = pd.DataFrame(taxonomy.get_prodigy_labelled_data())[['id', 'text', 'source', 'accept', 'model', 'model_output']]
human_labels.head()

In [None]:
# Expand the list-like 'model_output' column so each model label gets its own row
gpt_labels = human_labels.explode('model_output')

gpt_labels.head()

In [None]:
# Sorted list of the distinct labels that appear in the model output
sorted(gpt_labels['model_output'].unique())

# OpenAI labelling

In [None]:
# Categories loaded via the project helper (presumably the flattened taxonomy - confirm in taxonomy_labelling_utils)
categories_flat = tlu.load_categories()

# Function definition built from the categories; passed to the chat completion call as `functions=[function]`
function = tlu.format_function(categories_flat)

In [None]:
def eval_gpt_output(llm_output, human_output, id, text, model, prompt):
    """Compare one set of LLM labels with the human labels for the same text.

    Args:
        llm_output: Set of labels produced by the model.
        human_output: Set of labels chosen by the human annotator.
        id: Identifier of the labelled example.
        text: Text that was labelled.
        model: Name of the model that produced `llm_output`.
        prompt: Prompt that was sent to the model.

    Returns:
        Dict holding the inputs plus comparison fields: `exact_match`,
        `no_overlap`, `label_diff` (number of LLM labels minus number of human
        labels), `n_extra_labels` and `n_missing_labels`. The last two are
        derived from the difference in label *counts*, not from which labels
        differ, so at most one of them is non-zero.
    """
    label_diff = len(llm_output) - len(human_output)
    # Split the signed count difference into its positive and negative parts
    n_extra_labels = max(label_diff, 0)
    n_missing_labels = max(-label_diff, 0)

    evaluation = {
        "id": id,
        "text": text,
        "model": model,
        "prompt": prompt,
        "output": llm_output,
        "human_output": human_output,
    }
    evaluation["exact_match"] = llm_output == human_output
    evaluation["no_overlap"] = llm_output.isdisjoint(human_output)
    evaluation["label_diff"] = label_diff
    evaluation["n_extra_labels"] = n_extra_labels
    evaluation["n_missing_labels"] = n_missing_labels
    return evaluation
    
def summarise_gpt_performance(df):
    """Aggregate per-example evaluation rows into summary metrics.

    Args:
        df: DataFrame with `exact_match`, `no_overlap` and `n_missing_labels`
            columns (the fields produced by `eval_gpt_output`).

    Returns:
        Dict with the number and proportion of exact matches, the number and
        proportion of rows with no label overlap, and the mean of
        `n_missing_labels`.
    """
    n_rows = len(df)
    exact_total = df['exact_match'].sum()
    disjoint_total = df['no_overlap'].sum()
    return {
        'exact_matches': exact_total,
        'no_overlap': disjoint_total,
        'prop_exact_matches': exact_total / n_rows,
        'avg_missing_labels': df['n_missing_labels'].mean(),
        'prop_no_overlap': disjoint_total / n_rows,
    }
    

## Analyse prodigy labelling

In [None]:
# Convert the labelled rows to a list of dicts (one per row) for the evaluation loop
human_labels_dict = human_labels[['id', 'text', 'accept', 'model','model_output']].to_dict('records')

In [None]:
# Score the model labels stored alongside the Prodigy annotations against the
# human-accepted labels; no prompt is available for these rows, so it is left empty.
prodigy_results = [
    eval_gpt_output(
        llm_output=set(record['model_output']),
        human_output=set(record['accept']),
        id=record['id'],
        text=record['text'],
        model=record['model'],
        prompt="",
    )
    for record in human_labels_dict
]

df = pd.DataFrame(prodigy_results)

summarise_gpt_performance(df)

## Compare different models (you can also tweak the prompt and run this part again to see what changes)

This code cell also logs your prompt and key metrics to Weights & Biases.

In [None]:
models = ['gpt-3.5-turbo-1106', 'gpt-4-0613']

results = {}

# The prompt template is the same for every model, so render it and write it to disk once.
temp_prompt = tlu.build_prompt("<TEXT>", categories_flat)
str_prompt = ''.join(f"{m['role']}: {m['content']}\n" for m in temp_prompt)
prompt_file = f"{PROMPT_OUT_PATH}/prompt.txt"
# Explicit encoding so the file does not depend on the platform default
with open(prompt_file, "w", encoding="utf-8") as file:
    file.write(str_prompt)

for model in models:
    # One W&B run per model. The context manager closes the run even if an API
    # call below raises, so a failed iteration does not leave a dangling run.
    with wandb.init(
        project="ISS supervised ML",
        job_type="Taxonomy labelling_prompt_engineering",
        save_code=True,
        tags=[model],
    ) as run:
        results[model] = {}
        results[model]['outputs'] = []

        # Create an artifact for the prompt and log it against this run
        prompt_artifact = wandb.Artifact('prompt_artifact', type='text')
        prompt_artifact.add_file(prompt_file)
        run.log_artifact(prompt_artifact)

        for index, row in human_labels.iterrows():
            prompt = tlu.build_prompt(row['text'], categories_flat)
            # Force the model to answer through the `predict_category` function
            r = client.chat.completions.create(
                model=model,
                temperature=0.0,
                messages=prompt,
                functions=[function],
                function_call={"name": "predict_category"},
            )
            llm_output = set(tlu.get_labels_from_gpt_response(r))
            # Human labels are pooled across every row that shares this id
            human_labels_list = human_labels[human_labels['id'] == row['id']]['accept'].values
            human_output = set([label for sublist in human_labels_list for label in sublist])
            results[model]['outputs'].append(
                eval_gpt_output(llm_output, human_output, id=row['id'], text=row['text'], model=model, prompt=prompt)
            )

        # Per-example outputs as a W&B table
        df = pd.DataFrame(results[model]['outputs'])
        wb_table = wandb.Table(
            data=df, columns=df.columns
        )
        run.log({"Outputs": wb_table})

        # Evaluation metrics
        summary_stats = summarise_gpt_performance(df)
        results[model]['stats'] = summary_stats

        # Log metrics
        run.summary["accuracy"] = summary_stats['prop_exact_matches']
        run.summary['prop_no_overlap'] = summary_stats['prop_no_overlap']
        run.summary['avg_missing_labels'] = summary_stats['avg_missing_labels']

In [None]:
# Preview the first few per-example evaluation rows for gpt-3.5-turbo-1106
pd.DataFrame(results['gpt-3.5-turbo-1106']['outputs']).head()