# ChatGPT Capability Evaluation
This notebook evaluates the capabilities of the ChatGPT language model with respect to the task `sum_list`.
The goal is to determine up to which list cardinality the language model can correctly perform the primary operation `op_sum` on a given list of single-digit integers.

## Task Definition
The task `sum_list` defines the following operations:
- `op_sum`: calculates the sum of a given list
- `op_split`: splits the list into two sublists of equal size
- `op_merge`: merges two lists into a single one

The scoring on `op_sum` is handled by an execution operation, meaning the scoring is done by code execution.

## Define Evaluation
The evaluation requires a complete graph controller as well as a ChatGPT language model instance.

In [1]:
import getpass
import os

# Use the key from the environment when present; prompt for it only if it is missing or empty.
api_key = os.environ.get('api_key') or getpass.getpass('api_key')
# Write the key back so that later reads of os.environ see the same value.
os.environ['api_key'] = api_key

In [2]:
from pure_graph_of_thoughts.language_model import ChatGPT, GPTModel
from pure_graph_of_thoughts.controller import CompleteGraphController
import logging

# Log at INFO level so that the controller and HTTP client records show up in the cell outputs.
logging.basicConfig(level=logging.INFO)


def create_chat_gpt(model: GPTModel) -> ChatGPT:
    """
    Builds a ChatGPT language model instance for the requested model.
    The API key is read from the module-level ``api_key``.
    :param model: GPT model the instance should use
    :return: newly created ChatGPT instance
    """
    language_model = ChatGPT(model=model, api_key=api_key)
    return language_model

## Prepare Storing and Loading

In [3]:
from pure_graph_of_thoughts.language_model.chatgpt.gpt_usage import GPTUsage
import json
from pure_graph_of_thoughts.api.schema import JsonSchemaEncoder
from auto_graph_of_thoughts.baseline.model import BaselineResultSummary

# Base directory for stored artifacts; result and usage files go into per-model, per-operation subdirectories.
results_directory = '../../artifacts/results/chatgpt_capability_evaluation/sum_list/op_sum'


def get_model_dir_name(evaluated_model: GPTModel) -> str:
    """
    Gets the directory name for a given model.
    The name is the model ID with all dots and hyphens removed,
    e.g. an ID of ``gpt-3.5-turbo-0125`` yields ``gpt35turbo0125``.
    :param evaluated_model: evaluated model
    :return: the directory name of the model
    """
    return evaluated_model.id.replace('.', '').replace('-', '')


def get_results_file_name(evaluated_model: GPTModel, evaluated_operation_name: str, evaluated_cardinality: int) -> str:
    """
    Builds the path of the JSON results file of an operation evaluated with a given cardinality.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :param evaluated_cardinality: evaluated cardinality
    :return: results file name
    """
    model_dir = get_model_dir_name(evaluated_model)
    # Zero-pad the cardinality to at least two characters so that the file names sort in order
    padded_cardinality = str(evaluated_cardinality).zfill(2)
    operation_dir = f'{results_directory}/{model_dir}/{evaluated_operation_name}'
    return f'{operation_dir}/{evaluated_operation_name}_{padded_cardinality}.json'


def store_result(evaluated_model: GPTModel, evaluated_operation_name: str, evaluated_cardinality: int,
                 result: BaselineResultSummary) -> None:
    """
    Writes a baseline result summary to its JSON results file.
    Missing parent directories are created first.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :param evaluated_cardinality: evaluated cardinality
    :param result: result to store
    """
    target_path = get_results_file_name(evaluated_model, evaluated_operation_name, evaluated_cardinality)
    target_dir = os.path.dirname(target_path)
    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, mode='w', encoding='utf-8') as results_file:
        json.dump(result, results_file, cls=JsonSchemaEncoder, ensure_ascii=False, indent=2)


def load_result(evaluated_model: GPTModel, evaluated_operation_name: str,
                evaluated_cardinality: int) -> BaselineResultSummary:
    """
    Reads a baseline result summary back from its JSON results file.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :param evaluated_cardinality: evaluated cardinality
    :return: loaded baseline result summary
    """
    source_path = get_results_file_name(evaluated_model, evaluated_operation_name, evaluated_cardinality)
    with open(source_path, mode='r', encoding='utf-8') as results_file:
        raw_summary = json.load(results_file)
    return BaselineResultSummary.from_dict(raw_summary)


def get_usage_file_name(evaluated_model: GPTModel, evaluated_operation_name: str) -> str:
    """
    Builds the path of the JSON file holding the model usage of an evaluated operation.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :return: usage file name
    """
    model_dir = get_model_dir_name(evaluated_model)
    operation_dir = f'{results_directory}/{model_dir}/{evaluated_operation_name}'
    return f'{operation_dir}/usage_{evaluated_operation_name}.json'


def store_usage(evaluated_model: GPTModel, evaluated_operation_name: str, usage: GPTUsage) -> None:
    """
    Writes the usage data of a GPT model to its JSON usage file.
    Missing parent directories are created first; an existing usage file is overwritten.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :param usage: usage
    """
    target_path = get_usage_file_name(evaluated_model, evaluated_operation_name)
    target_dir = os.path.dirname(target_path)
    os.makedirs(target_dir, exist_ok=True)
    with open(target_path, mode='w', encoding='utf-8') as usage_file:
        json.dump(usage, usage_file, cls=JsonSchemaEncoder, ensure_ascii=False, indent=2)


def load_usage(evaluated_model: GPTModel, evaluated_operation_name: str) -> GPTUsage:
    """
    Reads the stored model usage of an evaluated operation back from its JSON usage file.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation
    :return: GPT model usage
    """
    source_path = get_usage_file_name(evaluated_model, evaluated_operation_name)
    with open(source_path, mode='r', encoding='utf-8') as usage_file:
        raw_usage = json.load(usage_file)
    return GPTUsage.from_dict(raw_usage)

## Prepare Testbed
This section prepares the testbed.
For the generation of random lists of single-digit integers, the seed of the PRNG is set to `0`.
To evaluate the performance of the language model, an evaluator function is created for each cardinality to evaluate.

In [4]:
# Seed for the PRNGs created by the initial state generators.
seed = 0

In [5]:
from typing import Callable, Sequence

from auto_graph_of_thoughts.baseline.model import BaselineIterationResult
from pure_graph_of_thoughts.api.graph.operation import GraphOfOperations
from pure_graph_of_thoughts.api.state import State


def create_graph_of_operations_evaluator(
        controller: CompleteGraphController,
        init_state_generator: Callable[[], State],
        validate_output: Callable[[State, Sequence[State]], bool]
) -> Callable[[GraphOfOperations, int], BaselineIterationResult]:
    """
    Creates a graph of operations evaluator function based on an initial state generator.
    :param controller: controller used to execute each graph of operations
    :param init_state_generator: initial state generator, called once per evaluation
    :param validate_output: output validator receiving the initial state and the states of all sinks
    :return: graph of operations evaluator function
    """

    def evaluate_graph_of_operations(graph_of_operations: GraphOfOperations, iteration: int) -> BaselineIterationResult:
        """
        Evaluates a graph of operations on a freshly generated initial state.
        :param graph_of_operations: graph of operations to evaluate
        :param iteration: current iteration
        :return: baseline iteration result
        """
        init_state: State = init_state_generator()
        graph_of_thoughts = controller.execute_graph(
                graph_of_operations=graph_of_operations,
                init_state=init_state
        )
        # The validator is given the states of all sink thoughts, not a single state
        out_states = [sink.thought.state for sink in graph_of_thoughts.sinks]
        is_valid = validate_output(init_state, out_states)
        return BaselineIterationResult(
                graph_of_operations=graph_of_operations.to_schema(),
                graph_of_thoughts=graph_of_thoughts.to_schema(),
                is_valid=is_valid,
                # The cost is the number of nodes of the executed graph of thoughts
                cost=len(graph_of_thoughts.nodes),
                iteration=iteration
        )

    return evaluate_graph_of_operations

In [6]:
from typing import Sequence
from pure_graph_of_thoughts.api.operation import Operation
from auto_graph_of_thoughts.baseline.input_output_baseline_strategy import InputOutputBaselineStrategy
from auto_graph_of_thoughts.baseline.model import BaselineResultSummary
from random import Random


def create_single_list_init_state_generator(list_cardinality: int) -> Callable[[], State]:
    """
    Creates a generator function for initial states holding a single random list.
    The PRNG is seeded once with the module-level ``seed``; each call of the returned function
    draws ``list_cardinality`` integers between 0 and 9 (inclusive) from it.
    :param list_cardinality: list cardinality
    :return: init state generator
    """
    prng = Random(seed)

    def generate_init_state() -> State:
        digits = [prng.randint(0, 9) for _ in range(list_cardinality)]
        return {'list': digits}

    return generate_init_state


def create_sublist_init_state_generator(list_cardinality: int) -> Callable[[], State]:
    """
    Creates a generator function for initial states holding two random sublists.
    The first sublist gets half of the total cardinality (rounded down), the second one the remainder.
    The PRNG is seeded once with the module-level ``seed``; all values are integers between 0 and 9 (inclusive).
    :param list_cardinality: total list cardinality
    :return: init state generator
    """
    cardinality_first = list_cardinality // 2
    cardinality_second = list_cardinality - cardinality_first
    prng = Random(seed)

    def generate_init_state() -> State:
        # The first sublist is drawn before the second one, which fixes the order of PRNG draws
        first_sublist = [prng.randint(0, 9) for _ in range(cardinality_first)]
        second_sublist = [prng.randint(0, 9) for _ in range(cardinality_second)]
        return {'lists': [first_sublist, second_sublist]}

    return generate_init_state


def create_io_baseline_strategy(
        controller: CompleteGraphController,
        operation: Operation,
        init_state_generator: Callable[[], State],
        validate_output: Callable[[State, Sequence[State]], bool]
) -> InputOutputBaselineStrategy:
    """
    Creates an IO baseline strategy for a single operation.
    The strategy evaluates its graphs with an evaluator built from the controller,
    the initial state generator and the output validator.
    :param controller: controller
    :param operation: operation
    :param init_state_generator: initial state generator
    :param validate_output: output state validator
    :return: IO baseline strategy
    """
    graph_evaluator = create_graph_of_operations_evaluator(controller, init_state_generator, validate_output)
    return InputOutputBaselineStrategy(operation=operation, evaluate_graph=graph_evaluator)

## Run Evaluations
The evaluation is performed for the cardinalities `8` to `32` (inclusive) with `100` results each.

In [7]:
# Evaluated list cardinalities: 8 through 32; the + 1 makes the upper bound inclusive.
cardinalities = list(range(8, 32 + 1))

## Run Evaluation of `op_sum`
The primary operation `op_sum` is evaluated.

In [10]:
from auto_graph_of_thoughts.tasks.sum_list import op_sum

op_sum_name = 'op_sum'


def validate_output_op_sum(init_state, out_states) -> bool:
    """
    Validates the output of op_sum against the sum calculated by code.
    Only the first output state is inspected.
    :param init_state: initial state holding the input list under the key 'list'
    :param out_states: output states of the executed graph
    :return: whether the first output state holds the correct sum under the key 'sum'
    """
    # A run without any output state counts as invalid instead of raising an IndexError
    if not out_states:
        return False
    first_out_state = out_states[0]
    return 'sum' in first_out_state and sum(init_state['list']) == first_out_state['sum']

def evaluate_op_sum(evaluated_model: GPTModel) -> None:
    """
    Runs the op_sum evaluation for every configured cardinality on the given model.
    For each cardinality, 100 results are generated and stored.
    :param evaluated_model: evaluated model
    """
    language_model = create_chat_gpt(evaluated_model)
    controller = CompleteGraphController(language_model=language_model)
    for list_cardinality in cardinalities:
        init_state_generator = create_single_list_init_state_generator(list_cardinality)
        strategy = create_io_baseline_strategy(controller, op_sum, init_state_generator, validate_output_op_sum)
        summary = strategy.generate(100)
        store_result(evaluated_model, op_sum_name, list_cardinality, summary)
        # The usage file does not depend on the cardinality, so it is rewritten after each one
        store_usage(evaluated_model, op_sum_name, language_model.usage)

## Run Evaluation of `op_sum` on `gpt-3.5-turbo-0125`
The primary operation `op_sum` is evaluated on `gpt-3.5-turbo-0125`.

In [None]:
# Runs the evaluation loop over all cardinalities; evaluate_op_sum writes results and usage to disk.
evaluate_op_sum(GPTModel.GPT_35_TURBO_0125)

## Run Evaluation of `op_sum` on `gpt-3.5-turbo-1106`
The primary operation `op_sum` is evaluated on `gpt-3.5-turbo-1106`.

In [22]:
# Runs the evaluation loop over all cardinalities; evaluate_op_sum writes results and usage to disk.
evaluate_op_sum(GPTModel.GPT_35_TURBO_1106)

INFO:CompleteGraphController:Traversing node 27a8684b-2bcd-414a-9779-d31b0eae0336
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:CompleteGraphController:Traversing node 4b366e6d-b841-4751-beee-f98c294c8922
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:CompleteGraphController:Traversing node aa1d5834-dd0b-42b4-9d13-a99929cd04bb
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https


## Run Evaluation of `op_sum` on `gpt-4-turbo-2024-04-09`
The primary operation `op_sum` is evaluated on `gpt-4-turbo-2024-04-09`.


In [11]:
# Runs the evaluation loop over all cardinalities; evaluate_op_sum writes results and usage to disk.
evaluate_op_sum(GPTModel.GPT_4_TURBO_2024_04_09)

INFO:CompleteGraphController:Traversing node 5b880939-7a87-4f3e-bdf3-ad3109def590
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:CompleteGraphController:Traversing node 7843945b-653a-4f89-9b8e-a1e05e866558
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:CompleteGraphController:Traversing node 460ecafd-24a7-4c89-87cd-18ddfcab70f8
INFO:CompleteGraphController:Processing operation PromptOperation(name='generate_single', n_inputs=1, n_outputs=1, type=<OperationType.generate: 'generate'>, output_complexity=1)
INFO:httpx:HTTP Request: POST https

## Load Result Data
The result data is loaded from disk for further analysis.

In [12]:
from typing import Dict

def load_evaluation_results(evaluated_model: GPTModel, evaluated_operation_name: str) -> Dict[int, BaselineResultSummary]:
    """
    Loads the stored result summaries of all configured cardinalities from their JSON files.
    :param evaluated_model: evaluated model
    :param evaluated_operation_name: evaluated operation name
    :return: evaluation results, keyed by cardinality
    """
    return {
        list_cardinality: load_result(evaluated_model, evaluated_operation_name, list_cardinality)
        for list_cardinality in cardinalities
    }

In [13]:
# Stored op_sum result summaries of this model, keyed by cardinality.
gpt35turbo0125_op_sum_results = load_evaluation_results(GPTModel.GPT_35_TURBO_0125, op_sum_name)

In [24]:
# Stored op_sum result summaries of this model, keyed by cardinality.
gpt35turbo1106_op_sum_results = load_evaluation_results(GPTModel.GPT_35_TURBO_1106, op_sum_name)

In [14]:
# Stored op_sum result summaries of this model, keyed by cardinality.
gpt4turbo20240409_op_sum_results = load_evaluation_results(GPTModel.GPT_4_TURBO_2024_04_09, op_sum_name)

## Visualize Results
The results of the evaluation are visualized.

In [15]:
# Bar colors used for valid and invalid results in the charts.
COLOR_VALID = '#6acc64'
COLOR_INVALID = '#d65f5f'

### Bar Chart All Results (atomic)

In [19]:
import plotly.express as px
import pandas as pd

# Labels of the 'validity' column; they also key the color map of the bar chart.
is_valid = 'valid'
is_invalid = 'invalid'
# Maps the boolean is_valid flag of a result to its label.
validity_map = {True: is_valid, False: is_invalid}

def visualize_results_atomic(evaluated_model: GPTModel, evaluation_results: Dict[int, BaselineResultSummary]) -> None:
    """
    Shows a bar chart of the evaluation results of a model.
    Every single result contributes one row with a count of 1, colored by its validity.
    :param evaluated_model: evaluated model
    :param evaluation_results: evaluation results
    """
    cardinality_column = []
    validity_column = []
    for cardinality, summary in evaluation_results.items():
        for result in summary.results:
            cardinality_column.append(cardinality)
            validity_column.append(validity_map[result.is_valid])

    df_results_atomic = pd.DataFrame({
        'cardinality': cardinality_column,
        'validity': validity_column,
        # One row per result, each counting as a single result
        'n_results': [1] * len(cardinality_column)
    })
    axis_labels = {
        'cardinality': 'list cardinality',
        'n_results': 'number of results'
    }
    validity_colors = {is_valid: COLOR_VALID, is_invalid: COLOR_INVALID}
    fig = px.bar(
            df_results_atomic,
            x='cardinality',
            y='n_results',
            color='validity',
            title=f'Evaluation Results for {evaluated_model.id}',
            color_discrete_map=validity_colors,
            template='simple_white',
            labels=axis_labels,
            height=400
    )
    # Show a tick for every cardinality
    fig.update_xaxes(dtick=1)
    fig.show()

In [20]:
# Bar chart of valid and invalid results per cardinality for this model.
visualize_results_atomic(GPTModel.GPT_35_TURBO_0125, gpt35turbo0125_op_sum_results)

In [25]:
# Bar chart of valid and invalid results per cardinality for this model.
visualize_results_atomic(GPTModel.GPT_35_TURBO_1106, gpt35turbo1106_op_sum_results)

In [21]:
# Bar chart of valid and invalid results per cardinality for this model.
visualize_results_atomic(GPTModel.GPT_4_TURBO_2024_04_09, gpt4turbo20240409_op_sum_results)

In [7]:
#fig.write_image(f'diagrams/chatgpt_capability_evaluation_op_sum.jpeg', scale=10)

## Determine Success Probabilities
The probability of a successful operation for each list cardinality is determined.

In [8]:
# Success probability per cardinality: valid results divided by the 100 results generated per cardinality.
gpt35turbo0125_probabilities_op_sum = [
    dict(
        cardinality=list_cardinality,
        probability=sum(1 for result in result_summary.results if result.is_valid) / 100.0
    )
    for list_cardinality, result_summary in gpt35turbo0125_op_sum_results.items()
]
gpt35turbo0125_probabilities_op_sum

[{'cardinality': 8, 'probability': 1.0},
 {'cardinality': 9, 'probability': 0.98},
 {'cardinality': 10, 'probability': 0.88},
 {'cardinality': 11, 'probability': 0.89},
 {'cardinality': 12, 'probability': 0.75},
 {'cardinality': 13, 'probability': 0.74},
 {'cardinality': 14, 'probability': 0.58},
 {'cardinality': 15, 'probability': 0.5},
 {'cardinality': 16, 'probability': 0.34},
 {'cardinality': 17, 'probability': 0.3},
 {'cardinality': 18, 'probability': 0.21},
 {'cardinality': 19, 'probability': 0.1},
 {'cardinality': 20, 'probability': 0.09},
 {'cardinality': 21, 'probability': 0.12},
 {'cardinality': 22, 'probability': 0.05},
 {'cardinality': 23, 'probability': 0.09},
 {'cardinality': 24, 'probability': 0.04},
 {'cardinality': 25, 'probability': 0.04},
 {'cardinality': 26, 'probability': 0.04},
 {'cardinality': 27, 'probability': 0.02},
 {'cardinality': 28, 'probability': 0.0},
 {'cardinality': 29, 'probability': 0.0},
 {'cardinality': 30, 'probability': 0.01},
 {'cardinality': 31

In [26]:
# Success probability per cardinality: valid results divided by the 100 results generated per cardinality.
gpt35turbo1106_probabilities_op_sum = [
    dict(
        cardinality=list_cardinality,
        probability=sum(1 for result in result_summary.results if result.is_valid) / 100.0
    )
    for list_cardinality, result_summary in gpt35turbo1106_op_sum_results.items()
]
gpt35turbo1106_probabilities_op_sum

[{'cardinality': 8, 'probability': 1.0},
 {'cardinality': 9, 'probability': 0.97},
 {'cardinality': 10, 'probability': 0.94},
 {'cardinality': 11, 'probability': 0.75},
 {'cardinality': 12, 'probability': 0.75},
 {'cardinality': 13, 'probability': 0.64},
 {'cardinality': 14, 'probability': 0.55},
 {'cardinality': 15, 'probability': 0.48},
 {'cardinality': 16, 'probability': 0.4},
 {'cardinality': 17, 'probability': 0.25},
 {'cardinality': 18, 'probability': 0.09},
 {'cardinality': 19, 'probability': 0.12},
 {'cardinality': 20, 'probability': 0.13},
 {'cardinality': 21, 'probability': 0.07},
 {'cardinality': 22, 'probability': 0.08},
 {'cardinality': 23, 'probability': 0.08},
 {'cardinality': 24, 'probability': 0.03},
 {'cardinality': 25, 'probability': 0.06},
 {'cardinality': 26, 'probability': 0.06},
 {'cardinality': 27, 'probability': 0.07},
 {'cardinality': 28, 'probability': 0.04},
 {'cardinality': 29, 'probability': 0.02},
 {'cardinality': 30, 'probability': 0.05},
 {'cardinality'

In [27]:
# Success probability per cardinality: valid results divided by the 100 results generated per cardinality.
gpt4turbo20240409_probabilities_op_sum = [
    dict(
        cardinality=list_cardinality,
        probability=sum(1 for result in result_summary.results if result.is_valid) / 100.0
    )
    for list_cardinality, result_summary in gpt4turbo20240409_op_sum_results.items()
]
gpt4turbo20240409_probabilities_op_sum


[{'cardinality': 8, 'probability': 1.0},
 {'cardinality': 9, 'probability': 1.0},
 {'cardinality': 10, 'probability': 1.0},
 {'cardinality': 11, 'probability': 0.98},
 {'cardinality': 12, 'probability': 0.97},
 {'cardinality': 13, 'probability': 0.9},
 {'cardinality': 14, 'probability': 0.89},
 {'cardinality': 15, 'probability': 0.8},
 {'cardinality': 16, 'probability': 0.72},
 {'cardinality': 17, 'probability': 0.51},
 {'cardinality': 18, 'probability': 0.49},
 {'cardinality': 19, 'probability': 0.34},
 {'cardinality': 20, 'probability': 0.27},
 {'cardinality': 21, 'probability': 0.12},
 {'cardinality': 22, 'probability': 0.07},
 {'cardinality': 23, 'probability': 0.08},
 {'cardinality': 24, 'probability': 0.1},
 {'cardinality': 25, 'probability': 0.06},
 {'cardinality': 26, 'probability': 0.11},
 {'cardinality': 27, 'probability': 0.09},
 {'cardinality': 28, 'probability': 0.07},
 {'cardinality': 29, 'probability': 0.1},
 {'cardinality': 30, 'probability': 0.04},
 {'cardinality': 31,