In [6]:
import os
from dotenv import load_dotenv
import openai
import eval
import time
# Load variables from a .env file (if present) into the process environment,
# then hand the key to the OpenAI client. The key stays None when
# GPT_SECRET_KEY is not set, so a missing key only surfaces on the first call.
load_dotenv()
openai.api_key = os.environ.get("GPT_SECRET_KEY")

## Few shot prompting
The prompt contains a sequence of unverified programs (role: user), each followed by its verified counterpart (role: assistant). It ends with one more unverified program, for which we expect the assistant to generate the verified version.

### Run single
Modify the `EXAMPLE_TO_HOLD_OUT` variable in the cell below to see the output from GPT and the corresponding output from Nagini.

In [2]:
# Name of the example withheld from the few-shot prompt; GPT is asked to
# produce the verified version of this program.
EXAMPLE_TO_HOLD_OUT = "prepend"
MAX_ATTEMPTS = 2  # Maximum number of generate-and-verify attempts.

# Generate a candidate, verify it, and retry until verification succeeds or
# the attempt budget is exhausted. When the first attempt verifies, the cell
# prints exactly what a single-shot run would.
for attempt in range(MAX_ATTEMPTS):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=eval.get_few_shot_prompt(EXAMPLE_TO_HOLD_OUT),
    )
    # The generated program is the text of the first returned choice.
    program_snippet = response["choices"][0]["message"]["content"]
    print("Generated program from GPT:")
    print(program_snippet)
    print("=====================================")
    print("Verification result:")
    verification_result = eval.run_single("list", program_snippet)
    print(verification_result)
    if verification_result == "Verification successful":
        break
    # Pause only when another attempt follows (same 5 s delay as the
    # "Run all" loop; presumably to respect API rate limits — confirm).
    if attempt < MAX_ATTEMPTS - 1:
        time.sleep(5)

Generated program from GPT:
def prepend(head: Node, val: int) -> Node:
    """Prepends a new node with the given value to the list."""
    Requires(is_list(head))
    Ensures(is_list(Result()))
    n = Node(val, head)
    Fold(is_list(n))
    return n
Verification result:
response ['', 'Verification successful', 'Verification took 4.18 seconds.']
Verification successful


### Run all
Repeat the above experiment for all examples with one example held out in each call to GPT and summarize the results.

In [7]:
from data import Data

# Fetch the example names for the "list" group; the evaluation loop in the
# next cell iterates over `examples`, holding each one out in turn.
data = Data()
examples = data.get_list_of_examples("list")
# Echo the names so the reader can see which examples will be evaluated.
print("Running Evaluation on examples:\n", examples)

Running Evaluation on examples:
 ['prepend', 'append', 'find', 'find_iter', 'remove', 'join_lists']


In [8]:
MAX_ATTEMPTS = 2  # Maximum generate-and-verify attempts per held-out example.
# Maps each example name to whether its most recent attempt verified; the
# inner loop stops at the first success, so True means "some attempt passed".
results: dict[str, bool] = {}

for example in examples:
    for i in range(MAX_ATTEMPTS):
        print("Running example:", example, " attempt:", i)
        # Build the few-shot prompt with `example` held out and ask GPT for
        # its verified version.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=eval.get_few_shot_prompt(example),
        )
        # The generated program is the text of the first returned choice.
        program_snippet = response["choices"][0]["message"]["content"]
        print("Generated program from GPT:")
        print(program_snippet)
        print("=====================================")
        print("Verification result:")
        result = eval.run_single("list", program_snippet)
        print(result)
        results[example] = result == "Verification successful"
        if results[example]:
            break
        # Back off only when a retry actually follows; sleeping after the
        # final failed attempt would just delay the next example.
        # (The 5 s pause is presumably for API rate limits — confirm.)
        if i < MAX_ATTEMPTS - 1:
            time.sleep(5)

Running example: prepend  attempt: 0
Generated program from GPT:
def prepend(head: Node, val: int) -> Node:
    """Prepends a new node with the given value to the list."""
    Requires(is_list(head))
    Ensures(is_list(Result()))
    n = Node(val, head)
    Fold(is_list(n))
    return n
Verification result:
response ['', 'Verification successful', 'Verification took 2.07 seconds.']
Verification successful
Running example: append  attempt: 0
Generated program from GPT:
def append(head: Node, val: int) -> None:
    """Appends a new node with the given value to the end of the list."""
    Requires(is_list(head))
    Ensures(is_list(head))
    Unfold(is_list(head))
    if head.next is None:
        n = Node(val)
        Fold(is_list(n))
        head.next = n
    else:
        n_old_next = head.next
        n_new_next = Node(val)
        head.next = n_new_next
        n_new_next.next = n_old_next
        Fold(is_list(n_new_next))
        append(n_new_next.next, val)
    Fold(is_list(head))

In [9]:
# Bare last expression so the notebook displays the summary dict:
# example name -> True if an attempt verified successfully.
results

{'prepend': True,
 'append': False,
 'find': True,
 'find_iter': True,
 'remove': True,
 'join_lists': True}