In [1]:
import os
from dotenv import load_dotenv
import openai
import eval
import utils
import time
# Load variables via python-dotenv, then hand the OpenAI client the key stored
# under GPT_SECRET_KEY (this is None when the variable is not set).
load_dotenv()
openai.api_key = os.environ.get("GPT_SECRET_KEY")

## Few shot prompting
Here the prompt contains a sequence of unverified programs (role: user), each followed by its corresponding verified program (role: assistant). The prompt ends with an unverified program, and we expect the assistant to generate the verified version.

### Run single
Modify the `EXAMPLE_TO_HOLD_OUT` variable in the cell below to see the output from GPT and the corresponding output from Nagini.

In [2]:
# Name of the example passed to the few-shot prompt builder as the held-out one.
EXAMPLE_TO_HOLD_OUT = "append"

# Build the chat messages first, then request a single completion for them.
few_shot_messages = eval.get_few_shot_prompt(EXAMPLE_TO_HOLD_OUT)
response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=few_shot_messages)

# Print the generated program and keep the processed snippet for verification.
program_snippet = utils.print_and_process_response(response)

# Check the generated snippet against the "list" examples and report the outcome.
verification_output = eval.run_single("list", program_snippet)
print("Verification result:\n", verification_output)

Generated program from GPT:
def append(head: Node, val: int) -> None:
    """Appends a new node with the given value to the end of the list."""
    Requires(is_list(head))
    Ensures(is_list(head))
    Unfold(is_list(head))
    if head.next is None:
        n = Node(val)
        Fold(is_list(n))
        head.next = n
    else:
        append(head.next, val)
    Fold(is_list(head))
response ['', 'Verification successful', 'Verification took 11.25 seconds.']
Verification result:
 Verification successful


### Run all
Repeat the above experiment for all examples with one example held out in each call to GPT and summarize the results.

In [3]:
from data import Data

# Fetch the example names registered under the "list" category; `examples`
# drives the evaluation loop in the following cell.
data = Data()
examples = data.get_list_of_examples("list")
print("Running Evaluation on examples:\n", examples)

Running Evaluation on examples:
 ['prepend', 'append', 'find', 'find_iter', 'remove', 'join_lists', 'reverse']


In [4]:
# Upper bound on how many completions are requested per held-out example.
MAX_ATTEMPTS = 3

# Maps each example name to whether its most recent attempt verified; the loop
# stops at the first success, so True means "verified within MAX_ATTEMPTS tries".
results: dict[str, bool] = {}

for example in examples:
    for attempt in range(1, MAX_ATTEMPTS + 1):
        print("Running example:", example, " attempt:", attempt)
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=eval.get_few_shot_prompt(example),
        )
        program_snippet = utils.print_and_process_response(response)
        result = eval.run_single("list", program_snippet)
        print("Verification result:\n", result, "\n\n")

        # The comparison already yields a bool; no conditional expression needed.
        results[example] = result == "Verification successful"
        if results[example]:
            break

        # Pause between retries only; sleeping after the final failed attempt
        # would just delay the next example for no benefit.
        if attempt < MAX_ATTEMPTS:
            time.sleep(5)

Running example: prepend  attempt: 1
Generated program from GPT:
def prepend(head: Node, val: int) -> Node:
    """Prepends a new node with the given value to the list."""
    Requires(is_list(head))
    Fold(is_list(head))
    Ensures(is_list(Result()))
    n = Node(val, head)
    Fold(is_list(n))
    return n
response ['', 'Translation failed', 'Invalid program: invalid.contract.position (/home/omkar/ethz/hs23/thesis/llms-for-verified-programs/nagini_examples/tmp.py@25.4)']
Verification result:
 Invalid program: invalid.contract.position (/home/omkar/ethz/hs23/thesis/llms-for-verified-programs/nagini_examples/tmp.py@25.4) 


Running example: prepend  attempt: 2
Generated program from GPT:
def prepend(head: Node, val: int) -> Node:
    """Prepends a new node with the given value to the list."""
    Requires(is_list(head))
    Ensures(is_list(Result()))
    
    Fold(is_list(head))
    n = Node(val, head)
    Fold(is_list(n))
    return n
response ['', 'Verification failed', 'Errors:',

In [5]:
import pandas as pd

# One row per held-out example, one boolean column. The column label is derived
# from MAX_ATTEMPTS so it cannot drift from the k actually used in the loop.
df = pd.Series(results, name=f"verify@k={MAX_ATTEMPTS}").to_frame()
df

Unnamed: 0,verify@k=3
prepend,False
append,True
find,False
find_iter,True
remove,True
join_lists,True
reverse,True


In [6]:
# Show the raw API response object left over from the most recent completion request.
response

<OpenAIObject chat.completion id=chatcmpl-8DT0oEcZFfAITaCn1BwpkljwwOSj9 at 0x7f3bdb7d2270> JSON: {
  "id": "chatcmpl-8DT0oEcZFfAITaCn1BwpkljwwOSj9",
  "object": "chat.completion",
  "created": 1698220478,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "def reverseList(head: Node) -> Optional[Node]:\n    \"\"\"Reverses the list and returns the new head.\"\"\"\n    Requires(is_list(head))\n    Ensures(Implies(Result() is not None, is_list(Result())))\n    \n    Unfold(is_list(head))\n    if head.next is None:\n        Fold(is_list(head))\n        return head\n    Fold(is_list(head))\n    prev = None # type: Optional[Node]\n    ptr = head # type: Optional[Node]\n    while ptr != None:\n        Invariant(Implies(ptr is not None, is_list(ptr)))\n        Invariant(Implies(prev is not None, is_list(prev)))\n        Unfold(is_list(ptr))\n        tmp = ptr.next\n        ptr.next = prev\n        prev = pt

## Experiment 2: prompting with errors

Evaluate how effective it is to use the Nagini error message (with the right line number) instead of simply retrying up to `MAX_ATTEMPTS` times.

## GPT 4 / sys prompt

### Questions / to-dos:
1. GPT random seed? Temperature hyperparameter? Try a higher temperature with a larger k.
2. Tested `MAX_ATTEMPTS=3` (worse results). How about a parameter `take=n` that determines how many examples to draw (at random) from the list of examples when constructing the few-shot prompt?
3. Experiment with a different system prompt