In [1]:
import json
import re
from typing import Iterator
from IPython.display import display, Markdown, clear_output
from utils import getExamplesSorted, Example
from processResults import fullDiffRegex
from pathlib import PosixPath as Path

# Matches from `namespace task<N> {` lazily up to the first following
# ` // namespace task<digits>` marker, plus any trailing whitespace.
# DOTALL lets `.` span newlines so a multi-line namespace is one match.
namespaceRegex = re.compile(r"namespace task(?P<taskNum>\d+) {.*? \/\/ namespace task\d+\s*", re.DOTALL)

# Matches from a `// Task <N>` marker lazily up to (but excluding) the whitespace
# that precedes the next `// Task <digits>` marker, or up to the end of the text.
taskRegex = re.compile(r"\/\/ Task (?P<taskNum>\d+).*?(?=\s*\/\/ Task \d+|$)", re.DOTALL)

# Markdown/HTML layout rendered once per task by evaluateResult via str.format.
# Placeholders: {title}, {description}, {codeSlow}, {version}, {agentDiff}, {codeFast}.
# It produces a heading plus a three-column table: the `slow` code, the agent's diff
# (labelled with {version}), and the `fast` code.
# NOTE(review): the blank lines around each fenced block look deliberate — presumably
# required for Markdown to render inside the HTML table cells; confirm before removing.
reviewTemplate = """\

#### {title}
{description}

<table>
<tr>
<td>

`slow`:

```cpp
{codeSlow}
```

</td>
<td>

`agent ({version})`:

```diff
{agentDiff}
```

</td>
<td>

`fast`:

```cpp
{codeFast}
```

</td>
</tr>
</table>
"""

def prompt(msg: str, valid: tuple[str, ...]):
    """Ask the user `msg` via input() until the reply is one of `valid`.

    The reply is compared verbatim (no stripping or case folding); any other
    reply silently triggers the same question again. Returns the accepted reply.
    """
    while True:
        answer = input(msg)
        if answer in valid:
            return answer

def evaluateResult(testNum: int, model: str, output: dict) -> Iterator[None]:
    """Interactively review one model's diff, task by task, as a stepped generator.

    Reads ``../info{testNum}.json`` and ``../chat-logs/test{testNum}/{model}.md``,
    extracts the single full diff from the chat log, and splits it into per-task
    pieces using ``namespaceRegex`` and ``taskRegex``.

    The generator is driven by repeated ``next()`` calls and yields twice per task:

    1. after rendering the task (slow code, agent diff, fast code);
    2. after prompting for the verdict and appending it to ``output["tasks"]``.

    Resuming after step 2 clears the cell output and renders the next task; after
    the last task it clears the output and the generator ends (``StopIteration``).

    Args:
        testNum: Selects the info file and the chat-log sub-directory.
        model: Chat-log file stem (``{model}.md``).
        output: Dict mutated in place; ``output["tasks"]`` is reset to a new list
            and receives one ``dict(taskNum=..., modified=..., improved=...)`` per
            reviewed task. ``improved`` is ``"y"``/``"n"``, or ``"~"`` when the
            task was modified and ``taskVersion == 1`` (labelled FAST).

    Raises:
        ValueError: If the chat log does not contain exactly one ``fullDiffRegex``
            match, or if the examples / ``info["choices"]`` do not both have
            exactly 50 entries (``zip(..., strict=True)``).
        KeyError: If the diff has no ``// Task N`` block for a task.
    """
    with open(f"../info{testNum}.json", "r") as f:
        info = json.load(f)

    chatLog = Path("../chat-logs") / f"test{testNum}" / f"{model}.md"
    with chatLog.open("r") as f:
        # Single-element unpacking doubles as a check that the log holds exactly one diff.
        [diffMatch] = re.finditer(fullDiffRegex, f.read())
    fullDiff = diffMatch.group("fullDiff")

    # Namespace blocks are optional per task; task blocks are required (see lookup below).
    namespaces = {
        int(m.group("taskNum")): m.group(0)
        for m in re.finditer(namespaceRegex, fullDiff)
    }
    taskBlocks = {
        int(m.group("taskNum")): m.group(0)
        for m in re.finditer(taskRegex, fullDiff)
    }

    tasks = output["tasks"] = []

    def zipper() -> Iterator[tuple[int, Example, int]]:
        # strict=True makes a count mismatch (anything other than 50 tasks) fail loudly.
        yield from zip(range(1, 51), getExamplesSorted(), info["choices"], strict=True)

    for taskNum, example, taskVersion in zipper():
        assert taskNum == int(example._key)

        # Look the namespace up outside the f-string: reusing the enclosing quote
        # character inside a replacement field is a SyntaxError before Python 3.12.
        namespaceBlock = namespaces.get(taskNum, "")
        agentDiff = f"{namespaceBlock}\n\n{taskBlocks[taskNum]}"

        md = reviewTemplate.format(
            title=example.title,
            description=example.description,
            codeSlow=example.codeSlow,
            codeFast=example.codeFast,
            agentDiff=agentDiff,
            version="SLOW" if taskVersion == 0 else "FAST",
        )
        display(Markdown(md))
        yield  # pause so the rendered task is visible before the prompt appears

        modified = prompt("Modified?", ("y", "n"))
        if modified == "y":
            if taskVersion == 1:
                improved = "~"  # -> agent made changes to efficient version
            else:
                improved = prompt("Improved?", ("y", "n"))
        else:
            improved = "n"
        tasks.append(dict(taskNum=taskNum, modified=modified, improved=improved))
        yield  # pause after recording; the next step clears and moves on
        clear_output()

In [623]:
# Values recorded per task by evaluateResult:
# modified: y / n
# improved: y / n, or "~" when a modified task was one labelled FAST (taskVersion == 1)

# `output` is filled in place (output["tasks"]) as the review is stepped through.
# Creating the generator runs nothing yet; work starts on the first next(it).
output = {}
it = evaluateResult(testNum=2, model="o4-mini", output=output)

In [724]:
# Advance the review by one step. Steps alternate between rendering a task and
# prompting for its verdict; StopIteration means every task has been reviewed.
next(it)

StopIteration: 

In [723]:
# Advance the review generator by one more step (render or prompt, alternating).
next(it)

In [50]:
# Peek at the most recently recorded verdict.
output["tasks"][-1]

{'taskNum': 23, 'modified': 'y', 'improved': '~'}

In [725]:
# Persist the verdicts as JSON. Mode "x" raises FileExistsError instead of
# overwriting an evaluation that was already saved.
# NOTE(review): the path hard-codes test2 / o4-mini, duplicating the testNum and
# model passed to evaluateResult — keep the two in sync when reviewing another run.
with open("../evaluation/test2/o4-mini.json", "x") as f: json.dump(output, f)

In [7]:
# Show everything recorded so far.
output

{'tasks': [{'taskNum': 1, 'modified': 'y', 'improved': '~'},
  {'taskNum': 2, 'modified': 'n', 'improved': 'n'}]}