# Setup

## Verify we're in the Conda environment

In [None]:
import sys

# Show the path of the Python interpreter running this kernel, so it can be
# checked against the expected Conda environment.
print(sys.executable)

## Import Python packages

In [None]:
import base64
import copy
import io
import json
import os
import pprint
import re
import shlex
import subprocess
import sys
from pathlib import Path

import jupyter_black
import matplotlib.pyplot as plt
import requests
from dotenv import load_dotenv
from PIL import Image

# Activate the jupyter_black extension, which reformats code cells with black
# https://github.com/n8henrie/jupyter-black
jupyter_black.load()

In [None]:
import re


def filter_text(text):
    """Reduce a raw (possibly multi-line) answer to its first word.

    Steps, in order:
      1. Keep only the first line that contains non-whitespace characters.
         Leading blank lines are skipped, so a reply that starts with a
         newline no longer collapses to an empty string.
      2. Remove all single and double quotes.
      3. Strip leading and trailing whitespace.
      4. Keep only the part before the first space.

    Non-ASCII characters and backslashes are left untouched.

    Args:
        text: The raw answer text.

    Returns:
        The first word of the first non-blank line, or "" when `text`
        contains no non-whitespace characters.
    """
    # Only keep the first non-blank line of the answer
    first_line = next((line for line in text.split("\n") if line.strip()), "")

    # Remove quotes (both single and double), then surrounding whitespace
    cleaned = first_line.replace('"', "").replace("'", "").strip()

    # Remove everything after the first space
    return cleaned.split(" ")[0]


# Example usage
text = "Byzantine\u5171\u8bc6 ('Byzantine' \"consensus\")\n next line must be removed"
filtered_text = filter_text(text)
print(filtered_text)

In [None]:
# Define where the llama-cli is located, relative to this notebook
LLAMA_CLI_PATH = "../../../ggml_org_llama_615212.cpp/build/bin/llama-cli"

# Select a model to use (exactly one MODEL line should be active; paths are
# relative to this notebook)
MODEL = "../../../llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf"
# MODEL = "../../../llama_cpp_canister/models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q8_0.gguf"
# MODEL = "../../../llama_cpp_canister/models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf"
# MODEL = "../../../llama_cpp_canister/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q2_K.gguf"

# Module-level flag read and reset by run_llama_cpp: while True, the next call
# prints the llama-cli command line once and then sets the flag to False.
# Re-run this cell to have the command printed again.
print_command = True


def run_llama_cpp(
    prompt,
    num_tokens,
    seed,
    temp,
    top_k,
    top_p,
    min_p,
):
    """Run llama-cli once on `prompt` and return what it wrote to stdout.

    The executable and model are taken from the module-level LLAMA_CLI_PATH
    and MODEL. On the first call while the module-level `print_command` flag
    is True, the full command line is printed once and the flag is reset.

    Args:
        prompt: Prompt text passed to llama-cli via `-p`.
        num_tokens: Value for `-n`.
        seed: Value for `--seed`.
        temp: Value for `--temp`.
        top_k: Value for `--top-k`.
        top_p: Value for `--top-p`.
        min_p: Value for `--min-p`.

    Returns:
        The captured stdout of llama-cli as a string. When llama-cli exits
        with a non-zero code, a warning with the tail of its stderr is
        printed to sys.stderr and the (possibly empty) stdout is still
        returned.
    """
    command = [
        LLAMA_CLI_PATH,
        "-m",
        MODEL,
        "--no-warmup",  # needed when running from CLI. Is default for llama_cpp_canister
        "-no-cnv",  # needed when running from CLI. Is default for llama_cpp_canister
        "--simple-io",
        "--no-display-prompt",  # only return the generated text, without special characters
        # "-sp", # output special tokens
        "-n",
        f"{num_tokens}",
        "--seed",
        f"{seed}",
        "--temp",
        f"{temp}",
        "--top-k",
        f"{top_k}",
        "--top-p",
        f"{top_p}",
        "--min-p",
        f"{min_p}",
        "-p",
        prompt,
    ]

    # print this only once !
    global print_command
    if print_command:
        print_command = False
        # Print the command on a single line for terminal use. It is built
        # from the real argument list so that it cannot drift from what is
        # executed (the sampling flags used to be missing here), and each
        # argument is shell-quoted so prompts containing apostrophes survive.
        # Newlines are shown as the two characters "\n", as before.
        print("\nCommand:\n", shlex.join(command).replace("\n", "\\n"))

    # Run the command and capture the output
    result = subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )

    # Surface failures instead of silently returning an empty string; only the
    # tail of stderr is shown to keep the notebook output readable.
    if result.returncode != 0:
        stderr_tail = "\n".join(result.stderr.splitlines()[-20:])
        print(
            f"WARNING: llama-cli exited with code {result.returncode}:\n{stderr_tail}",
            file=sys.stderr,
        )

    return result.stdout

In [None]:
import json
import pprint

# Specify the file path
file_path = "2-mainer.json"

# Read the list from the JSON file.
# The encoding is set explicitly so the result does not depend on the
# platform's default text encoding.
with open(file_path, "r", encoding="utf-8") as file:
    challenges_with_answers = json.load(file)

print(f"challenges_with_answers have been read from the file: {file_path}")

# Show only the first three entries to keep the cell output short
pprint.pprint(challenges_with_answers[:3])

In [None]:
# --- Judge settings passed to run_llama_cpp ---------------------------------
judge_num_tokens = 10
judge_seed = 42

judge_temp = 0.0

# The remaining samplers are left at the defaults noted next to them.
# To disable them instead, use: top_k = 0, top_p = 1.0, min_p = 0.0
judge_top_k = 40  # (default: 40, 0 = disabled)
judge_top_p = 0.9  # (default: 0.9, 1.0 = disabled)
judge_min_p = 0.1  # (default: 0.1, 0.0 = disabled)

# Results file. It is rewritten after every newly scored answer, so progress
# can be monitored while the loop runs.
file_path = "3-judge.json"


def _build_judge_prompt(challenge_question, filtered_answer):
    """Return the ChatML-style grading prompt for one question/answer pair.

    Prompt history (kept for reference):
      - A first variant used "score" wording instead of "grade".
      - "Grade this answer:" (without "based on its correctness") was noted
        as: GOOD ! but eco. gets a 4
      - The current wording was noted as:
        GOOD ! and eco. gets a 1 ; parachain gets a 5

    The text is kept byte-identical to the evaluated prompt, including the
    leading newline and the trailing space after "question:" and
    "correctness:"; it is written line by line so that those trailing spaces
    are visible and cannot be stripped by an editor.
    """
    return (
        "\n"
        "<|im_start|>system\n"
        "You grade answers based on its correctness to the question: \n"
        "\n"
        f"- {challenge_question}\n"
        "\n"
        "Grade the answer between 1 and 5.\n"
        "1 = completely wrong\n"
        "2 = mostly wrong\n"
        "3 = partially correct\n"
        "4 = mostly correct\n"
        "5 = completely correct\n"
        "\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        "Grade this answer based on its correctness: \n"
        "\n"
        f"- {filtered_answer}\n"
        "\n"
        "Respond with the grade only, nothing else.\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )


def _parse_judge_score(judge_answer, challenge_id):
    """Convert the judge's filtered reply into an integer score.

    A reply that int() accepts is converted directly. Otherwise the first run
    of digits in the reply is used, so replies such as "5." or "5/5" do not
    abort the whole run.

    Raises:
        ValueError: when the reply contains no digits at all.
    """
    try:
        return int(judge_answer)
    except ValueError:
        digits = re.search(r"\d+", judge_answer)
    if digits is None:
        raise ValueError(
            f"No score found in judge reply {judge_answer!r} "
            f"(challenge_id: {challenge_id})"
        )
    return int(digits.group())


def _save_judge(judge, file_path):
    """Write the (partially) scored challenges to `file_path` as JSON."""
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(judge, file, indent=4)


# Deep copy: the loop adds "judge_score" to the nested answer dicts, and a
# shallow list copy would write those into challenges_with_answers as well.
judge = copy.deepcopy(challenges_with_answers)

# NOTE(review): the [0:] slice looks like a knob for starting at a later
# challenge - confirm before removing it.
for challenge in judge[0:]:
    print_prompt = True  # print the judge prompt once per challenge
    challenge_id = challenge["challenge_id"]
    challenge_question = challenge["challenge_question"]
    mainer_answers = challenge["mainer_answers"]

    # Scores already given within this challenge, keyed by filtered answer,
    # to avoid using the LLM for re-scoring
    score_by_answer = {}

    for next_mainer_answer in mainer_answers:
        filtered_answer = filter_text(next_mainer_answer["mainer_answer"])

        print("--")
        print(f"filtered_answer: {filtered_answer}")

        # Get the score if the filtered answer is already scored
        if filtered_answer in score_by_answer:
            judge_score = score_by_answer[filtered_answer]
            print(f"Using the existing score: {judge_score}")
            next_mainer_answer["judge_score"] = judge_score
            continue

        # This answer is not yet scored, so use the LLM to score it
        judge_prompt = _build_judge_prompt(challenge_question, filtered_answer)

        if print_prompt:
            print_prompt = False
            print("-------------------")
            print("The prompt for the judge:")
            print(judge_prompt)
            print(" ")

        judge_answer = run_llama_cpp(
            judge_prompt,
            judge_num_tokens,
            judge_seed,
            judge_temp,
            judge_top_k,
            judge_top_p,
            judge_min_p,
        )

        print(f"judge_answer (raw): {judge_answer}")

        judge_answer = filter_text(judge_answer)

        judge_score = _parse_judge_score(judge_answer, challenge_id)
        print(f"judge_score: {judge_score}")

        # NOTE: a second-opinion step for answers scored 5 was prototyped here
        # and is currently disabled: either re-scoring with a 1.5B model, or
        # asking a yes/no confirmation prompt and downgrading the score to 4
        # unless the reply contains "yes".

        # Save the score
        score_by_answer[filtered_answer] = judge_score
        next_mainer_answer["judge_score"] = judge_score

        # On a regular basis, save it, so we can monitor progress
        _save_judge(judge, file_path)

# Final save: answers that reused an existing score skip the in-loop save, so
# without this the last reused scores could be missing from the file.
_save_judge(judge, file_path)