requirements.txt

In [None]:
!pip install inspect_ai
!pip install git+https://github.com/UKGovernmentBEIS/inspect_evals
!pip install openai
!pip install google-generativeai
!pip install groq
!pip install anthropic
!pip install langchain
!pip install -qU "langchain[openai]"
!pip install -qU "langchain[anthropic]"
!pip install -qU "langchain[groq]"
!pip install -qU "langchain[google-vertexai]"
!pip install langchain-google-genai

API Keys

In [2]:
from google.colab import userdata
import os

# Copy each provider credential from Colab's secret store into the process
# environment, where the model client libraries look for it.
for secret_name in (
    'OPENAI_API_KEY',
    'GOOGLE_API_KEY',
    'ANTHROPIC_API_KEY',
    'HF_TOKEN',
    'GROQ_API_KEY',
):
    os.environ[secret_name] = userdata.get(secret_name)

Mount Google Drive

In [3]:
from google.colab import drive
# Mount the user's Google Drive at /content/drive (Colab-only; prompts for
# authorisation the first time it runs in a session).
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Work from the project folder on Drive: the relative ./logs/, ./prompt_templates/
# and ./enhanced_prompts/ paths used later in this notebook resolve against it.
%cd /content/drive/MyDrive/AISC/

/content/drive/MyDrive/AISC


Imports

In [5]:
from inspect_ai import Task, task, task_with
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate, solver, TaskState, Generate, system_message, multiple_choice
from inspect_ai.solver._multiple_choice import prompt as lprompt
from inspect_ai.scorer import exact, choice
from inspect_ai import eval
from inspect_ai.model import get_model, GenerateConfig, Model, ChatMessageUser
from inspect_ai._util.dict import omit
from inspect_ai._util.format import format_template
from inspect_ai.util import resource

Models

In [8]:
# Base models: the models whose benchmark accuracy is being measured.
BASE_MODEL_1 = get_model("groq/gemma2-9b-it", memoize=False)
BASE_MODEL_2 = get_model("groq/llama3-8b-8192", memoize=False)

# Prompt-optimizer models: used by the scaffolding solver to rewrite prompts,
# and also evaluated on their own further down for comparison.
PROMPT_OPTIMIZER_MODEL_1 = get_model("openai/gpt-4o-mini", memoize=False)
PROMPT_OPTIMIZER_MODEL_2 = get_model(
    "anthropic/claude-3-5-haiku-20241022",
    config=GenerateConfig(max_retries=2),
    memoize=False,
)
PROMPT_OPTIMIZER_MODEL_3 = get_model("openai/o3-mini", memoize=False)
PROMPT_OPTIMIZER_MODEL_4 = get_model("anthropic/claude-3-7-sonnet-20250219", memoize=False)

In [9]:
# Test if the models were initialized
print(BASE_MODEL_1.name, (await BASE_MODEL_1.generate(input="Hello World")).completion)
print(BASE_MODEL_2.name, (await BASE_MODEL_2.generate(input="Hello World")).completion)
print(PROMPT_OPTIMIZER_MODEL_1.name, (await PROMPT_OPTIMIZER_MODEL_1.generate(input="Hello World")).completion)
print(PROMPT_OPTIMIZER_MODEL_2.name, (await PROMPT_OPTIMIZER_MODEL_2.generate(input="Hello World")).completion)
print(PROMPT_OPTIMIZER_MODEL_3.name, (await PROMPT_OPTIMIZER_MODEL_3.generate(input="Hello World")).completion)
print(PROMPT_OPTIMIZER_MODEL_4.name, (await PROMPT_OPTIMIZER_MODEL_4.generate(input="Hello World")).completion)

gemma2-9b-it Hello there! 👋 

How can I help you today? 😊

llama3-8b-8192 Hello World! Nice to meet you! How can I help you today?
gpt-4o-mini Hello! How can I assist you today?
claude-3-5-haiku-20241022 Hello! How are you doing today? Is there anything I can help you with?
o3-mini Hello there! "Hello World" is a timeless greeting—often used as the first step when learning a new programming language. How can I assist you today?
claude-3-7-sonnet-20250219 Hello! It's nice to meet you. I'm an AI assistant ready to help with information, answer questions, or just chat. Is there something specific you'd like to talk about or any way I can assist you today?


Custom Solver

In [10]:
@solver
def custom_solver(template: str, optimizer_model: Model):
    """Solver that lets `optimizer_model` rewrite the sample's user prompt.

    For each sample the optimizer template is filled in with the current user
    prompt plus the sample's metadata/store values, sent to `optimizer_model`,
    and the optimizer's completion (itself treated as a template) replaces the
    text of the user prompt. Later solvers in the chain then see the rewritten
    prompt.

    The rewrite is best-effort: if anything fails (no user prompt, optimizer
    timeout, formatting error) the failure is reported and the state is returned
    with the prompt left as it was.

    Args:
        template: Template for the optimizer request; resolved through
            `resource()`, so callers in this notebook pass a file path.
        optimizer_model: Model that produces the rewritten prompt.

    Returns:
        The async `solve(state, generate)` function for the solver chain.
    """
    prompt_template = resource(template)

    async def solve(state: TaskState, generate: Generate):
        # Pre-bind so the error handler can tell whether the optimizer ever
        # answered; previously a failure before/during generate() made the
        # handler itself crash with UnboundLocalError.
        response = None
        try:
            prompt = state.user_prompt
            # Sample metadata and store values are exposed to the templates;
            # "prompt" is dropped so it cannot shadow the key set explicitly below.
            kwargs = omit(state.metadata | state.store._data, ["prompt"])
            # NOTE(review): this passes the message object itself (not
            # prompt.text) as "prompt" — confirm the template file expects that.
            prompt_optimizer_input = format_template(prompt_template, {"prompt": prompt} | kwargs)
            response = await optimizer_model.generate(
                input=prompt_optimizer_input,
                config=GenerateConfig(max_retries=2, timeout=5),
            )
            # The completion is formatted too, so the optimizer can place the
            # original question via a {prompt} placeholder.
            prompt.text = format_template(response.completion, {"prompt": prompt.text} | kwargs)
        except Exception as e:
            # Best-effort: keep evaluating with the unmodified prompt, but say why.
            print(f"custom_solver: prompt optimization failed ({type(e).__name__}: {e})")
            if response is not None:
                print(response.completion)
        return state

    return solve

Custom Solver Langchain Agent

In [None]:
# https://github.com/UKGovernmentBEIS/inspect_ai/blob/1f6162992c01b0d258377527c2b8473821c56352/examples/bridge/langchain/agent.py
# https://python.langchain.com/docs/tutorials/agents/

# Import relevant functionality
from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage

Custom Scorer (Not needed atm)

Eval Tasks

In [None]:
from inspect_evals import arc
from inspect_evals import agieval
from inspect_evals import mmlu_pro

# Instantiate each benchmark task once so the baseline cells below can share them.
arc_evaluation_task = arc.arc_challenge()
# NOTE(review): "english" here is the LSAT-AR subset of AGIEval — confirm the
# variable name still reflects what is being measured.
agie_english_task = agieval.agie_lsat_ar()
agie_math_task = agieval.agie_math()
mmlu_pro_task = mmlu_pro.mmlu_pro()

In [14]:
# Adapted from
# https://github.com/UKGovernmentBEIS/inspect_evals/blob/main/src/inspect_evals/agieval/utils.py
#
# The upstream template also has {fewshot_string} and {cot_string} slots that the
# AGIEval task fills in before handing the template to the solver. Here the
# template goes straight to multiple_choice(), which only substitutes
# {question}, {choices} and {letters}; the two extra slots are therefore removed
# (equivalent to zero-shot, no chain-of-thought) so nothing is left unfilled.
MULTIPLE_CHOICE_TEMPLATE_EN = r"""
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.

{question}

{choices}
""".strip()

Eval without scaffolding

In [13]:
# Baseline: BASE_MODEL_1 (gemma2-9b-it) on ARC-Challenge, no prompt scaffolding.
eval(
    tasks=arc_evaluation_task,
    model=BASE_MODEL_1,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Output()

In [None]:
# Baseline: BASE_MODEL_1 on the AGIEval LSAT-AR task with the AGIEval-style prompt.
# multiple_choice() already calls the model itself, so no generate() step follows
# it — a trailing generate() would issue a second, redundant request per sample.
eval(
    tasks=agie_english_task,
    model=BASE_MODEL_1,
    solver=[
        multiple_choice(template=MULTIPLE_CHOICE_TEMPLATE_EN),
    ],
    log_dir='./logs/',
    log_format='json',
)

In [None]:
# Baseline: BASE_MODEL_1 on AGIEval math.
# No solver override here: multiple_choice() only works on samples that carry
# answer choices and is followed by its own generation step, whereas
# NOTE(review): upstream defines agie_math as a cloze (free-form answer) task —
# confirm. Running the task as defined keeps its solver and scorer matched.
eval(
    tasks=agie_math_task,
    model=BASE_MODEL_1,
    log_dir='./logs/',
    log_format='json',
)

In [None]:
# Baseline: BASE_MODEL_2 (llama3-8b-8192) on ARC-Challenge, no prompt scaffolding.
eval(
    tasks=arc_evaluation_task,
    model=BASE_MODEL_2,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Output()

In [None]:
# Baseline: BASE_MODEL_2 on the AGIEval LSAT-AR task with the AGIEval-style prompt.
# multiple_choice() already calls the model itself, so no generate() step follows
# it — a trailing generate() would issue a second, redundant request per sample.
eval(
    tasks=agie_english_task,
    model=BASE_MODEL_2,
    solver=[
        multiple_choice(template=MULTIPLE_CHOICE_TEMPLATE_EN),
    ],
    log_dir='./logs/',
    log_format='json',
)

Eval on prompt optimizer model (openai/gpt-4o-mini)

In [None]:
# Reference run: the gpt-4o-mini optimizer model answering ARC-Challenge directly.
eval(
    tasks=arc_evaluation_task,
    model=PROMPT_OPTIMIZER_MODEL_1,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Output()

Eval on prompt optimizer model (anthropic/claude-3-5-haiku-20241022)

In [None]:
# Reference run: the claude-3-5-haiku optimizer model answering ARC-Challenge directly.
eval(
    tasks=arc_evaluation_task,
    model=PROMPT_OPTIMIZER_MODEL_2,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Output()

Eval on prompt optimizer model (openai/o3-mini)

In [None]:
# Reference run: the o3-mini optimizer model answering ARC-Challenge directly.
eval(
    tasks=arc_evaluation_task,
    model=PROMPT_OPTIMIZER_MODEL_3,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Eval on prompt optimizer model (anthropic/claude-3-7-sonnet-20250219)

In [None]:
# Reference run: the claude-3-7-sonnet optimizer model answering ARC-Challenge directly.
eval(
    tasks=arc_evaluation_task,
    model=PROMPT_OPTIMIZER_MODEL_4,
    solver=[multiple_choice()],
    log_dir="./logs/",
    log_format="json",
)

Eval with scaffolding

gpt-4o-mini + llama3-8b-8192 as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_1 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_2 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_1),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_2,
    log_dir="./logs/",
    log_format="json",
)

Output()

gpt-4o-mini + gemma as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_1 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_1 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_1),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_1,
    log_dir="./logs/",
    log_format="json",
)

Output()

claude-haiku + llama3-8b-8192 as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_2 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_2 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_2),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_2,
    log_dir="./logs/",
    log_format="json",
)

claude-haiku + gemma as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_2 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_1 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_2),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_1,
    log_dir="./logs/",
    log_format="json",
)

Output()

o3-mini + llama3-8b-8192 as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_3 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_2 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_3),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_2,
    log_dir="./logs/",
    log_format="json",
)

o3-mini + gemma as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_3 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_1 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_3),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_1,
    log_dir="./logs/",
    log_format="json",
)

claude-sonnet + llama3-8b-8192 as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_4 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_2 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_4),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_2,
    log_dir="./logs/",
    log_format="json",
)

claude-sonnet + gemma as base model

In [None]:
# Scaffolded run: PROMPT_OPTIMIZER_MODEL_4 rewrites each ARC-Challenge prompt via
# the generated-knowledge template, then BASE_MODEL_1 answers it.
# A fresh arc_challenge() task is built so the shared baseline task is untouched.
scaffolded_evaluation_task = task_with(
    task=arc.arc_challenge(),
    solver=[
        custom_solver("./prompt_templates/generated_knowledge.txt", PROMPT_OPTIMIZER_MODEL_4),
        multiple_choice(),
    ],
)

eval(
    tasks=scaffolded_evaluation_task,
    model=BASE_MODEL_1,
    log_dir="./logs/",
    log_format="json",
)

Comparison

Interpreting Results

In [None]:
import json
from pathlib import Path


def extract_enhanced_prompts(log_path, output_path):
    """Pull one message per sample out of an eval log and save it as JSON.

    Reads the JSON eval log at `log_path`, and for every entry in its
    "samples" list maps the sample's "input" to the "content" of the sample's
    second message (index 1). The mapping is written to `output_path`
    (parent directories are created if missing) and also returned.

    Samples sharing the same "input" overwrite each other; a sample with fewer
    than two messages raises IndexError.

    Args:
        log_path: Path of the JSON eval log to read.
        output_path: Path of the JSON file to write.

    Returns:
        dict mapping each sample input to {"enhanced_prompt": <message content>}.
    """
    with open(log_path, 'r', encoding='utf-8') as log_file:
        log_data = json.load(log_file)

    # NOTE(review): the original comment describes messages[1] as the assistant
    # response, yet it is stored under "enhanced_prompt" — confirm this is the
    # message that should be saved.
    extracted = {
        sample['input']: {'enhanced_prompt': sample['messages'][1]['content']}
        for sample in log_data['samples']
    }

    destination = Path(output_path)
    # The output folder may not exist yet on a fresh checkout / Drive folder.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w', encoding='utf-8') as out_file:
        json.dump(extracted, out_file, indent=4)
    return extracted


if __name__ == '__main__':
    outputs = extract_enhanced_prompts(
        './logs/2025-03-22T14-43-31+00-00_arc-challenge_nxg7KJPr4jbPPqsCtsXMtT.json',
        './enhanced_prompts/hello_world1.json',
    )
