Setting up the environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import userdata
import os

os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
os.environ['ANTHROPIC_API_KEY'] = userdata.get('ANTHROPIC_API_KEY')
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')

requirements.txt

In [10]:
%pip install transformers
%pip install langchain
%pip install openai
%pip install google-generativeai
%pip install boto3
%pip install anthropic
%pip install langchain-google-genai
%pip install langchain-huggingface
%pip install langchain-openai
%pip install langchain-anthropic
%pip install langchain-groq
%pip install pandas
%pip install tensorflow-datasets



Imports

In [None]:
from langchain.chat_models import init_chat_model

"Smarter Models"

In [None]:
smarter_models = {
  'gemini-2.0-flash': init_chat_model(model="google_genai:gemini-2.0-flash-thinking-exp-01-21"),
  # 'deepseek': init_chat_model(model="perplexity-ai/r1-1776", model_provider="huggingface"),
  'claude-3.5-haiku': init_chat_model(model="anthropic:claude-3-5-haiku-20241022"),
  # 'o1-mini': init_chat_model(model="openai:o1-mini"),
}

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

task_prompt = "Write python code to determine if a number prime or not, consider all the edge cases in input"
system_prompt_generated_knowledge = "You are a teacher to an AI LLM. For the following task, generate a prompt that contains knowledge needed to solve the problem without giving the answer and start directly with the prompt, Task:\n"

messages = [
    SystemMessage(system_prompt_generated_knowledge),
    HumanMessage(task_prompt),
]



In [None]:
smarter_model_responses = {}
for model in smarter_models:
  smarter_model_responses[model] = smarter_models[model].invoke(messages)

In [None]:
for response in smarter_model_responses:
  print(response+':',smarter_model_responses[response].content,'\n')

gemini-2.0-flash: Task:
Write Python code to define a function called `is_prime(number)` that takes an integer as input and returns `True` if the number is a prime number and `False` otherwise.  Remember that a prime number is a natural number greater than 1 that has exactly two distinct positive divisors: 1 and itself.  When writing your code, consider and handle potential edge cases for the input number.  Specifically, think about how your code should behave for negative numbers, zero, and one, as these are not typically considered prime.  Also, think about the efficiency of your approach, especially when dealing with potentially large input numbers. Your function should be well-documented, explaining its purpose and how it handles different inputs. 

claude-3.5-haiku: Prompt: Create a Python function that determines whether a given integer is prime or not. Consider the following requirements:
- Handle different types of input (integers, floats, strings)
- Account for edge cases such

Benchmark Datasets

References

(1) https://github.com/openai/evals/blob/main/examples/mmlu.ipynb

In [9]:
# %cd /content/drive/MyDrive/AI\ Safety\ Camp/datasets/mmlu/
# !pwd
# !curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar
# !tar -xf data.tar
data_path = "/content/drive/MyDrive/AI Safety Camp/datasets/mmlu/"

/content/drive/MyDrive/AI Safety Camp/datasets/mmlu
/content/drive/MyDrive/AI Safety Camp/datasets/mmlu
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  158M  100  158M    0     0  28.4M      0  0:00:05  0:00:05 --:--:-- 32.8M


In [None]:
# !pip install tensorflow-datasets
import tensorflow_datasets as tfds
import tensorflow as tf

# Construct a tf.data.Dataset
ds = tfds.load('mnist', split='train', as_supervised=True, shuffle_files=True)

# Build your input pipeline
ds = ds.shuffle(1000).batch(128).prefetch(10).take(5)
for image, label in ds:
  pass

Prompt Generation and Storage Loop



(TBD) Prompt Library Database (maybe RAG?)

"Weaker" Models

In [None]:
weaker_models = {
  'gemini-1.5-flash-8b': init_chat_model(model="google_genai:gemini-1.5-flash-8b"),
  'gemma2-9b-it': init_chat_model(model="groq:gemma2-9b-it"),
}

weaker_model_messages = [
    SystemMessage("You are a coding assistant. For the following task, you have to generate solution python code and nothing else. Task:\n"),
    HumanMessage(task_prompt),
]

Testing Weaker Model performance on benchmarks without scaffolding prompts

In [None]:
weaker_model_responses = {}
for model in weaker_models:
  weaker_model_responses[model] = weaker_models[model].invoke(weaker_model_messages)

In [None]:
for response in weaker_model_responses:
  print(response+':',weaker_model_responses[response].content)

gemini-1.5-flash-8b: ```python
import math

def is_prime(number):
    """
    Checks if a number is prime.

    Args:
        number: The number to check.

    Returns:
        True if the number is prime, False otherwise.  Returns False for invalid input.
    """
    
    #Handle invalid input
    if not isinstance(number, int) or number <= 1:
        return False
    
    if number <= 3:
        return True
    
    if number % 2 == 0 or number % 3 == 0:
        return False
    
    i = 5
    while i * i <= number:
        if number % i == 0 or number % (i + 2) == 0:
            return False
        i += 6
    
    return True
```
gemma2-9b-it: ```python
def is_prime(number):
  if number <= 1:
    return False
  if number <= 3:
    return True
  if number % 2 == 0 or number % 3 == 0:
    return False
  i = 5
  while i * i <= number:
    if number % i == 0 or number % (i + 2) == 0:
      return False
    i += 6
  return True 
```


Testing Weaker Model performance on benchmarks with scaffolding prompt

In [None]:
scaffolded_messages = []
for model in smarter_model_responses:
  scaffolded_messages.append([
      SystemMessage("You are a coding assistant. For the following task, you have to generate solution python code and nothing else. Task:\n"),
      SystemMessage(smarter_model_responses[model].content),
      HumanMessage(task_prompt)
  ])

weaker_model_with_scaffolding_responses = {}
for model in weaker_models:
  for message in scaffolded_messages:
    weaker_model_with_scaffolding_responses[model] = weaker_models[model].invoke(message)

In [None]:
for response in weaker_model_with_scaffolding_responses:
  print(response+':',weaker_model_with_scaffolding_responses[response].content)

gemini-1.5-flash-8b: ```python
import math

def is_prime(num):
    """
    Efficiently determines if a given number is prime.

    Args:
        num: The number to check.

    Returns:
        True if the number is prime, False otherwise.  Returns False for invalid input.
    """

    # Input validation
    if not isinstance(num, (int, float)):
        print("Error: Input must be an integer or a float.")
        return False
    
    num = int(num)  # Crucial for handling potential float imprecision
    
    if num <= 1:
        return False
    elif num <= 3:
        return True
    elif num % 2 == 0 or num % 3 == 0:
        return False
    
    # Optimized primality test: Check divisibility up to the square root
    i = 5
    while i * i <= num:
        if num % i == 0 or num % (i + 2) == 0:
            return False
        i += 6
    
    return True
```
gemma2-9b-it: ```python
def is_prime(num):
  """
  Determines if a given number is prime.

  Args:
    num: The number to check.


Comparing and plotting change in performance