In [1]:
from platform import python_version

python_version()

'3.10.12'

In [8]:
import json 
import spacy

In [3]:
import pandas as pd 

In [4]:
from typing import List 

from prompt_lib.backends import openai_api

In [5]:
import openai

with open("path/to/key.txt", "r") as f:
    openai_key = f.readline()
    
assert openai_key is not None

openai.api_key = openai_key.strip()

## Get data 

In [106]:
with open("complex/samples.json", "r") as f:
    sample_problems = json.load(f)
    
sample_problems[0]["questions"][0]

{'question': 'How many manufacturers collaborated on this project?',
 'answer': '3',
 'acceptable_answers': ['three',
  '3.0',
  'three manufacturers',
  'Tesla, Volkswagen, and Bosch']}

## Helper functions for evaluation

In [15]:
"""
    To attempt to compare numerical answers
"""
def compare_numericals(solution, model_response, tolerance: float=1e-3):
    
    if model_response is None or solution is None:
        raise ValueError("Nothing to compare!")
    
    # Check if they're numbers 
    try:
        res = float(model_response.strip())
        sol = float(solution.strip())
        
        return abs(res - sol) <= tolerance
    
    except Exception:
        return False


In [47]:
nlp = spacy.load("en_core_web_sm")
"""
Tokenize and compare string tokens 
"""
def compare_tokens(solution: str, response: str) -> bool:
    sol_doc = nlp(solution.strip())
    sol_tokens = [t.text.lower() for t in sol_doc]  # todo: consider using lemma_
    
    res_doc = nlp(response.strip())
    res_tokens = [t.text.lower() for t in res_doc]  # todo: same here, consider using lemma_
    
    # comparisons
    overlap = [t for t in res_tokens if t in sol_tokens]
    
    return set(overlap) == set(sol_tokens)



In [110]:
# Some constants
MAX_TOKENS = 300
ENGINE = "gpt-3.5-turbo-0613"  # "text-davinci-003"
NUM_COMPLETIONS = 1

In [102]:
"""
   This function will craft a prompt to ChatGPT, asking it to compare 
   the two answers given to a question and check if they both answer 
   the question correctly or no. Obviously one does, because it's the 
   correct answer from the dataset. 
"""
def chatgpt_verifier(question: str, solution: str, resp: str) -> bool:
    command = f"In the context of this question, do both following answers correctly answer the question exactly? Respond with Yes or No.\n\nQuestion: {question}\nAnswer 1: {solution}\n\n Answer 2: {resp}"
    
    verification_response = openai_api.OpenaiAPIWrapper.call(
        prompt=command, 
        max_tokens=MAX_TOKENS,
        engine="gpt-3.5-turbo-0613",  # default to this version for reproducibility
        stop_token="\n\n\n",
        temperature=0.0,
        num_completions=NUM_COMPLETIONS,
    )
    
    first_resp = openai_api.OpenaiAPIWrapper.get_first_response(verification_response)
    
    return "yes" in first_resp.strip().lower()


In [14]:
# todo: at some point, we'll probably ask each model to verify the result and take a majority vote

In [86]:
# This function receives a model response and does comparisons with the ground truth
def evaluate_response(
        full_prompt: str,
        model_response: str=None,
        ground_truth: str=None,
        acceptable_answers: List[str]=None
) -> bool:
    
    if model_response is None or ground_truth is None:
        raise ValueError("Nothing to compare")
    
    # Answers line up exactly 
    if ground_truth.strip().lower() == model_response.strip().lower():
        return True
    
    elif compare_numericals(ground_truth, model_response) or\
            compare_tokens(ground_truth, model_response):
        return True

    elif chatgpt_verifier(full_prompt, ground_truth, model_response):
        return True

    else:
        # Check other acceptable answers
        if acceptable_answers is not None:
            for ans in acceptable_answers:
                if compare_tokens(ans, model_response) or\
                        compare_numericals(ans, model_response) or\
                        chatgpt_verifier(full_prompt, ans, model_response):
                    return True
    
    return False

    

### OpenAI models

In [111]:
chatgpt_df = pd.DataFrame(columns=['question', 'answer', 'model_response', 'correct?'])

dict_record = {}

for body_qa_group in sample_problems:
    
    body = body_qa_group['body']
    
    for qa_pair in body_qa_group['questions']:
        question = qa_pair['question']
        answer = qa_pair['answer']
        
        # Prompt contents 
        prompt = body + '\n' + question + '\n\n\n'
        
        response = openai_api.OpenaiAPIWrapper.call(
            prompt=prompt,
            max_tokens=MAX_TOKENS,
            engine=ENGINE,
            stop_token='\n\n\n',
            temperature=0.0,
            num_completions=NUM_COMPLETIONS
        )
        
        first_response = openai_api.OpenaiAPIWrapper.get_first_response(response)
        
        bool_check = evaluate_response(
            prompt,
            first_response,
            answer,
            # qa_pair["acceptable_answers"]
        )

        # Record each result
        dict_record.update({
            'question': question,
            'answer': answer,
            'model_response': first_response.strip(),
            'correct?': bool_check
        })
        
        chatgpt_df = pd.concat([
            chatgpt_df,
            pd.DataFrame.from_dict(dict_record, orient='index').T
        ])


In [112]:
dict_record

{'question': 'Which companies produce the Tesla coils in question?',
 'answer': 'Tesla, Volkswagen and Bosch',
 'model_response': 'The Tesla coils produced in the small educative factory for high schoolers and middle schoolers are likely produced by Tesla itself. Tesla, the electric vehicle and clean energy company founded by Elon Musk, is known for its namesake Tesla coils, which are devices that produce high-voltage, low-current electricity.',
 'correct?': False}

In [113]:
chatgpt_df

Unnamed: 0,question,answer,model_response,correct?
0,How many manufacturers collaborated on this pr...,3,Three manufacturers collaborated on this proje...,True
0,How many corporations collaborated on this pro...,3,Three corporations collaborated on this projec...,True
0,How many labels cooperated on this project?,3,"Based on the information provided, three label...",True
0,How many countries collaborated on this project?,2,"Based on the information provided, it is not e...",True
0,How many different items do they produce?,4,"Based on the information provided, the small e...",True
0,What is the target school grade range for the ...,7-12,The target school grade range for the children...,True
0,What is the target age range for the children ...,12-18,The target age range for children visiting the...,True
0,Which companies produce the Tesla coils in que...,"Tesla, Volkswagen and Bosch",The Tesla coils produced in the small educativ...,False


## ii) PaLM

In [25]:
with open('/path/to/key.txt', 'r') as f:
    makersuite_key = f.readline().strip()
    
assert makersuite_key is not None


In [26]:
import google.generativeai as palm

palm.configure(api_key=makersuite_key)

In [27]:
list(palm.list_models())

[Model(name='models/chat-bison-001', base_model_id='', version='001', display_name='Chat Bison', description='Chat-optimized generative language model.', input_token_limit=4096, output_token_limit=1024, supported_generation_methods=['generateMessage', 'countMessageTokens'], temperature=0.25, top_p=0.95, top_k=40),
 Model(name='models/text-bison-001', base_model_id='', version='001', display_name='Text Bison', description='Model targeted for text generation.', input_token_limit=8196, output_token_limit=1024, supported_generation_methods=['generateText', 'countTextTokens'], temperature=0.7, top_p=0.95, top_k=40),
 Model(name='models/embedding-gecko-001', base_model_id='', version='001', display_name='Embedding Gecko', description='Obtain a distributed representation of a text.', input_token_limit=1024, output_token_limit=1, supported_generation_methods=['embedText'], temperature=None, top_p=None, top_k=None)]

In [28]:
palm_model = palm.get_model('models/text-bison-001')


In [29]:
# create a dataframe that we'll append each dict record to
palm_df = pd.DataFrame(columns=["question", "answer", "model_response", "correct?"])

dict_record = {}

for body_question_group in sample_problems:

    body = body_question_group["body"]

    for qa_pair in body_question_group["questions"]:
        question = qa_pair["question"]
        answer = qa_pair["answer"]

        # form the prompt contents
        prompt = body + "\n" + question + "\n\n\n"

        response = palm.generate_text(prompt=prompt)

        boolean_check = evaluate_response(prompt, response.result, answer)

        # Update the record dict with each result
        dict_record.update({
            "question": question,
            "answer": answer,
            "model_response": response.result,
            "correct?": boolean_check
        })

        # concat the record dict with the dataframe
        palm_df = pd.concat([palm_df, pd.DataFrame.from_dict(dict_record, orient="index").T])
        

In [30]:
palm_df

Unnamed: 0,question,answer,model_response,correct?
0,How many manufacturers collaborated on this pr...,3,3,True
0,How many corporations collaborated on this pro...,3,3,True
0,How many labels cooperated on this project?,3,3,True
0,How many countries collaborated on this project?,2,3,True
0,How many different items do they produce?,4,4,True
0,What is the target school grade range for the ...,7-12,7-12,True
0,What is the target age range for the children ...,12-18,7 to 12,False
0,Which companies produce the Tesla coils in que...,"Tesla, Volkswagen and Bosch","Tesla, Volkswagen and Bosch",True


## iii) Claude