In [1]:
from groq import Groq
import os
import sys
import anthropic
import ollama
import random
import pandas as pd
from tqdm import tqdm
from google.generativeai.types import RequestOptions
from google.api_core import retry
from typing import List, Tuple
import json
from openai import OpenAI
import datetime
import openai
import time

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from concurrent.futures import ThreadPoolExecutor, TimeoutError

# prompts

### Zero Shot - Vanilla CoT

In [2]:
zero_shot_vanilla_cot = """
Think through your answer step by step. Put the concise form of your final answer in curly brackets e.g. {A}, {True} or {3.0}.
"""

In [3]:
def get_prompt(prompt_type: str, few_shot_prompt: str, question: str) -> str:
    prompts = {
        "zero_shot_vanilla_cot": f"{question}\n{zero_shot_vanilla_cot}",
    }
    return prompts[prompt_type]

def save_results(save_path: str, ids: List[str], questions: List[str], answers: List[str], append: bool = False):
    df = pd.DataFrame({'id': ids, 'question': questions, 'answer': answers})
    if append and os.path.exists(save_path):
        df.to_csv(save_path, mode='a', index=False, header=False)
    else:
        df.to_csv(save_path, index=False)

def read_jsonl_file(filepath: str) -> List[dict]:
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)
    return data

def load_data_size_specific(data_path: str, sample_size: int = 0, random_seed: int = 0):
    random.seed(random_seed)

    if data_path.endswith('.jsonl'):
        data = read_jsonl_file(data_path)
    elif data_path.endswith('.json'):
        with open(data_path, 'r') as file:
            data = json.load(file)
    
    question_length = 0
    eligible_data = [x for x in data if len(x["question"]) >= question_length]
    
    if sample_size > 0 and sample_size < len(eligible_data):
        sampled_data = random.sample(eligible_data, sample_size)
    else:
        sampled_data = eligible_data
    
    ids = [x["id"] for x in sampled_data]
    questions = [x["question"] for x in sampled_data]
    
    return ids, questions

def load_already_answered_ids(save_path: str) -> set:
    if os.path.exists(save_path):
        df = pd.read_csv(save_path)
        answered_ids = set(df['id'].tolist())
        # print(f"Loaded {len(answered_ids)} already answered IDs from: {save_path}")
        print(f"Already answered IDs: {answered_ids}")
        return answered_ids
    else:
        print(f"No existing save file found at: {save_path}. Starting fresh.")
        return set()

def initialize_save_file(save_path: str):
    if not os.path.exists(save_path):
        # Create an empty DataFrame with headers and save
        df = pd.DataFrame(columns=['id', 'question', 'answer'])
        df.to_csv(save_path, index=False)
        print(f"Initialized new save file with headers at: {save_path}")

In [20]:
def query_4o(prompt: str) -> str:
    client = OpenAI()

    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "user",
                "content": f"{prompt}"
            }
        ],
        temperature=0
    )

    return completion.choices[0].message.content

def query_llama(prompt):
    client = openai.OpenAI(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
    )

    response = client.chat.completions.create(
        model='Meta-Llama-3.1-8B-Instruct',
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        temperature=0.6, # Meta default
        top_p = 0.9 # Meta default
    )
    time.sleep(2)  # Pause execution for 2 seconds
    return response.choices[0].message.content

def query_llama_70b(prompt):
    client = openai.OpenAI(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
    )

    response = client.chat.completions.create(
        model='Meta-Llama-3.1-70B-Instruct',
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        temperature=0.6, # Meta default
        top_p = 0.9 # Meta default
    )
    time.sleep(2)  # Pause execution for 2 seconds
    return response.choices[0].message.content

def query_llama_405b(prompt):
    client = openai.OpenAI(
        api_key=os.environ.get("SAMBANOVA_API_KEY"),
        base_url="https://api.sambanova.ai/v1",
    )

    response = client.chat.completions.create(
        model='Meta-Llama-3.1-405B-Instruct',
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        temperature=0.6, # Meta default
        top_p = 0.9 # Meta default
    )
    time.sleep(5)  # Pause execution for 2 seconds
    return response.choices[0].message.content

In [17]:
def query_llm(llm_model: str, ids: List[str], questions: List[str], few_shot_prompt: str, prompt_type: str, save_path: str, already_answered_ids: set) -> Tuple[List[str], List[str], List[str]]:
    answers = []
    ids_can_be_answered = []
    questions_can_be_answered = []
    
    for id, q in tqdm(zip(ids, questions), total=len(ids)):
        # print(q)
        # print(f"Processing ID: {id}")
        if id in already_answered_ids:
            print(f"Skipping: {id}", end=' ')
            continue
        if id == 1146: # weird ID that breaks llama
            continue
        
        prompt = get_prompt(prompt_type, few_shot_prompt, q)
        try:
            if llm_model == 'gemini':
                answer = query_gemini(prompt, id)
            elif llm_model == 'claude':
                answer = query_claude(prompt)
            elif llm_model == '4o':
                # answer = query_4o_multiturn(prompt)
                if prompt_type == 'multi_convo':
                    fact_prompt = get_prompt(prompt_type="fact_prompt", few_shot_prompt="", question=q)
                    
                    answer_prompt = get_prompt(prompt_type="answer_prompt_data", few_shot_prompt="", question=q)
                    answer = query_4o_multiconvo(fact_prompt=fact_prompt, answer_prompt=answer_prompt, extracted_question=q)
                else:
                    answer = query_4o(prompt)
                
            elif llm_model == 'llama3.18b':
                answer = query_llama(prompt)
            elif llm_model == 'llama3.170b':
                answer = query_llama_70b(prompt)
            elif llm_model == 'llama3.1405b':
                answer = query_llama_405b(prompt)
            else:
                raise ValueError(f"Unsupported LLM model: {llm_model}")
            # print(f"Answer for ID {id}: {answer}")
            
            answers.append(answer)
            questions_can_be_answered.append(q)
            ids_can_be_answered.append(id)

            # Save after each answer
            save_results(save_path, [id], [q], [answer], append=True)
        except Exception as e:
            print(f"Error processing question {id}: {str(e)}")
            continue
    
    return ids_can_be_answered, questions_can_be_answered, answers

# Driver

In [13]:
json_datasets = ['logical_deduction_seven_objects','reasoning_about_colored_objects']
jsonl_datasets = ['GSM8K', 'date', 'GSM_Plus', 'MultiArith', 'ASDiv', 'SVAMP', 'AQUA', 'p_GSM8K', 'StrategyQA', 'commonsenseQA','SPARTQA']
all_datasets = jsonl_datasets + json_datasets

In [21]:
def run_model(llm_model, prompt_type, few_shot_txt, sample_size, project_root):
    for dataset in all_datasets:
        print(f"------Processing dataset: {dataset}-------")
        if few_shot_txt:
            fewshot_prompt_path = os.path.join(project_root, "prompt", dataset, few_shot_txt)
            
        save_dir = os.path.join(project_root, 'logan/results/final/VanillaCoT', dataset, f'{llm_model}')
        os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists
        save_path = os.path.join(save_dir, f'{prompt_type}_{few_shot_txt}_{dataset}_{llm_model}.csv')

        if dataset in json_datasets:
            data_path = os.path.join(project_root, 'data', dataset, 'test.json')
        else:
            data_path = os.path.join(project_root, 'data', dataset, 'test.jsonl')


        ids, questions = load_data_size_specific(data_path, sample_size=sample_size)
        if few_shot_txt:
            with open(fewshot_prompt_path, 'r') as file:
                few_shot_prompt = file.read()
        else:
            few_shot_prompt = ""

        initialize_save_file(save_path)
        already_answered_ids = load_already_answered_ids(save_path)

        ids_answered, questions_answered, answers = query_llm(
            llm_model=llm_model,
            ids=ids,
            questions=questions,
            few_shot_prompt=few_shot_prompt,
            prompt_type=prompt_type,
            save_path=save_path,
            already_answered_ids=already_answered_ids
        )

        print(f"Processing complete for {dataset}. {len(ids_answered)} new answers saved to {save_path}.")

In [24]:
llm_model = 'llama3.1405b'
project_root = '/Users/log/Github/textual_grounding/'
prompt_type = 'zero_shot_vanilla_cot'
few_shot_txt = None
sample_size = 400

run_model(llm_model, prompt_type, few_shot_txt, sample_size, project_root)

------Processing dataset: GSM8K-------
Already answered IDs: {0, 2, 3, 4, 6, 8, 18, 25, 27, 28, 29, 30, 33, 36, 38, 42, 44, 46, 49, 54, 62, 64, 65, 67, 72, 74, 79, 80, 82, 83, 85, 86, 89, 92, 93, 95, 100, 101, 102, 111, 117, 118, 125, 127, 128, 130, 132, 141, 143, 147, 151, 153, 159, 161, 164, 165, 166, 167, 168, 170, 183, 186, 187, 191, 194, 196, 200, 202, 205, 206, 207, 222, 223, 225, 226, 235, 236, 237, 239, 240, 243, 245, 253, 254, 255, 266, 267, 271, 273, 276, 283, 285, 286, 291, 296, 299, 300, 306, 309, 314, 315, 316, 322, 324, 327, 331, 336, 339, 343, 345, 346, 349, 351, 358, 365, 367, 369, 373, 376, 377, 378, 382, 384, 385, 386, 387, 389, 390, 391, 393, 394, 398, 403, 404, 406, 408, 412, 416, 418, 421, 424, 425, 431, 433, 436, 437, 440, 441, 447, 448, 451, 452, 454, 456, 459, 482, 485, 488, 489, 490, 491, 492, 495, 497, 499, 500, 509, 513, 525, 530, 531, 532, 533, 535, 555, 561, 562, 563, 564, 569, 570, 573, 574, 575, 577, 578, 589, 594, 596, 597, 598, 599, 616, 617, 619, 621, 

100%|██████████| 400/400 [00:00<00:00, 650027.74it/s]




Skipping: 788 Skipping: 861 Skipping: 82 Skipping: 530 Skipping: 1047 Skipping: 995 Skipping: 829 Skipping: 621 Skipping: 976 Skipping: 733 Skipping: 1194 Skipping: 447 Skipping: 1033 Skipping: 285 Skipping: 577 Skipping: 286 Skipping: 194 Skipping: 1266 Skipping: 513 Skipping: 1090 Skipping: 1232 Skipping: 300 Skipping: 635 Skipping: 202 Skipping: 151 Skipping: 676 Skipping: 966 Skipping: 206 Skipping: 724 Skipping: 889 Skipping: 647 Skipping: 1251 Skipping: 418 Skipping: 1131 Skipping: 1310 Skipping: 906 Skipping: 1067 Skipping: 533 Skipping: 127 Skipping: 1123 Skipping: 28 Skipping: 191 Skipping: 816 Skipping: 2 Skipping: 1253 Skipping: 1010 Skipping: 682 Skipping: 499 Skipping: 666 Skipping: 128 Skipping: 391 Skipping: 1162 Skipping: 454 Skipping: 488 Skipping: 291 Skipping: 1112 Skipping: 917 Skipping: 186 Skipping: 164 Skipping: 655 Skipping: 1040 Skipping: 1002 Skipping: 223 Skipping: 617 Skipping: 1128 Skipping: 596 Skipping: 255 Skipping: 1121 Skipping: 681 Skipping: 1106 Skip

100%|██████████| 359/359 [00:00<00:00, 702638.89it/s]


Skipping: 0 Skipping: 1 Skipping: 2 Skipping: 3 Skipping: 4 Skipping: 5 Skipping: 6 Skipping: 7 Skipping: 8 Skipping: 9 Skipping: 10 Skipping: 11 Skipping: 12 Skipping: 13 Skipping: 14 Skipping: 15 Skipping: 16 Skipping: 17 Skipping: 18 Skipping: 19 Skipping: 20 Skipping: 21 Skipping: 22 Skipping: 23 Skipping: 24 Skipping: 25 Skipping: 26 Skipping: 27 Skipping: 28 Skipping: 29 Skipping: 30 Skipping: 31 Skipping: 32 Skipping: 33 Skipping: 34 Skipping: 35 Skipping: 36 Skipping: 37 Skipping: 38 Skipping: 39 Skipping: 40 Skipping: 41 Skipping: 42 Skipping: 43 Skipping: 44 Skipping: 45 Skipping: 46 Skipping: 47 Skipping: 48 Skipping: 49 Skipping: 50 Skipping: 51 Skipping: 52 Skipping: 53 Skipping: 54 Skipping: 55 Skipping: 56 Skipping: 57 Skipping: 58 Skipping: 59 Skipping: 60 Skipping: 61 Skipping: 62 Skipping: 63 Skipping: 64 Skipping: 65 Skipping: 66 Skipping: 67 Skipping: 68 Skipping: 69 Skipping: 70 Skipping: 71 Skipping: 72 Skipping: 73 Skipping: 74 Skipping: 75 Skipping: 76 Skipping:

100%|██████████| 400/400 [00:00<00:00, 746649.58it/s]


Skipping: 1578 Skipping: 1723 Skipping: 166 Skipping: 1061 Skipping: 2095 Skipping: 1991 Skipping: 1659 Skipping: 1243 Skipping: 1953 Skipping: 1467 Skipping: 2390 Skipping: 895 Skipping: 2068 Skipping: 571 Skipping: 1155 Skipping: 573 Skipping: 389 Skipping: 1027 Skipping: 2182 Skipping: 602 Skipping: 1271 Skipping: 405 Skipping: 303 Skipping: 1353 Skipping: 1934 Skipping: 2293 Skipping: 413 Skipping: 1450 Skipping: 1779 Skipping: 1296 Skipping: 838 Skipping: 2264 Skipping: 1954 Skipping: 1814 Skipping: 2136 Skipping: 1067 Skipping: 256 Skipping: 2248 Skipping: 58 Skipping: 383 Skipping: 1634 Skipping: 5 Skipping: 2022 Skipping: 1365 Skipping: 1000 Skipping: 1333 Skipping: 258 Skipping: 783 Skipping: 2325 Skipping: 909 Skipping: 978 Skipping: 584 Skipping: 2225 Skipping: 1835 Skipping: 374 Skipping: 330 Skipping: 1311 Skipping: 2081 Skipping: 2005 Skipping: 447 Skipping: 1235 Skipping: 2258 Skipping: 1193 Skipping: 512 Skipping: 2243 Skipping: 1363 Skipping: 2214 Skipping: 833 Skippin

100%|██████████| 400/400 [00:00<00:00, 740715.94it/s]


Skipping: 397 Skipping: 433 Skipping: 41 Skipping: 268 Skipping: 526 Skipping: 500 Skipping: 417 Skipping: 313 Skipping: 491 Skipping: 369 Skipping: 226 Skipping: 519 Skipping: 144 Skipping: 291 Skipping: 145 Skipping: 99 Skipping: 259 Skipping: 548 Skipping: 152 Skipping: 320 Skipping: 103 Skipping: 77 Skipping: 341 Skipping: 486 Skipping: 105 Skipping: 365 Skipping: 447 Skipping: 326 Skipping: 212 Skipping: 569 Skipping: 591 Skipping: 456 Skipping: 536 Skipping: 269 Skipping: 65 Skipping: 14 Skipping: 97 Skipping: 411 Skipping: 1 Skipping: 508 Skipping: 344 Skipping: 252 Skipping: 336 Skipping: 66 Skipping: 198 Skipping: 230 Skipping: 247 Skipping: 147 Skipping: 461 Skipping: 95 Skipping: 84 Skipping: 330 Skipping: 523 Skipping: 504 Skipping: 113 Skipping: 311 Skipping: 301 Skipping: 129 Skipping: 343 Skipping: 211 Skipping: 297 Skipping: 458 Skipping: 549 Skipping: 599 Skipping: 327 Skipping: 250 Skipping: 300 Skipping: 190 Skipping: 196 Skipping: 194 Skipping: 33 Skipping: 566 Skip

100%|██████████| 400/400 [00:00<00:00, 746317.44it/s]


Skipping: 1774 Skipping: 1919 Skipping: 171 Skipping: 1180 Skipping: 2198 Skipping: 1855 Skipping: 1377 Skipping: 2160 Skipping: 1623 Skipping: 959 Skipping: 2275 Skipping: 591 Skipping: 1274 Skipping: 593 Skipping: 408 Skipping: 1118 Skipping: 626 Skipping: 1407 Skipping: 424 Skipping: 322 Skipping: 1502 Skipping: 2141 Skipping: 432 Skipping: 1606 Skipping: 1975 Skipping: 1432 Skipping: 902 Skipping: 2161 Skipping: 2019 Skipping: 1186 Skipping: 275 Skipping: 58 Skipping: 402 Skipping: 1830 Skipping: 4 Skipping: 2229 Skipping: 1520 Skipping: 1082 Skipping: 1480 Skipping: 277 Skipping: 815 Skipping: 973 Skipping: 1042 Skipping: 606 Skipping: 2040 Skipping: 393 Skipping: 349 Skipping: 1457 Skipping: 2212 Skipping: 466 Skipping: 645 Skipping: 1248 Skipping: 621 Skipping: 1604 Skipping: 2274 Skipping: 1241 Skipping: 714 Skipping: 1865 Skipping: 2096 Skipping: 1226 Skipping: 436 Skipping: 2181 Skipping: 1834 Skipping: 1370 Skipping: 1240 Skipping: 1333 Skipping: 614 Skipping: 976 Skipping: 

100%|██████████| 400/400 [00:00<00:00, 739410.14it/s]


Skipping: 864 Skipping: 394 Skipping: 776 Skipping: 911 Skipping: 430 Skipping: 41 Skipping: 265 Skipping: 988 Skipping: 523 Skipping: 497 Skipping: 414 Skipping: 940 Skipping: 802 Skipping: 849 Skipping: 310 Skipping: 488 Skipping: 366 Skipping: 597 Skipping: 913 Skipping: 929 Skipping: 223 Skipping: 516 Skipping: 142 Skipping: 288 Skipping: 143 Skipping: 773 Skipping: 97 Skipping: 633 Skipping: 818 Skipping: 256 Skipping: 931 Skipping: 545 Skipping: 722 Skipping: 829 Skipping: 616 Skipping: 923 Skipping: 150 Skipping: 317 Skipping: 101 Skipping: 747 Skipping: 75 Skipping: 920 Skipping: 870 Skipping: 700 Skipping: 338 Skipping: 483 Skipping: 573 Skipping: 103 Skipping: 362 Skipping: 444 Skipping: 323 Skipping: 625 Skipping: 655 Skipping: 934 Skipping: 209 Skipping: 565 Skipping: 984 Skipping: 453 Skipping: 886 Skipping: 533 Skipping: 266 Skipping: 63 Skipping: 824 Skipping: 561 Skipping: 14 Skipping: 95 Skipping: 736 Skipping: 860 Skipping: 408 Skipping: 727 Skipping: 844 Skipping: 80

100%|██████████| 254/254 [00:00<00:00, 241139.25it/s]


Skipping: 0 Skipping: 1 Skipping: 2 Skipping: 3 Skipping: 4 Skipping: 5 Skipping: 6 Skipping: 7 Skipping: 8 Skipping: 9 Skipping: 10 Skipping: 11 Skipping: 12 Skipping: 13 Skipping: 14 Skipping: 15 Skipping: 16 Skipping: 17 Skipping: 18 Skipping: 19 Skipping: 20 Skipping: 21 Skipping: 22 Skipping: 23 Skipping: 24 Skipping: 25 Skipping: 26 Skipping: 27 Skipping: 28 Skipping: 29 Skipping: 30 Skipping: 31 Skipping: 32 Skipping: 33 Skipping: 34 Skipping: 35 Skipping: 36 Skipping: 37 Skipping: 38 Skipping: 39 Skipping: 40 Skipping: 41 Skipping: 42 Skipping: 43 Skipping: 44 Skipping: 45 Skipping: 46 Skipping: 47 Skipping: 48 Skipping: 49 Skipping: 50 Skipping: 51 Skipping: 52 Skipping: 53 Skipping: 54 Skipping: 55 Skipping: 56 Skipping: 57 Skipping: 58 Skipping: 59 Skipping: 60 Skipping: 61 Skipping: 62 Skipping: 63 Skipping: 64 Skipping: 65 Skipping: 66 Skipping: 67 Skipping: 68 Skipping: 69 Skipping: 70 Skipping: 71 Skipping: 72 Skipping: 73 Skipping: 74 Skipping: 75 Skipping: 76 Skipping:

100%|██████████| 220/220 [00:00<00:00, 935848.76it/s]


Skipping: 0 Skipping: 1 Skipping: 2 Skipping: 3 Skipping: 4 Skipping: 5 Skipping: 6 Skipping: 7 Skipping: 8 Skipping: 9 Skipping: 10 Skipping: 11 Skipping: 12 Skipping: 13 Skipping: 14 Skipping: 15 Skipping: 16 Skipping: 17 Skipping: 18 Skipping: 19 Skipping: 20 Skipping: 21 Skipping: 22 Skipping: 23 Skipping: 24 Skipping: 25 Skipping: 26 Skipping: 27 Skipping: 28 Skipping: 29 Skipping: 30 Skipping: 31 Skipping: 32 Skipping: 33 Skipping: 34 Skipping: 35 Skipping: 36 Skipping: 37 Skipping: 38 Skipping: 39 Skipping: 40 Skipping: 41 Skipping: 42 Skipping: 43 Skipping: 44 Skipping: 45 Skipping: 46 Skipping: 47 Skipping: 48 Skipping: 49 Skipping: 50 Skipping: 51 Skipping: 52 Skipping: 53 Skipping: 54 Skipping: 55 Skipping: 56 Skipping: 57 Skipping: 58 Skipping: 59 Skipping: 60 Skipping: 61 Skipping: 62 Skipping: 63 Skipping: 64 Skipping: 65 Skipping: 66 Skipping: 67 Skipping: 68 Skipping: 69 Skipping: 70 Skipping: 71 Skipping: 72 Skipping: 73 Skipping: 74 Skipping: 75 Skipping: 76 Skipping:

100%|██████████| 400/400 [00:00<00:00, 975986.97it/s]


Skipping: 1577 Skipping: 1722 Skipping: 165 Skipping: 1060 Skipping: 2094 Skipping: 1990 Skipping: 1658 Skipping: 1242 Skipping: 1952 Skipping: 1466 Skipping: 894 Skipping: 2067 Skipping: 570 Skipping: 1154 Skipping: 572 Skipping: 388 Skipping: 1026 Skipping: 2181 Skipping: 601 Skipping: 1270 Skipping: 404 Skipping: 302 Skipping: 1352 Skipping: 1933 Skipping: 412 Skipping: 1449 Skipping: 1778 Skipping: 1295 Skipping: 837 Skipping: 1953 Skipping: 1813 Skipping: 2135 Skipping: 1066 Skipping: 255 Skipping: 2247 Skipping: 57 Skipping: 382 Skipping: 1633 Skipping: 4 Skipping: 2021 Skipping: 1364 Skipping: 999 Skipping: 1332 Skipping: 257 Skipping: 782 Skipping: 908 Skipping: 977 Skipping: 583 Skipping: 2224 Skipping: 1834 Skipping: 373 Skipping: 329 Skipping: 1310 Skipping: 2080 Skipping: 2004 Skipping: 446 Skipping: 1234 Skipping: 1192 Skipping: 511 Skipping: 1362 Skipping: 2213 Skipping: 832 Skipping: 1178 Skipping: 1822 Skipping: 375 Skipping: 1576 Skipping: 1298 Skipping: 991 Skipping: 

100%|██████████| 400/400 [00:00<00:00, 750658.43it/s]


Skipping: 7b95825a19d6930d6aed35c7c57a2d82 Skipping: e56c56c3cfe50ba0c787c2bd67255be8 Skipping: 9aff72f0c480c2b4edde45bd2e7e4870 Skipping: 0854478d174c9127064f0d4b58df7e62 Skipping: 790b3f583e9bc9424c771691ecc70c20 Skipping: d33a81660058e570a18fb2eafa284a78 Skipping: 37d88a9bb24913c1973cc26d4ce3394f Skipping: 9382bc51ba092f55a494eff8615899de Skipping: 2b9b625c788584b8d41f1a74d740e126 Skipping: 32d5b7fcae24f0d4871cfb219c5a4b47 Skipping: 1ddd239a2a6438a891cb411b82e7f450 Skipping: f13efb91090dd28fd2b3c1f4dde680fd Skipping: 916bbd27545446ca5d83d07c10d013ea Skipping: 93b52e7ea1acf10db891e9355e234123 Skipping: 7044d82a456d0fa6f0210abb03cbf2c4 Skipping: dbdad44029098d4b1d202d6d857d6092 Skipping: f20d40bc4af588223e880e0bb58b27b8 Skipping: a1303b5177df0a5b653c9abd7d5f5e08 Skipping: 163898952cb6baf3a6440696e1352e86 Skipping: 8fd82cdc253835814153fe7222e9967c Skipping: 6c4b2c93a4bdafb6cbf2b2ef2439b06f Skipping: e61891746aa94ab57aaa754614034aef Skipping: 0e80f2afe5c4f652e8720b52d7c06c87 Skipping: 3

100%|██████████| 400/400 [00:00<00:00, 946261.48it/s]


Skipping: 3458 Skipping: 1577 Skipping: 3104 Skipping: 1722 Skipping: 165 Skipping: 1060 Skipping: 2094 Skipping: 1990 Skipping: 1658 Skipping: 3210 Skipping: 3399 Skipping: 1242 Skipping: 1952 Skipping: 1466 Skipping: 2389 Skipping: 894 Skipping: 2067 Skipping: 570 Skipping: 1154 Skipping: 572 Skipping: 3095 Skipping: 388 Skipping: 2532 Skipping: 3274 Skipping: 1026 Skipping: 2181 Skipping: 2888 Skipping: 3318 Skipping: 2465 Skipping: 601 Skipping: 1270 Skipping: 404 Skipping: 2989 Skipping: 302 Skipping: 3483 Skipping: 2801 Skipping: 1352 Skipping: 1933 Skipping: 2292 Skipping: 412 Skipping: 1449 Skipping: 1778 Skipping: 1295 Skipping: 2502 Skipping: 2623 Skipping: 837 Skipping: 2263 Skipping: 1953 Skipping: 1813 Skipping: 3544 Skipping: 2135 Skipping: 1066 Skipping: 255 Skipping: 3297 Skipping: 2247 Skipping: 57 Skipping: 382 Skipping: 2947 Skipping: 3441 Skipping: 1633 Skipping: 2909 Skipping: 3378 Skipping: 3215 Skipping: 2736 Skipping: 2561 Skipping: 4 Skipping: 2506 Skipping: 20

100%|██████████| 250/250 [00:00<00:00, 972705.01it/s]


Skipping: 0 Skipping: 1 Skipping: 2 Skipping: 3 Skipping: 4 Skipping: 5 Skipping: 6 Skipping: 7 Skipping: 8 Skipping: 9 Skipping: 10 Skipping: 11 Skipping: 12 Skipping: 13 Skipping: 14 Skipping: 15 Skipping: 16 Skipping: 17 Skipping: 18 Skipping: 19 Skipping: 20 Skipping: 21 Skipping: 22 Skipping: 23 Skipping: 24 Skipping: 25 Skipping: 26 Skipping: 27 Skipping: 28 Skipping: 29 Skipping: 30 Skipping: 31 Skipping: 32 Skipping: 33 Skipping: 34 Skipping: 35 Skipping: 36 Skipping: 37 Skipping: 38 Skipping: 39 Skipping: 40 Skipping: 41 Skipping: 42 Skipping: 43 Skipping: 44 Skipping: 45 Skipping: 46 Skipping: 47 Skipping: 48 Skipping: 49 Skipping: 50 Skipping: 51 Skipping: 52 Skipping: 53 Skipping: 54 Skipping: 55 Skipping: 56 Skipping: 57 Skipping: 58 Skipping: 59 Skipping: 60 Skipping: 61 Skipping: 62 Skipping: 63 Skipping: 64 Skipping: 65 Skipping: 66 Skipping: 67 Skipping: 68 Skipping: 69 Skipping: 70 Skipping: 71 Skipping: 72 Skipping: 73 Skipping: 74 Skipping: 75 Skipping: 76 Skipping:

100%|██████████| 250/250 [00:00<00:00, 648069.22it/s]

Skipping: 0 Skipping: 1 Skipping: 2 Skipping: 3 Skipping: 4 Skipping: 5 Skipping: 6 Skipping: 7 Skipping: 8 Skipping: 9 Skipping: 10 Skipping: 11 Skipping: 12 Skipping: 13 Skipping: 14 Skipping: 15 Skipping: 16 Skipping: 17 Skipping: 18 Skipping: 19 Skipping: 20 Skipping: 21 Skipping: 22 Skipping: 23 Skipping: 24 Skipping: 25 Skipping: 26 Skipping: 27 Skipping: 28 Skipping: 29 Skipping: 30 Skipping: 31 Skipping: 32 Skipping: 33 Skipping: 34 Skipping: 35 Skipping: 36 Skipping: 37 Skipping: 38 Skipping: 39 Skipping: 40 Skipping: 41 Skipping: 42 Skipping: 43 Skipping: 44 Skipping: 45 Skipping: 46 Skipping: 47 Skipping: 48 Skipping: 49 Skipping: 50 Skipping: 51 Skipping: 52 Skipping: 53 Skipping: 54 Skipping: 55 Skipping: 56 Skipping: 57 Skipping: 58 Skipping: 59 Skipping: 60 Skipping: 61 Skipping: 62 Skipping: 63 Skipping: 64 Skipping: 65 Skipping: 66 Skipping: 67 Skipping: 68 Skipping: 69 Skipping: 70 Skipping: 71 Skipping: 72 Skipping: 73 Skipping: 74 Skipping: 75 Skipping: 76 Skipping:




## Batch

In [31]:
import os
import json
from pathlib import Path
from agents.batch_api_agents import prepare_batch_input, batch_api_agent
from openai import OpenAI
import importlib
import agents.batch_api_agents as batch_agents

# Reload the module to ensure the latest changes are loaded
importlib.reload(batch_agents)

llm_model = 'claude-3-5-sonnet-20240620'
project_root = '/Users/log/Github/textual_grounding/'
prompt_type = 'zero_shot_vanilla_cot'
few_shot_txt = None
sample_size = 2
json_datasets = ['logical_deduction_seven_objects','reasoning_about_colored_objects']
jsonl_datasets = ['GSM_Plus', 'MultiArith', 'SVAMP', 'p_GSM8K', 'StrategyQA', 'commonsenseQA','SPARTQA']
all_datasets = jsonl_datasets + json_datasets
all_datasets = ['MultiArith']
for dataset in all_datasets:
    if dataset in json_datasets:
        data_path = os.path.join(project_root, 'data', dataset, 'test.json')
    else:
        data_path = os.path.join(project_root, 'data', dataset, 'test.jsonl')
        
    save_dir = os.path.join(project_root, 'logan/results/final/VanillaCoT', dataset, f'{llm_model}')
    batch_dir = os.path.join(project_root, 'logan/batch_files/VanillaCoT', dataset, f'{llm_model}')
    os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists
    os.makedirs(batch_dir, exist_ok=True)  # Ensure the directory exists

    batch_results_path = os.path.join(save_dir, f'{prompt_type}_{few_shot_txt}_{dataset}_{llm_model}.jsonl')   
    batch_output_file = os.path.join(batch_dir, f'{prompt_type}_{few_shot_txt}_{dataset}_{llm_model}.jsonl')   

    ids, questions = load_data_size_specific(data_path, sample_size=sample_size)
    if few_shot_txt:
        with open(fewshot_prompt_path, 'r') as file:
            few_shot_prompt = file.read()
    else:
        few_shot_prompt = ""
    prompts = []
    for question in questions:
        prompt = get_prompt(prompt_type, few_shot_prompt, question)
        # print(prompt)
        prompts.append(prompt)

    # tasks = batch_agents.prepare_batch_input(
    #     llm_model=llm_model,
    #     ids=ids,
    #     prompts=prompts,
    #     batch_output_file=batch_output_file
    # )

    #Execute the batch processing with GPT-4 and save results
    batch_agents.batch_api_agent(
        llm_model=llm_model,
        ids=ids,
        prompts=prompts,
        batch_output_file=batch_output_file,
        batch_results_file=batch_results_path
    )


2024-11-10 17:46:09,175 - INFO - Batch input file created at: /Users/log/Github/textual_grounding/logan/batch_files/VanillaCoT/MultiArith/claude-3-5-sonnet-20240620/zero_shot_vanilla_cot_None_MultiArith_claude-3-5-sonnet-20240620.jsonl


[{'custom_id': '397', 'params': {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 1024, 'messages': [{'role': 'user', 'content': "Debby's class is going on a field trip to the zoo. If each van can hold 9 people and there are 40 students and 14 adults going, how many vans will they need?\n\nThink through your answer step by step. Put the concise form of your final answer in curly brackets e.g. {A}, {True} or {3.0}.\n"}]}}, {'custom_id': '433', 'params': {'model': 'claude-3-5-sonnet-20240620', 'max_tokens': 1024, 'messages': [{'role': 'user', 'content': 'John had 5 action figures, but needed 7 total for a complete collection. If each one costs $5, how much money would he need to finish his collection?\n\nThink through your answer step by step. Put the concise form of your final answer in curly brackets e.g. {A}, {True} or {3.0}.\n'}]}}]



2024-11-10 17:46:09,633 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages/batches?beta=true "HTTP/1.1 200 OK"
2024-11-10 17:46:09,637 - INFO - Created batch with ID: msgbatch_014xdLrN6YFLTrqHmYTxghjf
2024-11-10 17:46:09,767 - INFO - HTTP Request: GET https://api.anthropic.com/v1/messages/batches/msgbatch_014xdLrN6YFLTrqHmYTxghjf?beta=true "HTTP/1.1 200 OK"


KeyboardInterrupt: 