In [49]:
from groq import Groq
import os
import sys
import anthropic
import ollama
import random
import pandas as pd
from tqdm import tqdm
from google.generativeai.types import RequestOptions
from google.api_core import retry
from typing import List, Tuple
import json
from openai import OpenAI
import datetime
import openai
import time
import re
from tqdm import tqdm

In [50]:
# Endpoint and sampling settings shared by every SambaNova-hosted Llama model.
SAMBANOVA_BASE_URL = "https://api.sambanova.ai/v1"
SAMBANOVA_TEMPERATURE = 0.1
SAMBANOVA_TOP_P = 0.9  # Meta default


def _query_sambanova(model, prompt, pause_seconds):
    """Send a single-turn chat prompt to a SambaNova-hosted model.

    Args:
        model: Model identifier passed to the chat completions endpoint.
        prompt: Text sent as the only user message.
        pause_seconds: Seconds to sleep after the request completes, used to
            space out consecutive calls.

    Returns:
        The text content of the first completion choice.

    Raises:
        RuntimeError: If the SAMBANOVA_API_KEY environment variable is unset
            or empty.
    """
    api_key = os.environ.get("SAMBANOVA_API_KEY")
    if not api_key:
        # Passing api_key=None makes the OpenAI client fall back to
        # OPENAI_API_KEY, which would send the wrong credential to SambaNova.
        raise RuntimeError(
            "SAMBANOVA_API_KEY is not set; export it before querying SambaNova models."
        )

    client = openai.OpenAI(api_key=api_key, base_url=SAMBANOVA_BASE_URL)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        temperature=SAMBANOVA_TEMPERATURE,
        top_p=SAMBANOVA_TOP_P,
    )
    time.sleep(pause_seconds)  # Space out requests before the caller issues the next one
    return response.choices[0].message.content


def query_llama_70b(prompt):
    """Query Meta-Llama-3.1-70B-Instruct with `prompt`, then pause 8 seconds.

    Returns the text of the model's reply. Raises RuntimeError when
    SAMBANOVA_API_KEY is not set.
    """
    return _query_sambanova('Meta-Llama-3.1-70B-Instruct', prompt, pause_seconds=8)


def query_llama_405b(prompt):
    """Query Meta-Llama-3.1-405B-Instruct with `prompt`, then pause 15 seconds.

    Returns the text of the model's reply. Raises RuntimeError when
    SAMBANOVA_API_KEY is not set.
    """
    return _query_sambanova('Meta-Llama-3.1-405B-Instruct', prompt, pause_seconds=15)


def query_llama_8b(prompt):
    """Query Meta-Llama-3.1-8B-Instruct with `prompt`, then pause 4 seconds.

    Returns the text of the model's reply. Raises RuntimeError when
    SAMBANOVA_API_KEY is not set.
    """
    return _query_sambanova('Meta-Llama-3.1-8B-Instruct', prompt, pause_seconds=4)

In [51]:
# Few-shot instruction prompt for the LLM judge. It contains three worked
# examples and ends with "Judge if the following solution is correct or
# incorrect:"; the judging loop appends the candidate answer and a
# "Ground Truth: ..." line after it.
#
# This is a regular (non-raw) string: every `\\` below reaches the model as a
# single backslash, and the `\n` sequences inside Example 3 become real newlines.
#
# NOTE(review): the examples are not formatted uniformly -- Example 2 uses
# "Example 2:" instead of a "## Example 2" header and omits the
# "Formatted Question:" label, and Example 3 has no question at all. Confirm
# this is intentional before reusing the prompt.
judge_prompt = """
You are an expert mathematical evaluator tasked with determining the correctness of solutions to mathematical problems. Your role is to compare a proposed solution to the expected solution (referred to as "Ground Truth") and decide whether the Answer is CORRECT or INCORRECT.

An answer is considered correct if the final result is equivalent to the Ground Truth. If the final result is not simplified, but mathematically equivalent to the Ground Truth, it should still be considered correct. For example, 2/4 is mathematically equivalent to 1/2. Output only the words "{CORRECT}" or "{INCORRECT}" in your response. Do not attempt to solve the problem.

## Example 1
Formatted Question: If $<fact1>2^8</fact1>=<fact2>4^x</fact2>$, what is the value of $<fact3>x</fact3>$?

Answer: We can rewrite $<fact2>4^x</fact2>$ as $(<fact4>2^2</fact4>)^x=<fact5>2^{2x}</fact5>$. Thus, we have $<fact1>2^8</fact1>=<fact5>2^{2x}</fact5>$. Since the bases are the same, the exponents must be equal, so $<fact6>8=2x</fact6>$. Solving for $<fact3>x</fact3>$, we find $<fact3>x</fact3>=<fact6>8/2</fact6>=<fact7>4</fact7>$

Ground Truth: 4

Judge Answer: {CORRECT}

Example 2:

What is the sum of all values of $y$ for which the expression $\\frac{y+6}{y^2-5y+4}$ is undefined?
Answer: The given expression is undefined when the denominator is zero. Thus, we want to find the sum of the zeros $y$ to the quadratic $y^2-5y+4$. Since for a quadratic with the equation $ax^2+bx+c=0$, the sum of the solutions is $-b/a$, the sum of the zeros of the quadratic $y^2-5y+4$ is $5/1=\\boxed{5}$.

Ground Truth: 5

Judge Answer: {CORRECT}

## Example 3

Answer: More generally, suppose $(a_i),$ $(b_i),$ $(c_i)$ represent the entries in rows $n - 1,$ $n,$ $n + 1$ of Pascal's triangle.  Then\n\\[a_i = \\binom{n - 1}{i}, \\ b_i = \\binom{n}{i}, \\ c_i = \\binom{n + 1}{i},\\]so\n\\begin{align*}\n\\frac{a_i}{b_i} &= \\frac{\\binom{n - 1}{i}}{\\binom{n}{i}} \\\\\n&= \\frac{\\frac{(n - 1)!}{i! (n - i - 1)!}}{\\frac{n!}{i! (n - i)!}} \\\\\n&= \\frac{(n - 1)! (n - i)!}{n! (n - i - 1)!} \\\\\n&= \\frac{n - i}{n} \\\\\n&= 1 - \\frac{i}{n}.\n\\end{align*}Hence,\n\\begin{align*}\n\\sum_{i = 0}^{n - 1} \\frac{a_i}{b_i} &= \\sum_{i = 0}^{n - 1} \\left( 1 - \\frac{i}{n} \\right) \\\\\n&= n - \\frac{(n - 1)n/2}{n} \\\\\n&= n - \\frac{n - 1}{2} = \\frac{n + 1}{2}.\n\\end{align*}Likewise,\n\\[\\frac{b_i}{c_i} = 1 - \\frac{i}{n + 1},\\]and\n\\[\\sum_{i = 0}^n \\frac{b_i}{c_i} = \\frac{n + 2}{2}.\\]Hence,\n\\[\\sum_{i = 0}^n \\frac{b_i}{c_i} - \\sum_{i = 0}^{n - 1} \\frac{a_i}{b_i} = \\frac{n + 2}{2} - \\frac{n + 1}{2} = \\boxed{\\frac{1}{2}}.\\]

Ground Truth: \\frac{1}{3}

Judge Answer: {INCORRECT}

--------------------------------

Judge if the following solution is correct or incorrect:
"""

In [52]:
def save_results(save_path: str, ids: List[str], questions: List[str], responses: List[str], gts, answers, append: bool = False):
    """Write judged rows to a CSV file, optionally appending to an existing one.

    Args:
        save_path: Destination CSV path.
        ids: Values for the 'id' column.
        questions: Values for the 'question' column.
        responses: Values for the 'response' column.
        gts: Values for the 'gt' column.
        answers: Values for the 'answer' column.
        append: When True and `save_path` already exists, rows are appended
            without a header; otherwise the file is (re)written with a header.
    """
    frame = pd.DataFrame(
        {
            'id': ids,
            'question': questions,
            'response': responses,
            'gt': gts,
            'answer': answers,
        }
    )

    extend_existing = append and os.path.exists(save_path)
    if not extend_existing:
        # Fresh file (or overwrite): include the header row.
        frame.to_csv(save_path, index=False)
        return

    # File already has a header, so only the data rows are added.
    frame.to_csv(save_path, mode='a', index=False, header=False)

In [61]:
# Judge each model's saved answers against the MATH ground truth using the
# 405B model, writing one CSV row per judged answer as soon as it is available.
gt_path = '/Users/log/Github/textual_grounding/data/MATH/test.jsonl'
with open(gt_path, 'r') as f:
    # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
    gts = [json.loads(line) for line in f if line.strip()]

# Index ground-truth records by id once instead of scanning the whole list for
# every answer. setdefault keeps the first record for a repeated id, matching
# the previous "first match" lookup.
gt_by_id = {}
for gt_record in gts:
    gt_by_id.setdefault(gt_record['id'], gt_record)

models = ['70', '405']

for model in models:
    answers = pd.read_csv(f'/Users/log/Github/textual_grounding/logan/results/final/GCoT/MATH/llama3.1{model}b/gcot_random_gcot_examples.txt_MATH_llama3.1{model}b.csv')
    # In-memory copies of this model's judged rows; the CSV is the durable record.
    ids, questions, responses = [], [], []
    # NOTE(review): the answers above are read from the GCoT results, but the
    # judgements are written under fewshot_CoT with a "cot" file name -- confirm
    # which method this run is meant to judge and align the two paths.
    output_path = f'/Users/log/Github/textual_grounding/logan/results/final/fewshot_CoT/MATH/llama3.1{model}b/405bjudge_cot_random_cot_examples.txt_MATH_llama3.1{model}b.csv'
    for _, row in tqdm(answers.iterrows(), total=answers.shape[0], desc="Processing answers"):
        answer = row['answer']
        question = row['question']
        if row['id'] not in gt_by_id:
            # Fail with the offending id rather than an anonymous IndexError.
            raise KeyError(f"No ground truth found for id {row['id']!r} in {gt_path}")
        gt_row = gt_by_id[row['id']]
        gt = "Ground Truth: " + gt_row["answer"]
        # The judge sees the few-shot prompt, then the candidate answer, then the ground truth.
        prompt = f"{judge_prompt}\n{answer}\n\n{gt}"
        response = query_llama_405b(prompt)

        ids.append(row['id'])
        questions.append(question)
        responses.append(response)

        # Append after every judgement so an interrupted run keeps its progress.
        save_results(output_path, [row['id']], [question], [response], [gt], [answer], append=True)


Processing answers: 100%|██████████| 200/200 [58:45<00:00, 17.63s/it]
Processing answers: 100%|██████████| 200/200 [57:30<00:00, 17.25s/it] 


In [68]:
# Tally the judge's verdicts for one results file and report accuracy.
csv_path = '/Users/log/Github/textual_grounding/logan/results/final/GCoT/MATH/llama3.170b/405bjudge_gcot_random_gcot_examples.txt_MATH_llama3.170b.csv'
df = pd.read_csv(csv_path)
correct_count = 0
incorrect_count = 0

for _, row in df.iterrows():
    response = row['response']
    # An empty CSV cell is read back as NaN (a float); the `in` test would raise
    # TypeError on it, so treat any non-string as an unparseable verdict.
    verdict_text = response if isinstance(response, str) else ""
    # "CORRECT" is a substring of "INCORRECT", so the negative verdict is checked first.
    if "INCORRECT" in verdict_text:
        incorrect_count += 1
    elif "CORRECT" in verdict_text:
        correct_count += 1
    else:
        print("ERROR: ", response)

total = df.shape[0]
print("Correct: ", correct_count)
print("Incorrect: ", incorrect_count)
print("Total: ", total)
# Unparseable responses stay in the denominator; an empty file reports 0.0
# instead of raising ZeroDivisionError.
accuracy = round((correct_count / total) * 100, 2) if total else 0.0
print("Accuracy: ", accuracy)

Correct:  114
Incorrect:  86
Total:  200
Accuracy:  57.0


In [None]:
# Load the few-shot CoT result files for the 70B and 8B models and print their
# sorted question ids so the two lists can be compared by eye.
csv_path = '/Users/log/Github/textual_grounding/logan/results/final/fewshot_CoT/MATH/llama3.170b/cot_random_cot_examples.txt_MATH_llama3.170b.csv'
df1 = pd.read_csv(csv_path)

csv_path = '/Users/log/Github/textual_grounding/logan/results/final/fewshot_CoT/MATH/llama3.18b/cot_random_cot_examples.txt_MATH_llama3.18b.csv'
df2 = pd.read_csv(csv_path)

# Pull the id columns out as plain lists, 70B first, then 8B.
ids1, ids2 = df1['id'].tolist(), df2['id'].tolist()

for id_list in (ids1, ids2):
    print(sorted(id_list))

['test/algebra/1004.json', 'test/algebra/101.json', 'test/algebra/1072.json', 'test/algebra/1082.json', 'test/algebra/1098.json', 'test/algebra/1199.json', 'test/algebra/1214.json', 'test/algebra/1303.json', 'test/algebra/1332.json', 'test/algebra/1338.json', 'test/algebra/1349.json', 'test/algebra/1547.json', 'test/algebra/1553.json', 'test/algebra/1578.json', 'test/algebra/1787.json', 'test/algebra/187.json', 'test/algebra/1934.json', 'test/algebra/2023.json', 'test/algebra/2046.json', 'test/algebra/2058.json', 'test/algebra/2102.json', 'test/algebra/2176.json', 'test/algebra/2214.json', 'test/algebra/2257.json', 'test/algebra/2264.json', 'test/algebra/2277.json', 'test/algebra/2427.json', 'test/algebra/246.json', 'test/algebra/2486.json', 'test/algebra/2592.json', 'test/algebra/2593.json', 'test/algebra/2626.json', 'test/algebra/2680.json', 'test/algebra/2700.json', 'test/algebra/2743.json', 'test/algebra/2779.json', 'test/algebra/291.json', 'test/algebra/297.json', 'test/algebra/30