In [1]:
import sys
sys.path.append('tools')
import os
import re
from collections import Counter
import json

from tqdm import tqdm
from sympy import sympify, Eq, simplify
from latex2sympy2 import latex2sympy

import parse_data



In [2]:
import os

cwd = os.getcwd()
res_dir = os.path.join(cwd, 'result')
files = parse_data.find_files_with_suffix(res_dir, '.jsonl')

papers = ['2024-1', '2024-2']

models = os.listdir(res_dir)
models.remove(".DS_Store")

# Check Response Number

In [25]:
for model in models:
    model_res_dir = os.path.join(res_dir, model)
    model_files = parse_data.find_files_with_suffix(model_res_dir, '.jsonl')

    if len(model_files) != 28:
        print(model)

    for fpath in model_files:

        content = parse_data.read_jsonl(fpath)

        if len(content) != 80:
            print(fpath)

baichuan4


# Replace `list` with `dict` in `infer_time(s)` and `token_usage`

At first, I tended to run the model for multiple turns to check its self-correction capability. Thus, I employed the `list` instead of the `dict` in the `jsonl` file.

However, multiple-turn running is expensive.

In [9]:
def modify_data_type(data):
    """
    Modifies 'token_usage' and 'infer_times(s)' values within a list of dictionaries.
    If they are lists, extract the first element from each list and reassign them.

    Args:
    data (list of dict): The list of dictionaries containing 'token_usage' and 'infer_times(s)' keys.

    Returns:
    None: The function modifies the list in place.
    """
    for item in data:
        if 'token_usage' in item and isinstance(item['token_usage'], list):
            # Extract the first element from the list and reassign it to 'token_usage'
            item['token_usage'] = item['token_usage'][0]
        
        if 'infer_time(s)' in item and isinstance(item['infer_time(s)'], list):
            # Extract the first element from the list and reassign it to 'infer_times'
            item['infer_time(s)'] = item['infer_time(s)'][0]
        
        if 'infer_time' in item and isinstance(item['infer_time'], list):
            # Extract the first element from the list and reassign it to 'infer_time'
            item['infer_time(s)'] = item['infer_time(s)'][0]

for fpath in tqdm(files):
    content = parse_data.read_jsonl(fpath)
    modify_data_type(content)
    flag = True
    for i in content:
        if isinstance(i['token_usage'], list): 
            print("TypeError", i['llm_parameter']['model_name'], i['llm_parameter']['exp_uuid'])
            flag = False
        if isinstance(i['infer_time(s)'], list): 
            print("TypeError", i['llm_parameter']['model_name'], i['llm_parameter']['exp_uuid'])
            flag = False
    if flag:
        parse_data.write_jsonl(content, fpath)

100%|██████████| 447/447 [00:01<00:00, 251.50it/s]


# Unify the terminology in `token_usage`

Models come from different servers, and they have different terms to express input tokens, output tokens, and total tokens. To benefit future analysis, I need to unify the terms in `token_usage`.

Note:

+ Since I didn't back up the files in GitHub after retrieving the results and manipulated the raw files, I lost some `token_usage` data in `yi-large` and `abab6.5s-chat`.
+ Minimax(abab6.5s-chat) only returns `total_tokens` and doesn't provide a token-count api

In [10]:
# get the terms
usage = set()
for fpath in files:
    i = parse_data.read_jsonl(fpath)
    for j in i:
        if isinstance(j['token_usage'], list):
            for k in j['token_usage'][0].keys():
                usage.add(k)
        else:
            for k in j['token_usage'].keys():
                usage.add(k)

usage

# transform dict
token_term_tras = {
    "input_tokens": "input_tokens",
    "prompt_tokens": "input_tokens",
    "prompt_token_count": "input_tokens",
    "completion_tokens": "output_tokens",
    "output_tokens": "output_tokens",
    "candidates_token_count": "output_tokens",
    "total_token_count": "total_tokens",
    "total_tokens": "total_tokens"
}

{'completion_tokens',
 'input_tokens',
 'output_tokens',
 'prompt_tokens',
 'total_tokens'}

In [28]:
# add total tokens in claude-3-5-sonnet-20240620
for fpath in files:
    if "claude-3-5-sonnet-20240620" not in fpath:
        continue
    content = parse_data.read_jsonl(fpath)
    for i in content:
        total_tokens = i['token_usage']['input_tokens'] + i['token_usage']['output_tokens']
        i['token_usage']['total_tokens'] = total_tokens
    parse_data.write_jsonl(content, fpath)

In [11]:
def modify_token_usage_keys(data, key_map):
    """
    Modifies the keys of 'token_usage' sub-dictionaries within a list of dictionaries
    based on a given mapping.

    Args:
    data (list of dict): The list of dictionaries containing the 'token_usage' key.
    key_map (dict): A dictionary where each key-value pair represents an old key (from 'token_usage')
                    and its new key name.

    Returns:
    None: The function modifies the list in place.
    """
    for item in data:
        if 'token_usage' not in item:
            continue

        # Create a new dictionary for updated keys
        new_token_usage = {}
        if isinstance(item['token_usage'], dict):
            # Iterate over each key-value pair in the original token_usage dictionary
            for key, value in item['token_usage'].items():
                # Use the new key from key_map if it exists, else keep the old key
                new_key = key_map.get(key, key)
                new_token_usage[new_key] = value
        elif isinstance(item['token_usage'], list):
            # Iterate over each key-value pair in the original token_usage dictionary
            for key, value in item['token_usage'][0].items():
                # Use the new key from key_map if it exists, else keep the old key
                new_key = key_map.get(key, key)
                new_token_usage[new_key] = value
        else:
            ValueError("without value")
        # Update the original dictionary with the new token_usage
        item['token_usage'] = new_token_usage


for fpath in tqdm(files):
    content = parse_data.read_jsonl(fpath)
    modify_token_usage_keys(content, token_term_tras)
    flag = True
    for i in content:
        if not isinstance(i['token_usage'], dict): 
            # print("TypeError", i['llm_parameter']['model_name'])
            flag = False
            continue
        if not i['token_usage']:
            # print("None", i['llm_parameter']['model_name'])
            flag = False
            continue
        # Note, minimax(abab6.5s-chat) only returns `total_tokens` and doesn't provide a token-count api
        if set(i['token_usage'].keys()) != set(['input_tokens', 'output_tokens', 'total_tokens']) and set(i['token_usage'].keys()) != set(['total_tokens']):
            # print("KeyError", i['llm_parameter']['model_name'])
            flag = False
    if flag:
        parse_data.write_jsonl(content, fpath)


100%|██████████| 447/447 [00:01<00:00, 241.79it/s]


# Obatin Answers

## Check Answers

In [None]:
# Iterate through each model
bad_cases = []
for model_name in tqdm(models):
    results = []
    model_res_dir = os.path.join(res_dir, model_name)
    # Iterate through each paper
    for paper in papers:
        paper_res_dir = os.path.join(model_res_dir, paper)
        
        # Get all .jsonl files
        fnames = [i for i in os.listdir(paper_res_dir) if i.endswith('.jsonl')]
        
        # Process each file
        for fname in fnames:
            fpath = os.path.join(paper_res_dir, fname)
            content = parse_data.read_jsonl(fpath)

            # Iterate through each item in the file
            for i in content:
                std_answer = i['question']['answer']
                ques_idx = i['question']['idx']
                res = i['result']

                ans_pattern = "{\n*\s*\t*" + f"['\"]?{ques_idx}['\"]?" + ": \[['\"]?.+\n*\s*\t*}"
                ans = re.search(re.compile(ans_pattern), res)
                
                if ans:
                    results.append(ans[0])
                else:
                    results.append(str({i['exp_uuid']: res}))
                    # Add question idx in the result if it doesn't include one
                    if re.search('^## \d{,2}\n', res):
                        bad_cases.append({'exp_uuid': i['exp_uuid'], 'result': res})
                    else:
                        bad_cases.append({'exp_uuid': i['exp_uuid'], 'result': f"## {i['question']['idx']}\n\n" + res})

    ans_fpath = f'{model_res_dir}{os.sep}{model_name}.txt'
    with open(ans_fpath, 'w') as f:
        for i in results:
            tmp = f.write(i + '\n')

bad_case_fpath = os.path.join(cwd, 'batch_api' + os.sep + 'openai' + os.sep + '2024-GaoKao_ans-bad-case_raw-data.jsonl')
bad_case_fpath
parse_data.write_jsonl(bad_cases, bad_case_fpath)

## Extract Answers 

This section consists of two parts:

1. using gpt4o extract answers and classify the repetition (completed in `./cookbook/batch_api_ans.ipynb`);
2. using regex to extract answers.

The workflow in there would be:

1. using regex to extract answers if 'answer' not in the key;
2. put the 'raw_answer' into the 'answer' dict;
3. determine whether the llm_answer is correct, and compute the score

Correct standard:

+ if a question has multiple answers, all the answers are correct and the question is correct, otherwise, wrong.

Data Structure:

+ 'answer'
  + 'raw_answer': the extracted JSON-like answer, `None` refers to can't extract by regex;
  + 'llm_answer': the processed answer;
  + 'correct': whether the answer is correct;
  + 'score': the score

In [17]:
# extract answers
for fpath in files:
    content = parse_data.read_jsonl(fpath)

    for i in content:
        ques_idx = i['question']['idx']
        std_answer = i['question']['answer']
        res = i['result']

        # pass the answer filtered by llm
        if 'answer' in i.keys():
            if 'repetition' in i['answer'].keys():
                continue

        i['answer'] = {}
        ans_pattern = "{\n*\s*\t*" + f"['\"]?{ques_idx}['\"]?" + ": \[['\"]?.+\n*\s*\t*}"
        ans = re.search(re.compile(ans_pattern), res)
        if not ans:
            i['answer']['raw_answer'] = []
            i['answer']['llm_answer'] = []
            continue
        ans = ans.group()
        i['answer']['raw_answer'] = ans

        # parse raw answer
        # replace \} with }
        ans = ans.replace('\\}', '}')
        # repalce ' with ""
        ans = ans.replace('\'', '"')
        # remove \n \s \t
        ans = re.sub(r'[\s\n\t]+', '', ans)
        # remove raw string indefier
        ans = ans.replace('r"', '"')
        # if 'frac' in ans:
        #         print(ans)
        # handle \ error
        ans = ans.replace('\\\\', '\\')
        ans = ans.replace('\\', '\\\\')
        try:
            # use json.loads instead of eval
            ans = json.loads(ans)
        except:
            # unable load the ans to json
            print('unable load the ans to json:', i['exp_uuid'], ans)
            i['answer']['llm_answer'] = []
            continue
        for k, v in ans.items():
            ans = v
        i['answer']['llm_answer'] = ans
    
    try:
        parse_data.write_jsonl(content, fpath)
    except:
        print('unable to save file:', fpath)

unable load the ans to json: 1895c17a-17e4-4dd3-b3d8-c82d69b17154 {12:["$\\frac{13}{12}$"]}
unable load the ans to json: bfc76c8a-7a14-47ec-998e-20b02bcc3289 {13:["$\\frac{4\\sqrt{2}}{7}$"]}
unable load the ans to json: 6a2cd8c0-f9a5-49a8-8592-a3a76a555cb0 {13:["$-\\frac{\\sqrt{2}}{2}$"]}
unable load the ans to json: f6ef3ea4-6d0b-4c8c-bde4-5de1a9a29938 {"12":["$S_{10}$"]}`,replacing$S_{10}
unable load the ans to json: 3555d06f-5d22-464b-8fa3-f3afcc406bca {"12":["$\\frac{13}{18}$"]}\\right}
unable load the ans to json: 206aafcd-a0af-4c17-ab41-f26c1ac1cc19 {6:["B"]}
unable load the ans to json: 4b0511b0-b23d-4f67-bb16-bef84416dbaf {2:["C"]}
unable load the ans to json: 23489ae8-14da-4833-a050-7e9c6e9e2fb5 {"7":["C"]}}
unable load the ans to json: 28a7dc13-adfb-4200-9c85-c6a8fdeb8319 {5:["B"]}}
unable load the ans to json: 221c75f6-2045-440d-8f19-3f0bdbb66829 {5:["C"]}
unable load the ans to json: dc2e311c-6373-4d95-8ce8-9aad0fb61aec {9:["B","C"]}
unable load the ans to json: 5444c5ad-ad

In [18]:
# check latex expression

# mannaully set \\-\\\\frac to -\\\\frac in claude

for fpath in files:
    content = parse_data.read_jsonl(fpath)

    for i in content:
        if i['question']['idx'] in ['12', '13', '13']:
            try:
                sympy_expr1 = latex2sympy(i['question']['answer'][0])
                # print(sympy_expr1)
            except:
                print(i['exp_uuid'], i['question']['answer'])

In [19]:
# check latex expression

# mannaully set \\-\\\\frac to -\\\\frac in claude

for fpath in files:
    content = parse_data.read_jsonl(fpath)

    for i in content:
        if i['question']['idx'] in ['12', '13', '13']:
            if i['answer']['llm_answer']:
                try:
                    sympy_expr1 = latex2sympy(i['answer']['llm_answer'][0])
                except:
                    print(i['exp_uuid'], i['answer']['llm_answer'])

8d1f3f16-6487-44bb-85f9-eea8d2ec18d6 ['\\boxed{\\frac{3}{2}}']
fed2d9f6-a83d-43ad-a742-1b3b149adc40 ['\\boxed{\\frac{3}{4}}']
c4ce99ff-26b1-4f34-8b97-8e9e4394a0bc ['\\boxed{-\\frac{2\\sqrt{2}}{3}}']
b9ed660b-3e9a-492a-9246-f3d3a5a22bf8 ['$\\-\\frac{2\\sqrt{2}}{3}$']
6e15cc83-5b44-4362-825b-6666da9aa1b6 ['\\boxed{-\\frac{2\\sqrt{2}}{3}}']
bda44b04-d24a-4e31-9532-8173006c2966 ['$\\-\\frac{2\\sqrt{2}}{3}$']
66213b49-ea59-4d1c-bd7a-411ebe602e38 ['$\\-\\frac{3\\sqrt{10}}{10}$']
769e683b-3713-4cf3-b4b3-e67ae6c8bc3a ['$\\-\\frac{2\\sqrt{2}}{3}$']
e15f8545-88e0-4341-b64a-21cebb70fd2d ['$\\-\\frac{2\\sqrt{2}}{3}$']
cc399e8a-6d51-4e65-a233-6609f1dc77da ['\\boxed{-\\frac{2\\sqrt{2}}{3}}']
eb238bd2-9c6b-463e-b11a-12e27c552abc ['$\\-\\frac{2\\sqrt{2}}{3}$']
5bcaf655-1d6e-465a-908d-dcbeb3cb4963 ['$\\-\\frac{2\\sqrt{2}}{3}$']
28b7ac0f-1b72-4b93-8751-3111b422837a ['\\boxed{95}']
815c6653-638d-4d00-9e21-4ef893162f29 ['$e\\in(\\sqrt{2},+\\infty)$']
970a3cd6-7102-4ddb-aa68-609f146ff218 ['$\\pm\\frac{2\\s

In [20]:
def are_latex_expressions_equal(expr1: str, expr2: str) -> bool:
    """
    Check if two LaTeX expressions are mathematically equivalent.

    Args:
    expr1 (str): First LaTeX expression.
    expr2 (str): Second LaTeX expression.

    Returns:
    bool: True if the expressions are equivalent, False otherwise.
    """
    try:
        # Convert LaTeX expressions to SymPy expressions
        sympy_expr1 = latex2sympy(expr1)
        sympy_expr2 = latex2sympy(expr2)
        
        # Simplify both expressions to their simplest form
        simplified_expr1 = simplify(sympy_expr1)
        simplified_expr2 = simplify(sympy_expr2)
        
        # Compare the simplified expressions for equality
        return simplified_expr1 == simplified_expr2
    except Exception:
        print("expr1:", expr1, "expr2:", expr2)
        return False
    
def compute_score(ques_idx: str, llm_ans: list, std_ans: list):
    if not llm_ans:
        correct = False
        score = 0  
        return correct, score 
    if ques_idx in ['1', '2', '3', '4', '5', '6', '7', '8']:
        if llm_ans == std_ans:
            correct = True
            score = 5
        else:
            correct = False
            score = 0
    # multiple choice: multiple answer
    elif ques_idx in ['9', '10', '11']:
        std_ans = set(std_ans)
        llm_ans = set(llm_ans)
        if std_ans == llm_ans:
            correct = True
            score = 6
        elif llm_ans.issubset(std_ans):
            correct = False
            score = 3
        else:
            correct = False
            score = 0
    elif ques_idx in ['12', '13', '14']:
        if len(std_ans) == 1:
            if are_latex_expressions_equal(llm_ans[0], std_ans[0]):
                correct = True
                score = 5
            else:
                correct = False
                score = 0
        elif len(std_ans) == 2:
            if len(llm_ans) == 2:
                if are_latex_expressions_equal(llm_ans[0], std_ans[0]) and are_latex_expressions_equal(llm_ans[1], std_ans[1]):
                    correct = True
                    score = 5
                elif not are_latex_expressions_equal(llm_ans[0], std_ans[0]) and not are_latex_expressions_equal(llm_ans[1], std_ans[1]):
                    correct = False
                    score = 0
                else:
                    correct = False
                    score = 3
            else:
                correct = False
                score = 0              
        else:
            correct = False
            score = 0  
    else:
        correct = False
        score = 0  

    if 'correct' not in locals():
        print(ques_idx, llm_ans, std_ans)
    return correct, score

In [24]:
### manully set
# 1. replace $\\-\\frac{2\\sqrt{2}}{3}$ with $-\\frac{2\\sqrt{2}}{3}$, deepseek-chat/2024-2/13.jsonl
# 2. replace \\\\frac{-2\\sqrt{2}}{3} with \\frac{-2\\sqrt{2}}{3}, gpt-4o-2024-05-13/2024-2/13.jsonl
# 3. repalce ["\\boxed{95}"] => ["$95$"], /deepseek-chat/2024-2/12.jsonl
# 4. "llm_answer": ["\\boxed{-\\frac{2\\sqrt{2}}{3}}"] => "llm_answer": ["-\\frac{2\\sqrt{2}}{3}"], deepseek-chat/2024-2/13.jsonl
# 5. \\\\frac{3}{2} => \\frac{3}{2}, deepseek-chat/2024-1/12.jsonl
# 6. \\boxed{\\frac{3}{2}} => \\frac{3}{2}, deepseek-chat/2024-1/12.jsonl

#   \\$95\\$ to $95$ in gpt4o and qwen2-72b
### 

for fpath in files:
    content = parse_data.read_jsonl(fpath)
    # print(fpath)

    for i in content:
        ques_idx = i['question']['idx']
        llm_ans = i['answer']['llm_answer']
        std_ans = i['question']['answer']
        correct, score = compute_score(ques_idx, llm_ans, std_ans)
        i['answer']['correct'] = correct
        i['answer']['score'] = score

    # parse_data.write_jsonl(content, fpath)

expr1: $ln(2)$ expr2: $\ln2$
expr1: $ln(2)$ expr2: $\ln2$
expr1: $ln(2)$ expr2: $\ln2$
expr1: $ln(2)$ expr2: $\ln2$
expr1: \boxed{\frac{3}{4}} expr2: $\frac{3}{2}$
expr1: \boxed{\frac{17}{32}} expr2: $\frac{1}{2}$
expr1: \boxed{\frac{1971}{4096}} expr2: $\frac{1}{2}$
expr1: \boxed{\frac{175}{256}} expr2: $\frac{1}{2}$
expr1: $1-\frac{1}{2^{2n}}$ expr2: $\frac{1}{2}$
expr1: \boxed{\frac{11}{16}} expr2: $\frac{1}{2}$
expr1: \boxed{\frac{11}{16}} expr2: $\frac{1}{2}$
expr1: \boxed{\frac{15}{16}} expr2: $\frac{1}{2}$
expr1: \boxed{\frac{11}{16}} expr2: $\frac{1}{2}$
expr1: $\-\frac{3\sqrt{10}}{10}$ expr2: $-\frac{2\sqrt{2}}{3}$
expr1: $a=1-\ln(2)-\frac{1}{2}e^{-\frac{1}{2}}$ expr2: $\ln2$
expr1: Nosolution expr2: $\ln2$
expr1: $e\in(\sqrt{2},+\infty)$ expr2: $\frac{3}{2}$
expr1: $\frac{5}{b}$ expr2: $\frac{3}{2}$
expr1: $\frac{18}{a}$ expr2: $\frac{3}{2}$
expr1: $-4\sqrt{\frac{(4x-\sqrt{1-y^2}y)^2-2(\sqrt{2}+1)x-1}{2\sqrt{2}+1}}$ expr2: $-\frac{2\sqrt{2}}{3}$
expr1: $a=\ln(2)$ expr2: $\ln2