In [None]:
import sys
sys.path.append('tools')
import os
import re
import uuid
import json
import time
from collections import Counter

from IPython.core.display import HTML
import numpy as np
import pandas as pd
from sympy import sympify, Eq, simplify
from latex2sympy2 import latex2sympy

import parse_data
import api



In [8]:
# Root directory that holds one sub-directory of inference results per model.
cwd = os.getcwd()
res_dir = os.path.join(cwd, 'result')

# Exam papers that are evaluated; later cells look for one sub-directory per
# paper inside every model directory.
papers = ['2024-1', '2024-2']

# Every entry of the result directory is treated as a model name. The macOS
# Finder metadata file is filtered out instead of being dropped with
# list.remove(), which raises ValueError whenever ".DS_Store" is absent.
models = [name for name in os.listdir(res_dir) if name != ".DS_Store"]

# Inference Results

In [4]:
# Inference time (seconds) credited to any model whose name contains "gpt".
# Those runs went through the batch API, so the per-record 'infer_time(s)'
# values are not summed for them.
# NOTE(review): presumably the wall-clock duration of the whole batch job -
# confirm where 2136 comes from.
GPT_BATCH_INFER_TIME_S = 2136

# model name -> {'Total_tokens', 'Infer_time(s)', 'Infer_speed(tokens/s)'}
results = {}

for model in models:

    model_res_dir = os.path.join(res_dir, model)
    fpaths = parse_data.find_files_with_suffix(model_res_dir, '.jsonl')
    total_tokens, infer_time = 0, 0

    for fpath in fpaths:
        for record in parse_data.read_jsonl(fpath):
            usage = record['token_usage']
            # Records without token usage contribute neither tokens nor time.
            if not usage:
                continue

            total_tokens += usage['total_tokens']
            if "gpt" in model:
                infer_time = GPT_BATCH_INFER_TIME_S
            else:
                infer_time += record['infer_time(s)']

    # A model without any usable record has infer_time == 0; report a speed of
    # 0.0 instead of raising ZeroDivisionError (this also keeps the column
    # fully numeric for the highlighting step).
    infer_speed = round(total_tokens / infer_time, 2) if infer_time else 0.0

    results[model] = {
        'Total_tokens': total_tokens,
        'Infer_time(s)': infer_time,
        'Infer_speed(tokens/s)': infer_speed,
    }

# One row per model, one column per metric (raw numeric values, unformatted).
df = pd.DataFrame(results).T

# Fastest models first.
df = df.sort_values(by='Infer_speed(tokens/s)', ascending=False)

# Styler callback: colour the three largest values of a column.
def highlight_top_three(s):
    """Return one CSS string per element of ``s`` marking its top three values.

    The largest value is coloured blue, the second largest green and the third
    largest pink; every other element gets an empty style. Elements equal to a
    top-three value share the colour of that value's first occurrence in the
    descending order. A column containing anything that ``pd.to_numeric``
    cannot convert is left completely unstyled.
    """
    if not pd.to_numeric(s, errors='coerce').notnull().all():
        return ['' for _ in s]

    palette = ['color: blue', 'color: green', 'color: pink']
    leaders = s.sort_values(ascending=False).head(3).values
    ranked = leaders.tolist()
    return [palette[ranked.index(v)] if v in leaders else '' for v in s]

# Colour the three largest values of every metric column, caption the table
# and render each metric as a thousands-separated number with two decimals.
metric_columns = ['Total_tokens', 'Infer_time(s)', 'Infer_speed(tokens/s)']
styled_df = (
    df.style
    .apply(highlight_top_three, subset=metric_columns)
    .set_caption("Model Inference Results")
    .format({column: '{:,.2f}' for column in metric_columns})
)

styled_df
# Caveats printed with the table (the text itself is Chinese): part of the
# total_tokens data of abab6.5s-chat and yi-large was lost through a data
# management mistake, so their totals are the lowest; gpt4o was run through
# the batch API, which is why its speed is stated to be two orders of
# magnitude higher.
print("""Note:
- 因为数据管理的失误导致 abab6.5s-chat 和 yi-large 的 total_otkens 部分丢失，
      所以这两个的 total_tokens 最低的。
- 因为 gpt4o 使用了 batch API 进行推理，所以推理速度高出了两个量级。
""")

Unnamed: 0,Total_tokens,Infer_time(s),Infer_speed(tokens/s)
gpt-4o-2024-05-13,2432776.0,2136.0,1138.94
abab6.5s-chat,1693468.0,24510.42,69.09
doubao-pro-32k-240615,1780180.0,35288.62,50.45
mistral-large-2402,2255747.0,45889.72,49.16
glm-4-0520,2675433.0,57578.3,46.47
qwen2-72b-instruct,2299737.0,61043.24,37.67
moonshot-v1-8k,2701808.0,77959.69,34.66
ERNIE-4.0-Turbo-8K,2122628.0,63461.0,33.45
llama3-70b-instruct,1994395.0,61382.33,32.49
deepseek-coder,2606396.0,82008.87,31.78


Note:
- 因为数据管理的失误导致 abab6.5s-chat 和 yi-large 的 total_otkens 部分丢失，
      所以这两个的 total_tokens 最低的。
- 因为 gpt4o 使用了 batch API 进行推理，所以推理速度高出了两个量级。



# Instruction-following

## Response Structure Format

+ **Motivation**: To investigate whether the model strictly adheres to the specified response format given in the system prompt or at the beginning of the user prompt.
+ **Explanation**:
  + The response format specified in the system prompt consists of two or three sections: "### Original Problem" (optional), "### Solution Process", and "### Answer". In addition, every response must start with a question-index heading such as "## 1".
  + The evaluation has two modes, strict and lax. In strict mode every heading must be followed immediately by a newline (`\n`), e.g. "### Original Problem\n"; in lax mode only the heading text itself has to appear.
  + A response that contains all required parts is counted as following the instruction (`True`); otherwise it is counted as `False`.
  + Bad cases are grouped by what is missing: exactly one part (one column per part), all parts ("ALL"), or any other combination of parts ("Others").
  + The table is sorted by the `('Instruction Following', 'True')` column in descending order.

In [5]:
def validate_content_format(content_text: str, format_settings: dict, validation_mode: str) -> tuple:
    """
    Check that a model response contains every required section heading.

    The required headings depend on the prompt language and on whether the
    response was asked to repeat the question. Each heading is a regular
    expression evaluated with ``re.search`` and no flags, so the question-index
    pattern ``^## \\d{,2}`` only matches at the very start of the text, and
    ``\\d{,2}`` also accepts zero digits.

    Args:
    content_text (str): The text to be validated.
    format_settings (dict): Must provide 'lang' ("zh" or "en") and
        'print_question' (bool; True when the question section is required).
    validation_mode (str): 'strict' additionally requires a newline directly
        after every heading; any other value is treated as lax mode.

    Returns:
    tuple: ``(patterns_found, missing)``. ``patterns_found`` is True when no
           required pattern is missing. ``missing`` is the sorted tuple of
           missing patterns, with 'ALL' appended as the last element when every
           required pattern is missing.

    Raises:
    KeyError: If 'lang' or 'print_question' is absent from ``format_settings``
        or holds an unsupported value.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    question_index = r"^## \d{,2}"

    # Patterns based on language and whether the question is included in the format
    format_patterns = {
        "zh": {
            True: [question_index, "### 原题", "### 答案", "### 解题过程"],
            False: [question_index, "### 答案", "### 解题过程"],
        },
        "en": {
            True: [question_index, "### Original Problem", "### Answer", "### Solution Process"],
            False: [question_index, "### Answer", "### Solution Process"],
        },
    }

    # Only the list that is actually needed gets the strict-mode newline suffix.
    required_patterns = format_patterns[format_settings['lang']][format_settings['print_question']]
    if validation_mode == 'strict':
        required_patterns = [pattern + "\n" for pattern in required_patterns]

    # Sorted so that the same set of missing headings always yields the same tuple.
    missing_patterns = sorted(
        pattern for pattern in required_patterns if not re.search(pattern, content_text)
    )
    patterns_found = not missing_patterns

    if set(missing_patterns) == set(required_patterns):
        missing_patterns.append('ALL')

    return patterns_found, tuple(missing_patterns)

def process_results(models, papers, res_dir, validation_mode):
    """
    Summarise, per model, how many responses follow the required section layout.

    Every '.jsonl' file found under ``res_dir/<model>`` is read and each entry's
    'result' text is checked with ``validate_content_format`` using the entry's
    'prompt_parameter'. Failures are broken down into: exactly one missing
    heading (one column per heading), every heading missing ('ALL'), and any
    other combination ('Others').

    Args:
        models (list): List of model names to process.
        papers (list): List of papers (currently unused in this function).
        res_dir (str): Directory where the model result files are located.
        validation_mode (str): 'strict' requires a newline after every heading;
            any other value is handled as lax mode.

    Returns:
        pandas Styler: The per-model table, sorted by the number of compliant
        responses in descending order, wrapped in a Styler that carries a
        caption naming the validation mode.
    """
    strict = validation_mode == 'strict'

    # Heading patterns in display order. Raw string for the regex: "\d" in a
    # normal literal is an invalid escape sequence.
    base_patterns = [
        r"^## \d{,2}", "### 原题", "### Original Problem", "### 答案",
        "### Answer", "### 解题过程", "### Solution Process"
    ]
    # Keys as reported by validate_content_format: a real newline in strict mode.
    patterns = [pattern + "\n" for pattern in base_patterns] if strict else base_patterns
    # Column labels: strict mode shows a literal backslash-n so it is visible.
    labels = [pattern + r"\n" for pattern in base_patterns] if strict else base_patterns

    rows = []
    model_names = []

    for model_name in models:
        model_res_dir = os.path.join(res_dir, model_name)
        fpaths = parse_data.find_files_with_suffix(model_res_dir, '.jsonl')

        verdicts = Counter()       # True / False -> number of responses
        missing_kinds = Counter()  # tuple of missing patterns -> number of responses

        for fpath in fpaths:
            for entry in parse_data.read_jsonl(fpath):
                followed, missing = validate_content_format(
                    entry['result'], entry['prompt_parameter'], validation_mode
                )
                verdicts[followed] += 1
                if not followed:
                    missing_kinds[missing] += 1

        true_count = verdicts[True]
        false_count = verdicts[False]
        total = true_count + false_count
        rate = f"{true_count / total:.2%}" if total != 0 else "0.00%"

        # Responses missing exactly one heading, per heading.
        single_missing = [missing_kinds[(pattern,)] for pattern in patterns]
        # validate_content_format appends 'ALL' when every heading is missing.
        all_missing = sum(count for kind, count in missing_kinds.items() if kind[-1] == 'ALL')
        # Whatever is left misses some other combination of headings.
        others = false_count - sum(single_missing) - all_missing

        rows.append([true_count, false_count, rate, *single_missing, all_missing, others])
        model_names.append(model_name)

    # Both modes share one column layout; only the heading labels differ.
    columns = pd.MultiIndex.from_tuples(
        [('Instruction Following', name) for name in ('True', 'False', 'Rate')]
        + [('Bad Cases', label) for label in (*labels, 'ALL', 'Others')]
    )

    # Create DataFrame and sort by 'True' count in descending order
    df = pd.DataFrame(rows, index=model_names, columns=columns)
    df.index.name = 'Model'
    df = df.sort_values(by=('Instruction Following', 'True'), ascending=False)

    return df.style.set_caption(
        f'Instruction-Following Degree in Response Structure Formatting: {validation_mode} mode'
    )

# Strict mode: every required heading must be followed directly by a newline.
# NOTE(review): a bare expression is only rendered when it is the last
# statement of its cell - confirm that this table is displayed as intended.
df = process_results(models, papers, res_dir, 'strict')
df

# Lax mode: the heading text alone is sufficient.
df = process_results(models, papers, res_dir, 'lax')
df

Unnamed: 0_level_0,Instruction Following,Instruction Following,Instruction Following,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases
Unnamed: 0_level_1,True,False,Rate,"^## \d{,2}\n",### 原题\n,### Original Problem\n,### 答案\n,### Answer\n,### 解题过程\n,### Solution Process\n,ALL,Others
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
gpt-4o-2024-05-13,2177,63,97.19%,52,0,0,6,3,0,0,0,2
moonshot-v1-8k,2172,68,96.96%,59,0,0,5,4,0,0,0,0
claude-3-5-sonnet-20240620,2081,159,92.90%,159,0,0,0,0,0,0,0,0
qwen-max,2043,197,91.21%,164,0,0,11,22,0,0,0,0
deepseek-chat,2038,202,90.98%,197,0,0,0,1,0,0,4,0
deepseek-coder,2004,236,89.46%,230,0,0,6,0,0,0,0,0
llama3-70b-instruct,1987,253,88.71%,248,0,0,2,3,0,0,0,0
qwen2-72b-instruct,1899,341,84.78%,340,0,0,0,0,0,0,0,1
mistral-large-2402,1861,379,83.08%,379,0,0,0,0,0,0,0,0
glm-4-0520,1842,398,82.23%,246,0,0,12,7,51,3,0,79


Unnamed: 0_level_0,Instruction Following,Instruction Following,Instruction Following,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases,Bad Cases
Unnamed: 0_level_1,True,False,Rate,"^## \d{,2}",### 原题,### Original Problem,### 答案,### Answer,### 解题过程,### Solution Process,ALL,Others
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
gemini-1.5-pro-001,2240,0,100.00%,0,0,0,0,0,0,0,0,0
gpt-4o-2024-05-13,2228,12,99.46%,1,0,2,6,3,0,0,0,0
moonshot-v1-8k,2175,65,97.10%,56,0,0,5,4,0,0,0,0
claude-3-5-sonnet-20240620,2081,159,92.90%,159,0,0,0,0,0,0,0,0
deepseek-chat,2077,163,92.72%,158,0,0,0,1,0,0,0,4
glm-4-0520,2074,166,92.59%,14,0,66,12,14,51,4,0,5
qwen-max,2047,193,91.38%,164,0,0,9,20,0,0,0,0
deepseek-coder,2004,236,89.46%,230,0,0,6,0,0,0,0,0
llama3-70b-instruct,1995,245,89.06%,240,0,0,2,3,0,0,0,0
qwen2-72b-instruct,1911,329,85.31%,328,0,0,0,0,0,0,0,1


## Answer Format

+ **Motivation**: To investigate whether the model strictly adheres to the specified answer format given in the system prompt or at the beginning of the user prompt.
+ **Required Format**: 
  - **Multiple Choice**: `{'{question number}': [{answers}]}`, for example, `{'99': ['E']}` or `{'98': ['E', 'F']}`.
  - **Fill-in-the-Blank**: `{'{question number}': [{LaTeX format answers}]}`, for example, `{'97': ['$0$']}` or `{'96': ['$0$', '$1$']}`.
+ **Explanation**:
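  + Three patterns are checked, from the strictest to the most lenient:
    + The required format: `{'{ques_idx}': \['.+'\]}`
    + Single or double quotes are both accepted, and are optional: `{['\"]?{ques_idx}['\"]?: \[['\"]?.+['\"]?\]}`
    + Additionally, `\n`, `\s` and `\t` are accepted after the opening brace and before the closing brace: `{\n*\s*\t*['\"]?{ques_idx}['\"]?: \[['\"]?.+\n*\s*\t*}`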
  + The table is sorted by the average column.
  + Mark the top 3 values for each column as "blue", "green", and "pink" respectively.

In [6]:
results = []

# Optional single or double quote, shared by the two lenient patterns.
opt_quote = "['\"]?"

# Iterate through each model
for model_name in models:
    model_res_dir = os.path.join(res_dir, model_name)

    # n1..n3 count the responses that FAIL pattern 1..3; total counts every
    # response inspected. The denominator is counted rather than hard-coded,
    # so a model with missing result records is not credited with passes.
    n1, n2, n3, total = 0, 0, 0, 0

    # Iterate through each paper
    for paper in papers:
        paper_res_dir = os.path.join(model_res_dir, paper)

        # Get all .jsonl files
        fnames = [i for i in os.listdir(paper_res_dir) if i.endswith('.jsonl')]

        # Process each file
        for fname in fnames:
            fpath = os.path.join(paper_res_dir, fname)
            content = parse_data.read_jsonl(fpath)

            # Iterate through each item in the file
            for item in content:
                ques_idx = item['question']['idx']
                res = item['result']
                total += 1

                # The regex fragments are raw strings: "\[" and "\s" in normal
                # literals are invalid escape sequences. The resulting regexes
                # match the same text as before.

                # 1) exactly the required format, single quotes only
                pattern_1 = "{" + rf"'{ques_idx}': \['.+'\]" + "}"
                if not re.search(pattern_1, res):
                    n1 += 1

                # 2) single or double quotes, all optional
                pattern_2 = (
                    "{" + opt_quote + f"{ques_idx}" + opt_quote
                    + r": \[" + opt_quote + ".+" + opt_quote + r"\]" + "}"
                )
                if not re.search(pattern_2, res):
                    n2 += 1

                # 3) as 2), plus newlines/whitespace/tabs inside the braces
                pattern_3 = (
                    r"{\n*\s*\t*" + opt_quote + f"{ques_idx}" + opt_quote
                    + r": \[" + opt_quote + r".+\n*\s*\t*}"
                )
                if not re.search(pattern_3, res):
                    n3 += 1

    # Calculate accuracy for each pattern and the average; a model without any
    # response scores 0 instead of raising ZeroDivisionError.
    accuracy_1, accuracy_2, accuracy_3 = (
        (total - failed) / total if total else 0.0 for failed in (n1, n2, n3)
    )
    avg_accuracy = (accuracy_1 + accuracy_2 + accuracy_3) / 3

    # Width-8 right-aligned strings: lexicographic order equals numeric order,
    # which the string sort on 'Average' below relies on.
    results.append((model_name, [f"{accuracy_1:8.2%}", f"{accuracy_2:8.2%}", f"{accuracy_3:8.2%}", f"{avg_accuracy:8.2%}"]))

# Define column names (the pattern templates, shown verbatim)
columns = pd.Index([r"{'{ques_idx}': \['.+'\]}", r"{['\"]?{ques_idx}['\"]?: \[['\"]?.+['\"]?\]}", r"{\n*\s*\t*['\"]?{ques_idx}['\"]?: \[['\"]?.+\n*\s*\t*}", 'Average'])

# Create DataFrame
df = pd.DataFrame([data for _, data in results], index=[model for model, _ in results], columns=columns)
df.index.name = 'Model'

# Sort the DataFrame by the Average column
df = df.sort_values(by='Average', ascending=False)

# Function to highlight the top three values in each column
def highlight_top_three(s):
    """Return one CSS string per element of a column of percentage strings.

    ``s`` holds strings such as ``'99.73%'``. The trailing ``%`` is stripped
    and the rest is converted to float. The largest value is coloured blue,
    the second largest green and the third largest pink; all other elements
    get an empty style. When the top three contain equal values, every element
    with that value takes the colour of the best rank it ties with.

    Columns with fewer than three rows are supported: only as many colours as
    there are rows are handed out (indexing the third-largest value
    unconditionally would raise IndexError).
    """
    numeric = s.str.rstrip('%').astype('float')
    leaders = numeric.sort_values(ascending=False).head(3).tolist()
    colors = ['color: blue', 'color: green', 'color: pink']

    def color_font(val_float):
        # The first matching rank wins, so ties share the better colour.
        for leader, color in zip(leaders, colors):
            if val_float == leader:
                return color
        return ''

    return [color_font(v) for v in numeric]

# Apply the highlighting function and set caption.
# Note: `df` is rebound here from the DataFrame to the Styler returned by
# `.style`, so DataFrame methods are no longer available on it afterwards.
df = df.style.apply(highlight_top_three, subset=columns).set_caption('Instruction-Following Degree in Answer Formatting')

# Display the styled DataFrame
df

Unnamed: 0_level_0,{'{ques_idx}': \['.+'\]},"{['\""]?{ques_idx}['\""]?: \[['\""]?.+['\""]?\]}","{\n*\s*\t*['\""]?{ques_idx}['\""]?: \[['\""]?.+\n*\s*\t*}",Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude-3-5-sonnet-20240620,99.73%,99.73%,100.00%,99.82%
gemini-1.5-pro-001,98.93%,98.93%,99.64%,99.17%
llama3-70b-instruct,98.35%,99.06%,99.55%,98.99%
doubao-pro-32k-240615,97.63%,98.35%,99.91%,98.63%
qwen2-72b-instruct,97.28%,98.62%,99.73%,98.54%
glm-4-0520,96.88%,97.19%,98.53%,97.53%
mistral-large-2402,96.65%,96.88%,98.48%,97.34%
moonshot-v1-8k,95.22%,97.68%,99.06%,97.32%
gpt-4o-2024-05-13,95.49%,95.71%,97.99%,96.40%
deepseek-chat,92.99%,95.31%,97.86%,95.39%


## Response Structure & Answer Format

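+ Combine the two metrics above to compare the models: "Response Structure" is the mean of the strict and lax response-structure rates, "Answer" is the average answer-format rate, and "Average" is the mean of those two columns.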

In [7]:
# Define the original dictionaries
# NOTE(review): these percentages appear to be copied by hand from the tables
# produced by earlier cells rather than recomputed - confirm they are current.
# response structure: strict
dict1 = {
    'gpt-4o-2024-05-13': '97.19%',
    'moonshot-v1-8k': '96.96%',
    'claude-3-5-sonnet-20240620': '92.90%',
    'qwen-max': '91.21%',
    'deepseek-chat': '90.98%',
    'deepseek-coder': '89.46%',
    'llama3-70b-instruct': '88.71%',
    'qwen2-72b-instruct': '84.78%',
    'mistral-large-2402': '83.08%',
    'glm-4-0520': '82.37%',
    'doubao-pro-32k-240615': '78.66%',
    'ERNIE-4.0-Turbo-8K': '77.14%',
    'abab6.5s-chat': '68.12%',
    'gemini-1.5-pro-001': '56.25%',
    'yi-large': '33.93%',
    'baichuan4': '13.98%'
}

# response structure: lax
dict2 = {
    'gemini-1.5-pro-001': '100.00%',
    'gpt-4o-2024-05-13': '99.46%',
    'moonshot-v1-8k': '97.10%',
    'claude-3-5-sonnet-20240620': '92.90%',
    'glm-4-0520': '92.77%',
    'deepseek-chat': '92.72%',
    'qwen-max': '91.38%',
    'deepseek-coder': '89.46%',
    'llama3-70b-instruct': '89.06%',
    'qwen2-72b-instruct': '85.31%',
    'mistral-large-2402': '83.84%',
    'yi-large': '81.29%',
    'doubao-pro-32k-240615': '78.66%',
    'ERNIE-4.0-Turbo-8K': '78.08%',
    'abab6.5s-chat': '68.57%',
    'baichuan4': '13.98%'
}

# answer format
dict3 = {
    'claude-3-5-sonnet-20240620': '99.82%',
    'gemini-1.5-pro-001': '99.17%',
    'llama3-70b-instruct': '98.99%',
    'doubao-pro-32k-240615': '98.63%',
    'qwen2-72b-instruct': '98.54%',
    'glm-4-0520': '97.68%',
    'mistral-large-2402': '97.34%',
    'moonshot-v1-8k': '97.32%',
    'gpt-4o-2024-05-13': '96.40%',
    'deepseek-chat': '95.39%',
    'abab6.5s-chat': '93.38%',
    'yi-large': '91.32%',
    'qwen-max': '90.25%',
    'baichuan4': '81.46%',
    'deepseek-coder': '70.64%',
    'ERNIE-4.0-Turbo-8K': '67.07%'
}

# Response-structure score: mean of the strict and lax rates, for the models
# that appear in both dictionaries.
avg_dict = {
    model_name: (float(dict1[model_name].strip('%')) + float(dict2[model_name].strip('%'))) / 2
    for model_name in dict1
    if model_name in dict2
}

# Integrate avg_dict and dict3 into a single DataFrame.
# A dedicated name is used for the row labels so that the notebook-wide
# `models` list (the result directories) is not overwritten by this cell.
format_models = sorted(set(avg_dict.keys()).union(dict3.keys()))

# A model missing from one source gets NaN: float('N/A') would raise
# ValueError, and a string placeholder would break the numeric column.
missing_value = float('nan')
data = {
    'Response Structure': [avg_dict.get(model_name, missing_value) for model_name in format_models],
    'Answer': [
        float(dict3[model_name].strip('%')) if model_name in dict3 else missing_value
        for model_name in format_models
    ]
}

# Create the DataFrame
df = pd.DataFrame(data, index=format_models)
df.index.name = 'Model'

# Calculate the average of the two columns and add it as a new column
df['Average'] = df.mean(axis=1)

# Sort the DataFrame by the Average column
df = df.sort_values(by='Average', ascending=False)

# Styler callback for numeric columns: colour the three largest values.
def highlight_top_three(s):
    """Build the per-element CSS list that marks a column's three largest values.

    Blue marks the largest value, green the second and pink the third; all
    other elements receive ''. Elements equal to a top-three value take the
    colour of that value's first position in the descending order. If any
    element cannot be converted by ``pd.to_numeric`` the whole column is
    returned unstyled.
    """
    styles = [''] * len(s)
    all_numeric = pd.to_numeric(s, errors='coerce').notnull().all()
    if not all_numeric:
        return styles

    rank_colors = ['color: blue', 'color: green', 'color: pink']
    top_values = s.sort_values(ascending=False).head(3).values
    top_list = top_values.tolist()
    for position, value in enumerate(s):
        if value in top_values:
            styles[position] = rank_colors[top_list.index(value)]
    return styles

# Build the styled table in one chain: colour the top three of every score
# column, render the scores as percentages with two decimals, add the caption.
score_columns = ['Response Structure', 'Answer', 'Average']
styled_df = (
    df.style
    .apply(highlight_top_three, subset=score_columns)
    .format({column: '{:.2f}%' for column in score_columns})
    .set_caption('Instruction-following in Formatting')
)

# Display the styled DataFrame
styled_df


Unnamed: 0_level_0,Response Structure,Answer,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gpt-4o-2024-05-13,98.32%,96.40%,97.36%
moonshot-v1-8k,97.03%,97.32%,97.17%
claude-3-5-sonnet-20240620,92.90%,99.82%,96.36%
llama3-70b-instruct,88.88%,98.99%,93.94%
deepseek-chat,91.85%,95.39%,93.62%
glm-4-0520,87.57%,97.68%,92.62%
qwen2-72b-instruct,85.05%,98.54%,91.79%
qwen-max,91.29%,90.25%,90.77%
mistral-large-2402,83.46%,97.34%,90.40%
gemini-1.5-pro-001,78.12%,99.17%,88.65%


# Answer Accuracy

## Question Index

```json
ans_data = {
    "paper_type": 
    {
        "model_name": {
            "question_idx":{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

In [10]:
# Give one named row (data cells and its row header) a yellow background.
def highlight_row_name(df, row_name):
    """Highlight the row labelled ``row_name`` in yellow.

    ``df`` is used through ``.apply(func, axis=1)`` and
    ``.set_table_styles(...)``, i.e. it is expected to behave like a pandas
    Styler. The data cells of the matching row are coloured through a row-wise
    style function, and the row's header cell through a table style keyed by
    ``row_name``. Returns the object produced by ``df.apply``.
    """
    def highlight_avg(row):
        # Same style for every cell of the row: yellow for the target row,
        # nothing for all others.
        cell_style = 'background-color: yellow' if row.name == row_name else ''
        return [cell_style] * len(row)

    styled = df.apply(highlight_avg, axis=1)

    # axis=1 makes the dictionary keys refer to row labels; overwrite=False
    # keeps table styles that were set earlier.
    header_rule = [{'selector': 'th', 'props': 'background-color: yellow;'}]
    styled.set_table_styles({row_name: header_rule}, overwrite=False, axis=1)
    return styled

def generate_acc_table(ans_data, papers, paper_type):
    """Build styled accuracy tables: one per paper plus a cross-paper summary.

    Args:
        ans_data (dict): Nested mapping paper -> model name -> question key ->
            metrics dict. Only ``metrics["accuracy"]`` is read. Question keys
            must become an integer once the text 'question' is removed.
        papers (list): Papers to report; each must be a key of ``ans_data``.
        paper_type (str): Text appended to the table captions.

    Returns:
        list: One Styler per paper (models x question index, with an 'Avg'
        column and a trailing 'Avg' row), followed by a Styler holding every
        model's per-paper average and the overall average. The 'Avg' row of
        the per-paper tables is carried into the summary like a model and is
        highlighted in yellow there.
    """
    tables = []
    model_paper_avgs = []

    for paper in papers:
        records = [
            [model_name, int(question_idx.replace('question', '')), metrics["accuracy"]]
            for model_name, questions in ans_data[paper].items()
            for question_idx, metrics in questions.items()
        ]
        df = pd.DataFrame(records, columns=["Model", "Question Index", "Accuracy"])

        # Pivot table to get the desired format: models as rows, questions as columns
        pivot_df = df.pivot(index="Model", columns="Question Index", values="Accuracy")

        # Calculate the average accuracy for each model and add as a new column
        pivot_df['Avg'] = pivot_df.mean(axis=1)

        # Sort by Avg column in descending order
        pivot_df = pivot_df.sort_values(by='Avg', ascending=False)

        # Append the per-column mean as the last row, after sorting so that it
        # stays at the bottom
        pivot_df.loc['Avg'] = pivot_df.mean()

        # Highlight every column that is actually present. The subset is taken
        # from the table itself instead of assuming questions 1..14, which
        # fails for papers with a different number of questions.
        styled_df = pivot_df.style.apply(highlight_top_three, subset=list(pivot_df.columns))

        # Set the caption for the DataFrame
        styled_df = styled_df.set_caption(f'{paper} Question Accuracy: {paper_type}')

        styled_df = styled_df.format('{:.2%}')

        tables.append(styled_df)

        # Collect model-avg values for each paper (this includes the 'Avg' row)
        for model_name, avg in pivot_df['Avg'].items():
            model_paper_avgs.append([model_name, paper, avg])

    # Construct final DataFrame for model-avg values
    final_df = pd.DataFrame(model_paper_avgs, columns=["Model", "Paper", "Avg"])

    # Pivot the final DataFrame to get the desired format: models x papers
    final_pivot_df = final_df.pivot(index="Model", columns="Paper", values="Avg")

    # Calculate the average accuracy for each model and add as a new column
    final_pivot_df['Avg'] = final_pivot_df.mean(axis=1)
    final_pivot_df = final_pivot_df.sort_values(by='Avg', ascending=False, axis=0)

    # Apply the highlighting function
    styled_df = final_pivot_df.style.apply(highlight_top_three, subset=list(papers) + ['Avg'])
    styled_df = highlight_row_name(styled_df, 'Avg')
    # Format the final DataFrame
    styled_df = styled_df.format('{:.2%}')

    # Set the caption for the final DataFrame
    styled_df = styled_df.set_caption(f'Average Accuracy for {paper_type}')

    tables.append(styled_df)

    return tables

def display_multi_table(table_list):
    """Render several tables in a two-column HTML layout.

    All tables except the last are stacked vertically (each in a shaded,
    bordered cell) in the left column; the last table sits alone in the
    right column.

    Args:
        table_list: Sequence of objects exposing ``to_html()``.

    Returns:
        An IPython ``HTML`` object wrapping the layout, or ``None`` when
        ``table_list`` is empty.
    """
    if not table_list:
        return None

    *stacked_tables, side_table = table_list

    # Left column: one shaded, bordered row per table so they are visually distinct.
    row_open = '<tr><td style="background-color: #f0f0f0; border: 1px solid #ddd;">'
    stacked_rows = []
    for table in stacked_tables:
        stacked_rows.append(row_open + table.to_html() + '</td></tr>')
    left_column = '<table style="border-spacing: 10px 0;">' + ''.join(stacked_rows) + '</table>'

    # Right column: the last table gets a cell of its own.
    right_column = '<td>' + side_table.to_html() + '</td>'

    # Assemble the outer one-row layout table.
    return HTML('<table><tr><td>' + left_column + '</td>' + right_column + '</tr></table>')

In [11]:
# Per-question accuracy over every record, regardless of prompt language.
# Layout: ans_data[paper][model][question_index] -> metrics dict.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name up to its first '.' is used as the question index.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            content = parse_data.read_jsonl(fpath)

            for record in content:
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            # An empty result file would raise ZeroDivisionError and abort the
            # whole cell; report NaN for the ratios instead.
            has_records = total_cnt > 0
            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_records else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_records else float('nan'),
            }

tables = generate_acc_table(ans_data, papers, 'ALL')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Paper,2024-1,2024-2,Avg,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
deepseek-coder,98.75%,32.50%,50.00%,52.50%,91.25%,76.25%,100.00%,63.75%,65.00%,23.75%,23.75%,28.75%,45.00%,1.25%,53.75%
qwen2-72b-instruct,98.75%,57.50%,58.75%,23.75%,97.50%,95.00%,86.25%,56.25%,70.00%,7.50%,36.25%,5.00%,18.75%,20.00%,52.23%
deepseek-chat,100.00%,27.50%,50.00%,52.50%,98.75%,86.25%,100.00%,38.75%,60.00%,28.75%,21.25%,22.50%,35.00%,2.50%,51.70%
gemini-1.5-pro-001,38.75%,42.50%,95.00%,26.25%,100.00%,77.50%,90.00%,62.50%,63.75%,25.00%,0.00%,7.50%,32.50%,2.50%,47.41%
gpt-4o-2024-05-13,97.50%,63.75%,56.25%,20.00%,65.00%,57.50%,92.50%,3.75%,42.50%,1.25%,56.25%,15.00%,36.25%,7.50%,43.93%
claude-3-5-sonnet-20240620,43.75%,93.75%,81.25%,28.75%,75.00%,95.00%,56.25%,18.75%,42.50%,13.75%,0.00%,10.00%,26.25%,3.75%,42.05%
baichuan4,57.50%,52.50%,70.00%,33.75%,60.00%,70.00%,67.50%,40.00%,38.75%,3.75%,13.75%,0.00%,13.75%,35.00%,39.73%
yi-large,56.25%,60.00%,75.00%,32.50%,62.50%,52.50%,67.50%,42.50%,45.00%,2.50%,26.25%,0.00%,18.75%,13.75%,39.64%
ERNIE-4.0-Turbo-8K,43.75%,36.25%,73.75%,25.00%,53.75%,88.75%,63.75%,17.50%,17.50%,3.75%,31.25%,3.75%,11.25%,0.00%,33.57%
llama3-70b-instruct,31.25%,30.00%,46.25%,11.25%,70.00%,47.50%,92.50%,36.25%,20.00%,21.25%,47.50%,0.00%,1.25%,10.00%,33.21%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
deepseek-coder,98.75%,32.50%,50.00%,52.50%,91.25%,76.25%,100.00%,63.75%,65.00%,23.75%,23.75%,28.75%,45.00%,1.25%,53.75%
qwen2-72b-instruct,98.75%,57.50%,58.75%,23.75%,97.50%,95.00%,86.25%,56.25%,70.00%,7.50%,36.25%,5.00%,18.75%,20.00%,52.23%
deepseek-chat,100.00%,27.50%,50.00%,52.50%,98.75%,86.25%,100.00%,38.75%,60.00%,28.75%,21.25%,22.50%,35.00%,2.50%,51.70%
gemini-1.5-pro-001,38.75%,42.50%,95.00%,26.25%,100.00%,77.50%,90.00%,62.50%,63.75%,25.00%,0.00%,7.50%,32.50%,2.50%,47.41%
gpt-4o-2024-05-13,97.50%,63.75%,56.25%,20.00%,65.00%,57.50%,92.50%,3.75%,42.50%,1.25%,56.25%,15.00%,36.25%,7.50%,43.93%
claude-3-5-sonnet-20240620,43.75%,93.75%,81.25%,28.75%,75.00%,95.00%,56.25%,18.75%,42.50%,13.75%,0.00%,10.00%,26.25%,3.75%,42.05%
baichuan4,57.50%,52.50%,70.00%,33.75%,60.00%,70.00%,67.50%,40.00%,38.75%,3.75%,13.75%,0.00%,13.75%,35.00%,39.73%
yi-large,56.25%,60.00%,75.00%,32.50%,62.50%,52.50%,67.50%,42.50%,45.00%,2.50%,26.25%,0.00%,18.75%,13.75%,39.64%
ERNIE-4.0-Turbo-8K,43.75%,36.25%,73.75%,25.00%,53.75%,88.75%,63.75%,17.50%,17.50%,3.75%,31.25%,3.75%,11.25%,0.00%,33.57%
llama3-70b-instruct,31.25%,30.00%,46.25%,11.25%,70.00%,47.50%,92.50%,36.25%,20.00%,21.25%,47.50%,0.00%,1.25%,10.00%,33.21%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
deepseek-coder,98.75%,32.50%,50.00%,52.50%,91.25%,76.25%,100.00%,63.75%,65.00%,23.75%,23.75%,28.75%,45.00%,1.25%,53.75%
qwen2-72b-instruct,98.75%,57.50%,58.75%,23.75%,97.50%,95.00%,86.25%,56.25%,70.00%,7.50%,36.25%,5.00%,18.75%,20.00%,52.23%
deepseek-chat,100.00%,27.50%,50.00%,52.50%,98.75%,86.25%,100.00%,38.75%,60.00%,28.75%,21.25%,22.50%,35.00%,2.50%,51.70%
gemini-1.5-pro-001,38.75%,42.50%,95.00%,26.25%,100.00%,77.50%,90.00%,62.50%,63.75%,25.00%,0.00%,7.50%,32.50%,2.50%,47.41%
gpt-4o-2024-05-13,97.50%,63.75%,56.25%,20.00%,65.00%,57.50%,92.50%,3.75%,42.50%,1.25%,56.25%,15.00%,36.25%,7.50%,43.93%
claude-3-5-sonnet-20240620,43.75%,93.75%,81.25%,28.75%,75.00%,95.00%,56.25%,18.75%,42.50%,13.75%,0.00%,10.00%,26.25%,3.75%,42.05%
baichuan4,57.50%,52.50%,70.00%,33.75%,60.00%,70.00%,67.50%,40.00%,38.75%,3.75%,13.75%,0.00%,13.75%,35.00%,39.73%
yi-large,56.25%,60.00%,75.00%,32.50%,62.50%,52.50%,67.50%,42.50%,45.00%,2.50%,26.25%,0.00%,18.75%,13.75%,39.64%
ERNIE-4.0-Turbo-8K,43.75%,36.25%,73.75%,25.00%,53.75%,88.75%,63.75%,17.50%,17.50%,3.75%,31.25%,3.75%,11.25%,0.00%,33.57%
llama3-70b-instruct,31.25%,30.00%,46.25%,11.25%,70.00%,47.50%,92.50%,36.25%,20.00%,21.25%,47.50%,0.00%,1.25%,10.00%,33.21%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
deepseek-chat,100.00%,98.75%,97.50%,42.50%,97.50%,10.00%,32.50%,28.75%,96.25%,2.50%,15.00%,50.00%,21.25%,0.00%,49.46%
gpt-4o-2024-05-13,100.00%,100.00%,57.50%,38.75%,67.50%,30.00%,18.75%,42.50%,100.00%,7.50%,11.25%,48.75%,37.50%,0.00%,47.14%
deepseek-coder,100.00%,61.25%,96.25%,51.25%,100.00%,26.25%,23.75%,22.50%,86.25%,0.00%,21.25%,48.75%,17.50%,0.00%,46.79%
claude-3-5-sonnet-20240620,100.00%,100.00%,48.75%,78.75%,45.00%,20.00%,27.50%,71.25%,100.00%,5.00%,0.00%,43.75%,7.50%,0.00%,46.25%
doubao-pro-32k-240615,100.00%,100.00%,33.75%,71.25%,98.75%,3.75%,1.25%,66.25%,48.75%,5.00%,11.25%,40.00%,31.25%,5.00%,44.02%
gemini-1.5-pro-001,100.00%,100.00%,42.50%,73.75%,46.25%,26.25%,1.25%,41.25%,98.75%,20.00%,7.50%,8.75%,21.25%,3.75%,42.23%
qwen2-72b-instruct,100.00%,98.75%,26.25%,60.00%,52.50%,10.00%,51.25%,33.75%,95.00%,5.00%,2.50%,31.25%,8.75%,6.25%,41.52%
qwen-max,100.00%,100.00%,40.00%,98.75%,98.75%,1.25%,0.00%,12.50%,83.75%,5.00%,0.00%,15.00%,11.25%,1.25%,40.54%
moonshot-v1-8k,91.25%,51.25%,22.50%,67.50%,82.50%,6.25%,43.75%,26.25%,96.25%,3.75%,2.50%,21.25%,15.00%,0.00%,37.86%
ERNIE-4.0-Turbo-8K,91.25%,91.25%,30.00%,8.75%,62.50%,18.75%,17.50%,60.00%,80.00%,5.00%,1.25%,27.50%,16.25%,0.00%,36.43%

Paper,2024-1,2024-2,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,51.70%,49.46%,50.58%
deepseek-coder,53.75%,46.79%,50.27%
qwen2-72b-instruct,52.23%,41.52%,46.88%
gpt-4o-2024-05-13,43.93%,47.14%,45.54%
gemini-1.5-pro-001,47.41%,42.23%,44.82%
claude-3-5-sonnet-20240620,42.05%,46.25%,44.15%
Avg,38.42%,38.85%,38.63%
doubao-pro-32k-240615,30.54%,44.02%,37.28%
baichuan4,39.73%,34.71%,37.22%
qwen-max,32.14%,40.54%,36.34%


In [12]:
# Per-question accuracy restricted to records whose prompt language is 'en'.
# Layout: ans_data[paper][model][question_index] -> metrics dict.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name up to its first '.' is used as the question index.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            content = parse_data.read_jsonl(fpath)

            for record in content:
                # Skip every record that was not prompted in English.
                if record['prompt_parameter']['lang'] != 'en':
                    continue
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            # A file with no English records would raise ZeroDivisionError and
            # abort the whole cell; report NaN for the ratios instead.
            has_records = total_cnt > 0
            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_records else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_records else float('nan'),
            }

tables = generate_acc_table(ans_data, papers, 'EN')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Paper,2024-1,2024-2,Avg,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
qwen2-72b-instruct,100.00%,32.50%,67.50%,25.00%,100.00%,92.50%,77.50%,77.50%,70.00%,5.00%,57.50%,7.50%,0.00%,17.50%,52.14%
deepseek-chat,100.00%,12.50%,50.00%,45.00%,100.00%,72.50%,100.00%,32.50%,45.00%,15.00%,30.00%,5.00%,30.00%,5.00%,45.89%
gemini-1.5-pro-001,37.50%,45.00%,92.50%,27.50%,100.00%,70.00%,85.00%,65.00%,50.00%,25.00%,0.00%,2.50%,32.50%,2.50%,45.36%
deepseek-coder,100.00%,20.00%,50.00%,47.50%,90.00%,57.50%,100.00%,42.50%,32.50%,2.50%,17.50%,22.50%,50.00%,2.50%,45.36%
gpt-4o-2024-05-13,100.00%,55.00%,55.00%,25.00%,72.50%,55.00%,97.50%,0.00%,40.00%,0.00%,60.00%,20.00%,35.00%,10.00%,44.64%
claude-3-5-sonnet-20240620,37.50%,95.00%,90.00%,25.00%,75.00%,100.00%,27.50%,17.50%,57.50%,20.00%,0.00%,2.50%,27.50%,0.00%,41.07%
baichuan4,65.00%,52.50%,72.50%,30.00%,85.00%,72.50%,47.50%,32.50%,42.50%,0.00%,15.00%,0.00%,15.00%,37.50%,40.54%
yi-large,50.00%,60.00%,75.00%,32.50%,57.50%,42.50%,55.00%,35.00%,55.00%,5.00%,30.00%,0.00%,15.00%,10.00%,37.32%
ERNIE-4.0-Turbo-8K,40.00%,35.00%,57.50%,22.50%,57.50%,90.00%,82.50%,22.50%,25.00%,7.50%,35.00%,2.50%,15.00%,0.00%,35.18%
llama3-70b-instruct,42.50%,32.50%,45.00%,10.00%,72.50%,50.00%,87.50%,25.00%,7.50%,27.50%,67.50%,0.00%,2.50%,0.00%,33.57%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
qwen2-72b-instruct,100.00%,32.50%,67.50%,25.00%,100.00%,92.50%,77.50%,77.50%,70.00%,5.00%,57.50%,7.50%,0.00%,17.50%,52.14%
deepseek-chat,100.00%,12.50%,50.00%,45.00%,100.00%,72.50%,100.00%,32.50%,45.00%,15.00%,30.00%,5.00%,30.00%,5.00%,45.89%
gemini-1.5-pro-001,37.50%,45.00%,92.50%,27.50%,100.00%,70.00%,85.00%,65.00%,50.00%,25.00%,0.00%,2.50%,32.50%,2.50%,45.36%
deepseek-coder,100.00%,20.00%,50.00%,47.50%,90.00%,57.50%,100.00%,42.50%,32.50%,2.50%,17.50%,22.50%,50.00%,2.50%,45.36%
gpt-4o-2024-05-13,100.00%,55.00%,55.00%,25.00%,72.50%,55.00%,97.50%,0.00%,40.00%,0.00%,60.00%,20.00%,35.00%,10.00%,44.64%
claude-3-5-sonnet-20240620,37.50%,95.00%,90.00%,25.00%,75.00%,100.00%,27.50%,17.50%,57.50%,20.00%,0.00%,2.50%,27.50%,0.00%,41.07%
baichuan4,65.00%,52.50%,72.50%,30.00%,85.00%,72.50%,47.50%,32.50%,42.50%,0.00%,15.00%,0.00%,15.00%,37.50%,40.54%
yi-large,50.00%,60.00%,75.00%,32.50%,57.50%,42.50%,55.00%,35.00%,55.00%,5.00%,30.00%,0.00%,15.00%,10.00%,37.32%
ERNIE-4.0-Turbo-8K,40.00%,35.00%,57.50%,22.50%,57.50%,90.00%,82.50%,22.50%,25.00%,7.50%,35.00%,2.50%,15.00%,0.00%,35.18%
llama3-70b-instruct,42.50%,32.50%,45.00%,10.00%,72.50%,50.00%,87.50%,25.00%,7.50%,27.50%,67.50%,0.00%,2.50%,0.00%,33.57%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
qwen2-72b-instruct,100.00%,32.50%,67.50%,25.00%,100.00%,92.50%,77.50%,77.50%,70.00%,5.00%,57.50%,7.50%,0.00%,17.50%,52.14%
deepseek-chat,100.00%,12.50%,50.00%,45.00%,100.00%,72.50%,100.00%,32.50%,45.00%,15.00%,30.00%,5.00%,30.00%,5.00%,45.89%
gemini-1.5-pro-001,37.50%,45.00%,92.50%,27.50%,100.00%,70.00%,85.00%,65.00%,50.00%,25.00%,0.00%,2.50%,32.50%,2.50%,45.36%
deepseek-coder,100.00%,20.00%,50.00%,47.50%,90.00%,57.50%,100.00%,42.50%,32.50%,2.50%,17.50%,22.50%,50.00%,2.50%,45.36%
gpt-4o-2024-05-13,100.00%,55.00%,55.00%,25.00%,72.50%,55.00%,97.50%,0.00%,40.00%,0.00%,60.00%,20.00%,35.00%,10.00%,44.64%
claude-3-5-sonnet-20240620,37.50%,95.00%,90.00%,25.00%,75.00%,100.00%,27.50%,17.50%,57.50%,20.00%,0.00%,2.50%,27.50%,0.00%,41.07%
baichuan4,65.00%,52.50%,72.50%,30.00%,85.00%,72.50%,47.50%,32.50%,42.50%,0.00%,15.00%,0.00%,15.00%,37.50%,40.54%
yi-large,50.00%,60.00%,75.00%,32.50%,57.50%,42.50%,55.00%,35.00%,55.00%,5.00%,30.00%,0.00%,15.00%,10.00%,37.32%
ERNIE-4.0-Turbo-8K,40.00%,35.00%,57.50%,22.50%,57.50%,90.00%,82.50%,22.50%,25.00%,7.50%,35.00%,2.50%,15.00%,0.00%,35.18%
llama3-70b-instruct,42.50%,32.50%,45.00%,10.00%,72.50%,50.00%,87.50%,25.00%,7.50%,27.50%,67.50%,0.00%,2.50%,0.00%,33.57%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
deepseek-chat,100.00%,100.00%,95.00%,2.50%,95.00%,7.50%,32.50%,32.50%,92.50%,2.50%,15.00%,50.00%,25.00%,0.00%,46.43%
gpt-4o-2024-05-13,100.00%,100.00%,62.50%,17.50%,60.00%,25.00%,22.50%,52.50%,100.00%,0.00%,12.50%,50.00%,37.50%,0.00%,45.71%
claude-3-5-sonnet-20240620,100.00%,100.00%,50.00%,60.00%,55.00%,17.50%,35.00%,70.00%,100.00%,5.00%,0.00%,47.50%,0.00%,0.00%,45.71%
doubao-pro-32k-240615,100.00%,100.00%,30.00%,62.50%,97.50%,0.00%,2.50%,77.50%,47.50%,2.50%,10.00%,40.00%,40.00%,7.50%,44.11%
gemini-1.5-pro-001,100.00%,100.00%,40.00%,55.00%,47.50%,35.00%,2.50%,30.00%,97.50%,22.50%,7.50%,17.50%,32.50%,2.50%,42.14%
deepseek-coder,100.00%,80.00%,97.50%,12.50%,100.00%,10.00%,27.50%,30.00%,72.50%,0.00%,2.50%,50.00%,5.00%,0.00%,41.96%
qwen-max,100.00%,100.00%,55.00%,97.50%,97.50%,2.50%,0.00%,10.00%,85.00%,0.00%,0.00%,17.50%,7.50%,0.00%,40.89%
qwen2-72b-instruct,100.00%,100.00%,25.00%,45.00%,42.50%,7.50%,55.00%,25.00%,97.50%,5.00%,5.00%,32.50%,15.00%,12.50%,40.54%
ERNIE-4.0-Turbo-8K,97.50%,90.00%,37.50%,0.00%,77.50%,20.00%,20.00%,52.50%,80.00%,5.00%,2.50%,50.00%,30.00%,0.00%,40.18%
moonshot-v1-8k,100.00%,50.00%,20.00%,60.00%,77.50%,12.50%,40.00%,17.50%,100.00%,2.50%,0.00%,17.50%,15.00%,0.00%,36.61%

Paper,2024-1,2024-2,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qwen2-72b-instruct,52.14%,40.54%,46.34%
deepseek-chat,45.89%,46.43%,46.16%
gpt-4o-2024-05-13,44.64%,45.71%,45.18%
gemini-1.5-pro-001,45.36%,42.14%,43.75%
deepseek-coder,45.36%,41.96%,43.66%
claude-3-5-sonnet-20240620,41.07%,45.71%,43.39%
ERNIE-4.0-Turbo-8K,35.18%,40.18%,37.68%
Avg,36.90%,38.12%,37.51%
baichuan4,40.54%,30.96%,35.75%
qwen-max,30.18%,40.89%,35.54%


In [13]:
# Per-question accuracy restricted to records whose prompt language is 'zh'.
# Layout: ans_data[paper][model][question_index] -> metrics dict.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name up to its first '.' is used as the question index.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            content = parse_data.read_jsonl(fpath)

            for record in content:
                # Skip every record that was not prompted in Chinese.
                if record['prompt_parameter']['lang'] != 'zh':
                    continue
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            # A file with no Chinese records would raise ZeroDivisionError and
            # abort the whole cell; report NaN for the ratios instead.
            has_records = total_cnt > 0
            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_records else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_records else float('nan'),
            }

tables = generate_acc_table(ans_data, papers, 'ZH')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Paper,2024-1,2024-2,Avg,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5
deepseek-coder,97.50%,45.00%,50.00%,57.50%,92.50%,95.00%,100.00%,85.00%,97.50%,45.00%,30.00%,35.00%,40.00%,0.00%,62.14%
deepseek-chat,100.00%,42.50%,50.00%,60.00%,97.50%,100.00%,100.00%,45.00%,75.00%,42.50%,12.50%,40.00%,40.00%,0.00%,57.50%
qwen2-72b-instruct,97.50%,82.50%,50.00%,22.50%,95.00%,97.50%,95.00%,35.00%,70.00%,10.00%,15.00%,2.50%,37.50%,22.50%,52.32%
gemini-1.5-pro-001,40.00%,40.00%,97.50%,25.00%,100.00%,85.00%,95.00%,60.00%,77.50%,25.00%,0.00%,12.50%,32.50%,2.50%,49.46%
gpt-4o-2024-05-13,95.00%,72.50%,57.50%,15.00%,57.50%,60.00%,87.50%,7.50%,45.00%,2.50%,52.50%,10.00%,37.50%,5.00%,43.21%
claude-3-5-sonnet-20240620,50.00%,92.50%,72.50%,32.50%,75.00%,90.00%,85.00%,20.00%,27.50%,7.50%,0.00%,17.50%,25.00%,7.50%,43.04%
yi-large,62.50%,60.00%,75.00%,32.50%,67.50%,62.50%,80.00%,50.00%,35.00%,0.00%,22.50%,0.00%,22.50%,17.50%,41.96%
baichuan4,50.00%,52.50%,67.50%,37.50%,35.00%,67.50%,87.50%,47.50%,35.00%,7.50%,12.50%,0.00%,12.50%,32.50%,38.93%
doubao-pro-32k-240615,45.00%,47.50%,75.00%,32.50%,55.00%,95.00%,97.50%,2.50%,5.00%,2.50%,0.00%,45.00%,20.00%,0.00%,37.32%
qwen-max,25.00%,35.00%,57.50%,12.50%,82.50%,77.50%,77.50%,52.50%,37.50%,0.00%,17.50%,2.50%,0.00%,0.00%,34.11%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
deepseek-coder,97.50%,45.00%,50.00%,57.50%,92.50%,95.00%,100.00%,85.00%,97.50%,45.00%,30.00%,35.00%,40.00%,0.00%,62.14%
deepseek-chat,100.00%,42.50%,50.00%,60.00%,97.50%,100.00%,100.00%,45.00%,75.00%,42.50%,12.50%,40.00%,40.00%,0.00%,57.50%
qwen2-72b-instruct,97.50%,82.50%,50.00%,22.50%,95.00%,97.50%,95.00%,35.00%,70.00%,10.00%,15.00%,2.50%,37.50%,22.50%,52.32%
gemini-1.5-pro-001,40.00%,40.00%,97.50%,25.00%,100.00%,85.00%,95.00%,60.00%,77.50%,25.00%,0.00%,12.50%,32.50%,2.50%,49.46%
gpt-4o-2024-05-13,95.00%,72.50%,57.50%,15.00%,57.50%,60.00%,87.50%,7.50%,45.00%,2.50%,52.50%,10.00%,37.50%,5.00%,43.21%
claude-3-5-sonnet-20240620,50.00%,92.50%,72.50%,32.50%,75.00%,90.00%,85.00%,20.00%,27.50%,7.50%,0.00%,17.50%,25.00%,7.50%,43.04%
yi-large,62.50%,60.00%,75.00%,32.50%,67.50%,62.50%,80.00%,50.00%,35.00%,0.00%,22.50%,0.00%,22.50%,17.50%,41.96%
baichuan4,50.00%,52.50%,67.50%,37.50%,35.00%,67.50%,87.50%,47.50%,35.00%,7.50%,12.50%,0.00%,12.50%,32.50%,38.93%
doubao-pro-32k-240615,45.00%,47.50%,75.00%,32.50%,55.00%,95.00%,97.50%,2.50%,5.00%,2.50%,0.00%,45.00%,20.00%,0.00%,37.32%
qwen-max,25.00%,35.00%,57.50%,12.50%,82.50%,77.50%,77.50%,52.50%,37.50%,0.00%,17.50%,2.50%,0.00%,0.00%,34.11%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
deepseek-coder,97.50%,45.00%,50.00%,57.50%,92.50%,95.00%,100.00%,85.00%,97.50%,45.00%,30.00%,35.00%,40.00%,0.00%,62.14%
deepseek-chat,100.00%,42.50%,50.00%,60.00%,97.50%,100.00%,100.00%,45.00%,75.00%,42.50%,12.50%,40.00%,40.00%,0.00%,57.50%
qwen2-72b-instruct,97.50%,82.50%,50.00%,22.50%,95.00%,97.50%,95.00%,35.00%,70.00%,10.00%,15.00%,2.50%,37.50%,22.50%,52.32%
gemini-1.5-pro-001,40.00%,40.00%,97.50%,25.00%,100.00%,85.00%,95.00%,60.00%,77.50%,25.00%,0.00%,12.50%,32.50%,2.50%,49.46%
gpt-4o-2024-05-13,95.00%,72.50%,57.50%,15.00%,57.50%,60.00%,87.50%,7.50%,45.00%,2.50%,52.50%,10.00%,37.50%,5.00%,43.21%
claude-3-5-sonnet-20240620,50.00%,92.50%,72.50%,32.50%,75.00%,90.00%,85.00%,20.00%,27.50%,7.50%,0.00%,17.50%,25.00%,7.50%,43.04%
yi-large,62.50%,60.00%,75.00%,32.50%,67.50%,62.50%,80.00%,50.00%,35.00%,0.00%,22.50%,0.00%,22.50%,17.50%,41.96%
baichuan4,50.00%,52.50%,67.50%,37.50%,35.00%,67.50%,87.50%,47.50%,35.00%,7.50%,12.50%,0.00%,12.50%,32.50%,38.93%
doubao-pro-32k-240615,45.00%,47.50%,75.00%,32.50%,55.00%,95.00%,97.50%,2.50%,5.00%,2.50%,0.00%,45.00%,20.00%,0.00%,37.32%
qwen-max,25.00%,35.00%,57.50%,12.50%,82.50%,77.50%,77.50%,52.50%,37.50%,0.00%,17.50%,2.50%,0.00%,0.00%,34.11%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
deepseek-chat,100.00%,97.50%,100.00%,82.50%,100.00%,12.50%,32.50%,25.00%,100.00%,2.50%,15.00%,50.00%,17.50%,0.00%,52.50%
deepseek-coder,100.00%,42.50%,95.00%,90.00%,100.00%,42.50%,20.00%,15.00%,100.00%,0.00%,40.00%,47.50%,30.00%,0.00%,51.61%
gpt-4o-2024-05-13,100.00%,100.00%,52.50%,60.00%,75.00%,35.00%,15.00%,32.50%,100.00%,15.00%,10.00%,47.50%,37.50%,0.00%,48.57%
claude-3-5-sonnet-20240620,100.00%,100.00%,47.50%,97.50%,35.00%,22.50%,20.00%,72.50%,100.00%,5.00%,0.00%,40.00%,15.00%,0.00%,46.79%
doubao-pro-32k-240615,100.00%,100.00%,37.50%,80.00%,100.00%,7.50%,0.00%,55.00%,50.00%,7.50%,12.50%,40.00%,22.50%,2.50%,43.93%
qwen2-72b-instruct,100.00%,97.50%,27.50%,75.00%,62.50%,12.50%,47.50%,42.50%,92.50%,5.00%,0.00%,30.00%,2.50%,0.00%,42.50%
gemini-1.5-pro-001,100.00%,100.00%,45.00%,92.50%,45.00%,17.50%,0.00%,52.50%,100.00%,17.50%,7.50%,0.00%,10.00%,5.00%,42.32%
qwen-max,100.00%,100.00%,25.00%,100.00%,100.00%,0.00%,0.00%,15.00%,82.50%,10.00%,0.00%,12.50%,15.00%,2.50%,40.18%
moonshot-v1-8k,82.50%,52.50%,25.00%,75.00%,87.50%,0.00%,47.50%,35.00%,92.50%,5.00%,5.00%,25.00%,15.00%,0.00%,39.11%
baichuan4,92.50%,57.50%,50.00%,85.00%,57.50%,25.00%,0.00%,30.00%,72.50%,5.00%,0.00%,nan%,25.00%,0.00%,38.46%

Paper,2024-1,2024-2,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,62.14%,51.61%,56.88%
deepseek-chat,57.50%,52.50%,55.00%
qwen2-72b-instruct,52.32%,42.50%,47.41%
gemini-1.5-pro-001,49.46%,42.32%,45.89%
gpt-4o-2024-05-13,43.21%,48.57%,45.89%
claude-3-5-sonnet-20240620,43.04%,46.79%,44.91%
doubao-pro-32k-240615,37.32%,43.93%,40.63%
Avg,39.93%,39.58%,39.76%
baichuan4,38.93%,38.46%,38.70%
qwen-max,34.11%,40.18%,37.14%


## Temperature

```json
ans_data = {
    "paper": 
    {
        "model_name": {
            "temperature":{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

temperature values: [0.1, 0.3, 0.5, 0.7, 0.9]

In [14]:
# Sampling temperatures to report on; the aggregation cell further down compares
# each record's llm_parameter['temperature'] against these values.
values = [0.1, 0.3, 0.5, 0.7, 0.9]

In [15]:
def generate_table(ans_data, papers, paper_type):

    def highlight_max_in_row(df):
        styles = pd.DataFrame('', index=df.index, columns=df.columns)
        for row in df.index:  
            # ignore the Total/Average column
            max_col = df.loc[row][:-1].idxmax()
            styles.loc[row, max_col] = 'text-decoration: underline'
        return styles

    tables = []

    # 用于收集所有模型在所有paper中的精度数据
    all_data = []

    for paper in papers:
        data = []
        for model_name, questions in ans_data[paper].items():
            for temp_value, metrics in questions.items():
                question_num = round(temp_value, 2)
                data.append([model_name, question_num, metrics["accuracy"]])

        df = pd.DataFrame(data, columns=["Model", "Temperature Value", "Accuracy"])
        pivot_df = df.pivot(index="Model", columns="Temperature Value", values="Accuracy")
        pivot_df['Avg'] = pivot_df.mean(axis=1)
        pivot_df = pivot_df.sort_values(by='Avg', ascending=False)
        pivot_df.loc['Avg'] = pivot_df.mean()
        styled_df = pivot_df.style.apply(highlight_max_in_row, axis=None).apply(highlight_top_three, subset=pivot_df.columns.drop('Avg').tolist() + ['Avg'])
        styled_df = styled_df.set_caption(f'{paper} Question Accuracy: {paper_type}')
        styled_df = styled_df.format('{:.2%}')
        tables.append(styled_df)
        
        # 为最终的汇总表收集数据
        for index, row in pivot_df.iterrows():
            for col in pivot_df.columns.drop('Avg'):
                all_data.append([index, col, row[col]])

    # 构建一个包含所有数据的DataFrame
    all_df = pd.DataFrame(all_data, columns=["Model", "Temperature Value", "Accuracy"])

    # 对所有paper的数据进行平均计算
    final_avg_df = all_df.groupby(["Model", "Temperature Value"]).mean().reset_index()

    # 转换格式以方便展示
    final_pivot_df = final_avg_df.pivot(index="Model", columns="Temperature Value", values="Accuracy")
    final_pivot_df['Avg'] = final_pivot_df.mean(axis=1)
    final_pivot_df.loc['Avg'] = final_pivot_df.mean()
    final_pivot_df = final_pivot_df.sort_values(by='Avg', ascending=False)

    styled_final_df = final_pivot_df.style.apply(highlight_max_in_row, axis=None).apply(highlight_top_three, subset=final_pivot_df.columns.drop('Avg').tolist() + ['Avg'])
    styled_final_df = highlight_row_name(styled_final_df, 'Avg')
    styled_final_df = styled_final_df.set_caption(f'All Questions\' Accuracy: {paper_type}')
    styled_final_df = styled_final_df.format('{:.2%}')

    tables.append(styled_final_df)

    return tables


def display_multi_table(table_list):
    """Place several rendered tables side by side in one HTML table row.

    Args:
        table_list: Iterable of objects exposing ``_repr_html_()`` (in this
            notebook, the pandas Styler objects returned by ``generate_table``).

    Returns:
        An ``HTML`` display object wrapping a single-row ``<table>`` whose
        cells contain the HTML of each input table, in the given order.
    """
    cells = []
    for styled_table in table_list:
        cells.append('<td>' + styled_table._repr_html_() + '</td>')

    row_open = '<table><tr style="background-color:white;">'
    row_close = '</tr></table>'
    return HTML(row_open + ''.join(cells) + row_close)


In [16]:
# Accuracy per paper / model / temperature over all prompt languages.
# `values` (the temperature settings) and `generate_table` come from earlier
# cells of this section.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per temperature instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for temp in values:
            matched = [rec for rec in records
                       if rec['llm_parameter']['temperature'] == temp]

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][temp] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'ALL')
display_multi_table(tables)

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
deepseek-coder,56.25%,54.91%,52.68%,53.57%,51.34%,53.75%
qwen2-72b-instruct,53.57%,54.91%,53.57%,48.21%,50.89%,52.23%
deepseek-chat,52.68%,52.23%,50.89%,51.34%,51.34%,51.70%
gemini-1.5-pro-001,46.43%,48.66%,50.00%,43.75%,48.21%,47.41%
gpt-4o-2024-05-13,45.09%,44.20%,44.64%,44.20%,41.52%,43.93%
claude-3-5-sonnet-20240620,43.30%,42.41%,40.18%,39.73%,44.64%,42.05%
baichuan4,41.52%,43.30%,40.18%,35.27%,38.39%,39.73%
yi-large,44.64%,46.88%,37.95%,37.05%,31.70%,39.64%
ERNIE-4.0-Turbo-8K,31.70%,31.25%,35.27%,35.27%,34.38%,33.57%
llama3-70b-instruct,32.59%,35.71%,33.04%,34.82%,29.91%,33.21%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-coder,56.25%,54.91%,52.68%,53.57%,51.34%,53.75%
qwen2-72b-instruct,53.57%,54.91%,53.57%,48.21%,50.89%,52.23%
deepseek-chat,52.68%,52.23%,50.89%,51.34%,51.34%,51.70%
gemini-1.5-pro-001,46.43%,48.66%,50.00%,43.75%,48.21%,47.41%
gpt-4o-2024-05-13,45.09%,44.20%,44.64%,44.20%,41.52%,43.93%
claude-3-5-sonnet-20240620,43.30%,42.41%,40.18%,39.73%,44.64%,42.05%
baichuan4,41.52%,43.30%,40.18%,35.27%,38.39%,39.73%
yi-large,44.64%,46.88%,37.95%,37.05%,31.70%,39.64%
ERNIE-4.0-Turbo-8K,31.70%,31.25%,35.27%,35.27%,34.38%,33.57%
llama3-70b-instruct,32.59%,35.71%,33.04%,34.82%,29.91%,33.21%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-chat,50.00%,49.55%,49.55%,50.45%,47.77%,49.46%
gpt-4o-2024-05-13,46.43%,45.09%,49.11%,46.88%,48.21%,47.14%
deepseek-coder,46.88%,45.54%,48.21%,49.55%,43.75%,46.79%
claude-3-5-sonnet-20240620,46.43%,48.21%,45.09%,45.54%,45.98%,46.25%
doubao-pro-32k-240615,43.75%,41.96%,44.20%,45.54%,44.64%,44.02%
gemini-1.5-pro-001,42.41%,41.07%,43.75%,40.18%,43.75%,42.23%
qwen2-72b-instruct,42.41%,41.52%,43.30%,38.84%,41.52%,41.52%
qwen-max,39.73%,41.96%,38.84%,40.18%,41.96%,40.53%
moonshot-v1-8k,37.50%,38.39%,36.61%,35.71%,41.07%,37.86%
ERNIE-4.0-Turbo-8K,37.05%,33.48%,39.29%,33.04%,39.29%,36.43%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-chat,51.34%,50.89%,50.22%,50.90%,49.55%,50.58%
deepseek-coder,51.56%,50.23%,50.45%,51.56%,47.55%,50.27%
qwen2-72b-instruct,47.99%,48.22%,48.43%,43.53%,46.20%,46.87%
gpt-4o-2024-05-13,45.76%,44.65%,46.88%,45.54%,44.87%,45.54%
gemini-1.5-pro-001,44.42%,44.87%,46.88%,41.96%,45.98%,44.82%
claude-3-5-sonnet-20240620,44.87%,45.31%,42.63%,42.63%,45.31%,44.15%
Avg,38.86%,39.39%,38.78%,38.11%,38.03%,38.63%
doubao-pro-32k-240615,36.60%,37.05%,35.71%,38.40%,38.62%,37.28%
baichuan4,36.87%,38.96%,37.64%,35.43%,37.22%,37.22%
qwen-max,35.27%,37.94%,34.60%,37.28%,36.60%,36.34%


In [17]:
# Accuracy per paper / model / temperature, restricted to English prompts.
# `values` (the temperature settings) and `generate_table` come from earlier
# cells of this section.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per temperature instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for temp in values:
            matched = [rec for rec in records
                       if rec['llm_parameter']['temperature'] == temp
                       and rec['prompt_parameter']['lang'] == 'en']

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][temp] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'EN')
display_multi_table(tables)

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
qwen2-72b-instruct,53.57%,51.79%,54.46%,48.21%,52.68%,52.14%
deepseek-chat,43.75%,46.43%,44.64%,42.86%,51.79%,45.89%
deepseek-coder,46.43%,45.54%,45.54%,43.75%,45.54%,45.36%
gemini-1.5-pro-001,42.86%,47.32%,46.43%,43.75%,46.43%,45.36%
gpt-4o-2024-05-13,45.54%,44.64%,44.64%,45.54%,42.86%,44.64%
claude-3-5-sonnet-20240620,43.75%,41.96%,39.29%,36.61%,43.75%,41.07%
baichuan4,42.86%,46.43%,41.96%,33.93%,37.50%,40.54%
yi-large,42.86%,45.54%,33.04%,34.82%,30.36%,37.32%
ERNIE-4.0-Turbo-8K,34.82%,31.25%,37.50%,35.71%,36.61%,35.18%
llama3-70b-instruct,33.04%,35.71%,30.36%,38.39%,30.36%,33.57%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
qwen2-72b-instruct,53.57%,51.79%,54.46%,48.21%,52.68%,52.14%
deepseek-chat,43.75%,46.43%,44.64%,42.86%,51.79%,45.89%
deepseek-coder,46.43%,45.54%,45.54%,43.75%,45.54%,45.36%
gemini-1.5-pro-001,42.86%,47.32%,46.43%,43.75%,46.43%,45.36%
gpt-4o-2024-05-13,45.54%,44.64%,44.64%,45.54%,42.86%,44.64%
claude-3-5-sonnet-20240620,43.75%,41.96%,39.29%,36.61%,43.75%,41.07%
baichuan4,42.86%,46.43%,41.96%,33.93%,37.50%,40.54%
yi-large,42.86%,45.54%,33.04%,34.82%,30.36%,37.32%
ERNIE-4.0-Turbo-8K,34.82%,31.25%,37.50%,35.71%,36.61%,35.18%
llama3-70b-instruct,33.04%,35.71%,30.36%,38.39%,30.36%,33.57%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-chat,45.54%,48.21%,48.21%,45.54%,44.64%,46.43%
claude-3-5-sonnet-20240620,44.64%,46.43%,42.86%,49.11%,45.54%,45.72%
gpt-4o-2024-05-13,42.86%,47.32%,48.21%,42.86%,47.32%,45.71%
doubao-pro-32k-240615,45.54%,41.96%,45.54%,42.86%,44.64%,44.11%
gemini-1.5-pro-001,40.18%,41.96%,42.86%,39.29%,46.43%,42.14%
deepseek-coder,41.07%,41.07%,42.86%,43.75%,41.07%,41.96%
qwen-max,39.29%,41.96%,39.29%,41.07%,42.86%,40.89%
qwen2-72b-instruct,39.29%,42.86%,43.75%,36.61%,40.18%,40.54%
ERNIE-4.0-Turbo-8K,41.07%,39.29%,41.96%,37.50%,41.07%,40.18%
moonshot-v1-8k,33.04%,39.29%,37.50%,33.04%,40.18%,36.61%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
qwen2-72b-instruct,46.43%,47.33%,49.10%,42.41%,46.43%,46.34%
deepseek-chat,44.65%,47.32%,46.42%,44.20%,48.22%,46.16%
gpt-4o-2024-05-13,44.20%,45.98%,46.42%,44.20%,45.09%,45.18%
gemini-1.5-pro-001,41.52%,44.64%,44.65%,41.52%,46.43%,43.75%
deepseek-coder,43.75%,43.31%,44.20%,43.75%,43.31%,43.66%
claude-3-5-sonnet-20240620,44.20%,44.19%,41.08%,42.86%,44.65%,43.39%
ERNIE-4.0-Turbo-8K,37.95%,35.27%,39.73%,36.60%,38.84%,37.68%
Avg,37.08%,38.38%,37.76%,36.68%,37.63%,37.51%
baichuan4,34.41%,38.60%,36.36%,33.31%,36.06%,35.75%
qwen-max,34.83%,37.94%,33.93%,35.27%,35.71%,35.54%


In [18]:
# Accuracy per paper / model / temperature, restricted to Chinese prompts.
# `values` (the temperature settings) and `generate_table` come from earlier
# cells of this section.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per temperature instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for temp in values:
            matched = [rec for rec in records
                       if rec['llm_parameter']['temperature'] == temp
                       and rec['prompt_parameter']['lang'] == 'zh']

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][temp] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'ZH')
display_multi_table(tables)

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
deepseek-coder,66.07%,64.29%,59.82%,63.39%,57.14%,62.14%
deepseek-chat,61.61%,58.04%,57.14%,59.82%,50.89%,57.50%
qwen2-72b-instruct,53.57%,58.04%,52.68%,48.21%,49.11%,52.32%
gemini-1.5-pro-001,50.00%,50.00%,53.57%,43.75%,50.00%,49.46%
gpt-4o-2024-05-13,44.64%,43.75%,44.64%,42.86%,40.18%,43.21%
claude-3-5-sonnet-20240620,42.86%,42.86%,41.07%,42.86%,45.54%,43.04%
yi-large,46.43%,48.21%,42.86%,39.29%,33.04%,41.97%
baichuan4,40.18%,40.18%,38.39%,36.61%,39.29%,38.93%
doubao-pro-32k-240615,37.50%,41.96%,35.71%,34.82%,36.61%,37.32%
qwen-max,31.25%,33.93%,32.14%,39.29%,33.93%,34.11%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-coder,66.07%,64.29%,59.82%,63.39%,57.14%,62.14%
deepseek-chat,61.61%,58.04%,57.14%,59.82%,50.89%,57.50%
qwen2-72b-instruct,53.57%,58.04%,52.68%,48.21%,49.11%,52.32%
gemini-1.5-pro-001,50.00%,50.00%,53.57%,43.75%,50.00%,49.46%
gpt-4o-2024-05-13,44.64%,43.75%,44.64%,42.86%,40.18%,43.21%
claude-3-5-sonnet-20240620,42.86%,42.86%,41.07%,42.86%,45.54%,43.04%
yi-large,46.43%,48.21%,42.86%,39.29%,33.04%,41.97%
baichuan4,40.18%,40.18%,38.39%,36.61%,39.29%,38.93%
doubao-pro-32k-240615,37.50%,41.96%,35.71%,34.82%,36.61%,37.32%
qwen-max,31.25%,33.93%,32.14%,39.29%,33.93%,34.11%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-chat,54.46%,50.89%,50.89%,55.36%,50.89%,52.50%
deepseek-coder,52.68%,50.00%,53.57%,55.36%,46.43%,51.61%
gpt-4o-2024-05-13,50.00%,42.86%,50.00%,50.89%,49.11%,48.57%
claude-3-5-sonnet-20240620,48.21%,50.00%,47.32%,41.96%,46.43%,46.78%
doubao-pro-32k-240615,41.96%,41.96%,42.86%,48.21%,44.64%,43.93%
qwen2-72b-instruct,45.54%,40.18%,42.86%,41.07%,42.86%,42.50%
gemini-1.5-pro-001,44.64%,40.18%,44.64%,41.07%,41.07%,42.32%
qwen-max,40.18%,41.96%,38.39%,39.29%,41.07%,40.18%
moonshot-v1-8k,41.96%,37.50%,35.71%,38.39%,41.96%,39.10%
baichuan4,38.46%,38.46%,39.42%,38.46%,37.50%,38.46%

Temperature Value,0.100000,0.300000,0.500000,0.700000,0.900000,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
deepseek-coder,59.38%,57.15%,56.69%,59.38%,51.79%,56.88%
deepseek-chat,58.03%,54.47%,54.02%,57.59%,50.89%,55.00%
qwen2-72b-instruct,49.55%,49.11%,47.77%,44.64%,45.98%,47.41%
gpt-4o-2024-05-13,47.32%,43.30%,47.32%,46.88%,44.65%,45.89%
gemini-1.5-pro-001,47.32%,45.09%,49.10%,42.41%,45.54%,45.89%
claude-3-5-sonnet-20240620,45.53%,46.43%,44.20%,42.41%,45.98%,44.91%
doubao-pro-32k-240615,39.73%,41.96%,39.28%,41.52%,40.62%,40.62%
Avg,40.63%,40.40%,39.79%,39.54%,38.42%,39.76%
baichuan4,39.32%,39.32%,38.91%,37.53%,38.40%,38.70%
qwen-max,35.71%,37.94%,35.27%,39.29%,37.50%,37.14%


## Print_question

```python
ans_data = {
    "paper_type": 
    {
        "model_name": {
            bool:{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

Print_question values: [True, False]

In [19]:
# On/off settings that the aggregation cells below iterate over via `values`.
values = [True, False]

In [20]:
def generate_table(ans_data, papers, paper_type):

    def highlight_greater_between_col(row, col1, col2):
        # Initialize styles for each cell in the row as empty
        styles = [''] * len(row)
        # Determine the columns' positions
        col1_pos = row.index.get_loc(col1)
        col2_pos = row.index.get_loc(col2)
        # Apply style to the greater value between specified columns
        if row[col1] > row[col2]:
            styles[col1_pos] = 'text-decoration: underline'
        elif row[col1] < row[col2]:
            styles[col2_pos] = 'text-decoration: underline'
        return styles

    tables = []

    all_data = []

    for paper in papers:
        data = []
        for model_name, questions in ans_data[paper].items():
            for temp_value, metrics in questions.items():
                question_num = bool(temp_value)
                data.append([model_name, question_num, metrics["accuracy"]])

        df = pd.DataFrame(data, columns=["Model", "Print Question", "Accuracy"])
        pivot_df = df.pivot(index="Model", columns="Print Question", values="Accuracy").reindex(columns=[True, False])
        pivot_df['Avg'] = pivot_df.mean(axis=1)
        pivot_df = pivot_df.sort_values(by='Avg', ascending=False)
        pivot_df.loc['Avg'] = pivot_df.mean(axis=0)

        styled_df = pivot_df.style.apply(highlight_top_three, subset=pivot_df.columns.tolist())
        styled_df = styled_df.apply(highlight_greater_between_col, col1=True, col2=False, axis=1)
        styled_df = styled_df.set_caption(f'{paper} Question Accuracy: {paper_type}')
        styled_df = styled_df.format('{:.2%}')
        tables.append(styled_df)
        
        for index, row in pivot_df.iterrows():
            for col in pivot_df.columns.drop('Avg'):
                all_data.append([index, col, row[col]])

    all_df = pd.DataFrame(all_data, columns=["Model", "Print Question", "Accuracy"])
    final_avg_df = all_df.groupby(["Model", "Print Question"]).mean().reset_index()
    final_pivot_df = final_avg_df.pivot(index="Model", columns="Print Question", values="Accuracy").reindex(columns=[True, False])
    final_pivot_df['Avg'] = final_pivot_df.mean(axis=1)
    final_pivot_df = final_pivot_df.sort_values(by='Avg', ascending=False)
    final_pivot_df.loc['Avg'] = final_pivot_df.mean(axis=0)

    styled_final_df = final_pivot_df.style.apply(highlight_top_three, subset=final_pivot_df.columns.tolist())
    styled_final_df = highlight_row_name(styled_final_df, 'Avg')
    styled_final_df = styled_final_df.apply(highlight_greater_between_col, col1=True, col2=False, axis=1)
    styled_final_df = styled_final_df.set_caption(f'All Questions\' Accuracy: {paper_type}')
    styled_final_df = styled_final_df.format('{:.2%}')

    tables.append(styled_final_df)

    return tables



def display_multi_table(table_list):
    """Place several rendered tables side by side in one HTML table row.

    Args:
        table_list: Iterable of objects exposing ``_repr_html_()`` (in this
            notebook, the pandas Styler objects returned by ``generate_table``).

    Returns:
        An ``HTML`` display object wrapping a single-row ``<table>`` whose
        cells contain the HTML of each input table, in the given order.
    """
    cells = []
    for styled_table in table_list:
        cells.append('<td>' + styled_table._repr_html_() + '</td>')

    row_open = '<table><tr style="background-color:white;">'
    row_close = '</tr></table>'
    return HTML(row_open + ''.join(cells) + row_close)

In [21]:
# Accuracy per paper / model with the question printed in the prompt or not.
# `values` is the [True, False] list assigned in the cell above.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per setting instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for flag in values:
            matched = [rec for rec in records
                       if rec['prompt_parameter']['print_question'] == flag]

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][flag] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'Print Question')
display_multi_table(tables)

Print Question,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Print Question,True,False,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
Print Question,True,False,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
deepseek-coder,54.64%,52.86%,53.75%
qwen2-72b-instruct,51.79%,52.68%,52.24%
deepseek-chat,51.25%,52.14%,51.70%
gemini-1.5-pro-001,49.82%,45.00%,47.41%
gpt-4o-2024-05-13,42.86%,45.00%,43.93%
claude-3-5-sonnet-20240620,42.14%,41.96%,42.05%
baichuan4,39.82%,39.64%,39.73%
yi-large,39.64%,39.64%,39.64%
ERNIE-4.0-Turbo-8K,37.14%,30.00%,33.57%
llama3-70b-instruct,32.50%,33.93%,33.22%

Print Question,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,54.64%,52.86%,53.75%
qwen2-72b-instruct,51.79%,52.68%,52.24%
deepseek-chat,51.25%,52.14%,51.70%
gemini-1.5-pro-001,49.82%,45.00%,47.41%
gpt-4o-2024-05-13,42.86%,45.00%,43.93%
claude-3-5-sonnet-20240620,42.14%,41.96%,42.05%
baichuan4,39.82%,39.64%,39.73%
yi-large,39.64%,39.64%,39.64%
ERNIE-4.0-Turbo-8K,37.14%,30.00%,33.57%
llama3-70b-instruct,32.50%,33.93%,33.22%

Print Question,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,46.96%,51.96%,49.46%
gpt-4o-2024-05-13,47.50%,46.79%,47.14%
deepseek-coder,43.75%,49.82%,46.78%
claude-3-5-sonnet-20240620,45.54%,46.96%,46.25%
doubao-pro-32k-240615,43.75%,44.29%,44.02%
gemini-1.5-pro-001,40.71%,43.75%,42.23%
qwen2-72b-instruct,38.04%,45.00%,41.52%
qwen-max,41.07%,40.00%,40.53%
moonshot-v1-8k,37.68%,38.04%,37.86%
ERNIE-4.0-Turbo-8K,40.00%,32.86%,36.43%

Print Question,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,49.10%,52.05%,50.58%
deepseek-coder,49.20%,51.34%,50.27%
qwen2-72b-instruct,44.92%,48.84%,46.88%
gpt-4o-2024-05-13,45.18%,45.89%,45.54%
gemini-1.5-pro-001,45.27%,44.38%,44.82%
claude-3-5-sonnet-20240620,43.84%,44.46%,44.15%
Avg,38.69%,38.57%,38.63%
doubao-pro-32k-240615,36.52%,38.04%,37.28%
baichuan4,37.41%,37.03%,37.22%
qwen-max,37.68%,35.00%,36.34%


## CoT

```python
ans_data = {
    "paper_type": 
    {
        "model_name": {
            bool:{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

CoT values: [True, False]

In [23]:
def generate_table(ans_data, papers, paper_type):

    def highlight_greater_between_col(row, col1, col2):
        # Initialize styles for each cell in the row as empty
        styles = [''] * len(row)
        # Determine the columns' positions
        col1_pos = row.index.get_loc(col1)
        col2_pos = row.index.get_loc(col2)
        # Apply style to the greater value between specified columns
        if row[col1] > row[col2]:
            styles[col1_pos] = 'text-decoration: underline'
        elif row[col1] < row[col2]:
            styles[col2_pos] = 'text-decoration: underline'
        return styles

    tables = []

    all_data = []

    for paper in papers:
        data = []
        for model_name, questions in ans_data[paper].items():
            for temp_value, metrics in questions.items():
                question_num = bool(temp_value)
                data.append([model_name, question_num, metrics["accuracy"]])

        df = pd.DataFrame(data, columns=["Model", "CoT", "Accuracy"])
        pivot_df = df.pivot(index="Model", columns="CoT", values="Accuracy").reindex(columns=[True, False])
        pivot_df['Avg'] = pivot_df.mean(axis=1)
        pivot_df = pivot_df.sort_values(by='Avg', ascending=False)
        pivot_df.loc['Avg'] = pivot_df.mean(axis=0)

        styled_df = pivot_df.style.apply(highlight_top_three, subset=pivot_df.columns.tolist())
        styled_df = styled_df.apply(highlight_greater_between_col, col1=True, col2=False, axis=1)
        styled_df = styled_df.set_caption(f'{paper} Question Accuracy: {paper_type}')
        styled_df = styled_df.format('{:.2%}')
        tables.append(styled_df)
        
        for index, row in pivot_df.iterrows():
            for col in pivot_df.columns.drop('Avg'):
                all_data.append([index, col, row[col]])

    all_df = pd.DataFrame(all_data, columns=["Model", "CoT", "Accuracy"])
    final_avg_df = all_df.groupby(["Model", "CoT"]).mean().reset_index()
    final_pivot_df = final_avg_df.pivot(index="Model", columns="CoT", values="Accuracy").reindex(columns=[True, False])
    final_pivot_df['Avg'] = final_pivot_df.mean(axis=1)
    final_pivot_df = final_pivot_df.sort_values(by='Avg', ascending=False)

    styled_final_df = final_pivot_df.style.apply(highlight_top_three, subset=final_pivot_df.columns.tolist())
    styled_final_df = highlight_row_name(styled_final_df, 'Avg')
    styled_final_df = styled_final_df.apply(highlight_greater_between_col, col1=True, col2=False, axis=1)
    styled_final_df = styled_final_df.set_caption(f'All Questions\' Accuracy: {paper_type}')
    styled_final_df = styled_final_df.format('{:.2%}')

    tables.append(styled_final_df)

    return tables

def display_multi_table(table_list):
    """Place several rendered tables side by side in one HTML table row.

    Args:
        table_list: Iterable of objects exposing ``_repr_html_()`` (in this
            notebook, the pandas Styler objects returned by ``generate_table``).

    Returns:
        An ``HTML`` display object wrapping a single-row ``<table>`` whose
        cells contain the HTML of each input table, in the given order.
    """
    cells = []
    for styled_table in table_list:
        cells.append('<td>' + styled_table._repr_html_() + '</td>')

    row_open = '<table><tr style="background-color:white;">'
    row_close = '</tr></table>'
    return HTML(row_open + ''.join(cells) + row_close)

In [24]:
# Accuracy per paper / model with chain-of-thought prompting on or off.
# `values` is expected to be [True, False], set in an earlier cell.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per setting instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for flag in values:
            matched = [rec for rec in records
                       if rec['prompt_parameter']['cot'] == flag]

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][flag] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'CoT')
display_multi_table(tables)

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CoT,True,False,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
CoT,True,False,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
deepseek-coder,53.93%,53.57%,53.75%
qwen2-72b-instruct,51.96%,52.50%,52.23%
deepseek-chat,51.07%,52.32%,51.70%
gemini-1.5-pro-001,48.39%,46.43%,47.41%
gpt-4o-2024-05-13,44.64%,43.21%,43.93%
claude-3-5-sonnet-20240620,44.29%,39.82%,42.05%
baichuan4,40.36%,39.11%,39.73%
yi-large,40.36%,38.93%,39.64%
ERNIE-4.0-Turbo-8K,34.29%,32.86%,33.58%
llama3-70b-instruct,32.68%,33.75%,33.22%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,53.93%,53.57%,53.75%
qwen2-72b-instruct,51.96%,52.50%,52.23%
deepseek-chat,51.07%,52.32%,51.70%
gemini-1.5-pro-001,48.39%,46.43%,47.41%
gpt-4o-2024-05-13,44.64%,43.21%,43.93%
claude-3-5-sonnet-20240620,44.29%,39.82%,42.05%
baichuan4,40.36%,39.11%,39.73%
yi-large,40.36%,38.93%,39.64%
ERNIE-4.0-Turbo-8K,34.29%,32.86%,33.58%
llama3-70b-instruct,32.68%,33.75%,33.22%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,48.04%,50.89%,49.47%
gpt-4o-2024-05-13,47.32%,46.96%,47.14%
deepseek-coder,47.14%,46.43%,46.78%
claude-3-5-sonnet-20240620,45.36%,47.14%,46.25%
doubao-pro-32k-240615,44.82%,43.21%,44.02%
gemini-1.5-pro-001,41.43%,43.04%,42.23%
qwen2-72b-instruct,43.04%,40.00%,41.52%
qwen-max,41.43%,39.64%,40.53%
moonshot-v1-8k,39.46%,36.25%,37.85%
ERNIE-4.0-Turbo-8K,36.43%,36.43%,36.43%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,49.56%,51.61%,50.58%
deepseek-coder,50.53%,50.00%,50.27%
qwen2-72b-instruct,47.50%,46.25%,46.88%
gpt-4o-2024-05-13,45.98%,45.08%,45.53%
gemini-1.5-pro-001,44.91%,44.73%,44.82%
claude-3-5-sonnet-20240620,44.83%,43.48%,44.15%
Avg,38.78%,38.49%,38.63%
doubao-pro-32k-240615,37.05%,37.50%,37.28%
baichuan4,37.87%,36.58%,37.22%
qwen-max,36.61%,36.07%,36.34%


In [25]:
# Accuracy per paper / model with chain-of-thought on or off, English prompts only.
# `values` is expected to be [True, False], set in an earlier cell.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per setting instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for flag in values:
            matched = [rec for rec in records
                       if rec['prompt_parameter']['cot'] == flag
                       and rec['prompt_parameter']['lang'] == 'en']

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][flag] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'CoT-EN')
display_multi_table(tables)

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CoT,True,False,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
CoT,True,False,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
qwen2-72b-instruct,53.93%,50.36%,52.14%
deepseek-chat,44.64%,47.14%,45.89%
deepseek-coder,45.36%,45.36%,45.36%
gemini-1.5-pro-001,46.79%,43.93%,45.36%
gpt-4o-2024-05-13,45.00%,44.29%,44.65%
claude-3-5-sonnet-20240620,43.21%,38.93%,41.07%
baichuan4,41.07%,40.00%,40.53%
yi-large,40.71%,33.93%,37.32%
ERNIE-4.0-Turbo-8K,35.71%,34.64%,35.17%
llama3-70b-instruct,32.86%,34.29%,33.58%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qwen2-72b-instruct,53.93%,50.36%,52.14%
deepseek-chat,44.64%,47.14%,45.89%
deepseek-coder,45.36%,45.36%,45.36%
gemini-1.5-pro-001,46.79%,43.93%,45.36%
gpt-4o-2024-05-13,45.00%,44.29%,44.65%
claude-3-5-sonnet-20240620,43.21%,38.93%,41.07%
baichuan4,41.07%,40.00%,40.53%
yi-large,40.71%,33.93%,37.32%
ERNIE-4.0-Turbo-8K,35.71%,34.64%,35.17%
llama3-70b-instruct,32.86%,34.29%,33.58%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,44.29%,48.57%,46.43%
claude-3-5-sonnet-20240620,44.64%,46.79%,45.72%
gpt-4o-2024-05-13,44.29%,47.14%,45.72%
doubao-pro-32k-240615,45.71%,42.50%,44.10%
gemini-1.5-pro-001,40.71%,43.57%,42.14%
deepseek-coder,41.79%,42.14%,41.96%
qwen-max,41.79%,40.00%,40.90%
qwen2-72b-instruct,41.79%,39.29%,40.54%
ERNIE-4.0-Turbo-8K,39.64%,40.71%,40.17%
moonshot-v1-8k,38.21%,35.00%,36.60%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qwen2-72b-instruct,47.86%,44.83%,46.34%
deepseek-chat,44.46%,47.86%,46.16%
gpt-4o-2024-05-13,44.65%,45.72%,45.18%
gemini-1.5-pro-001,43.75%,43.75%,43.75%
deepseek-coder,43.57%,43.75%,43.66%
claude-3-5-sonnet-20240620,43.93%,42.86%,43.39%
ERNIE-4.0-Turbo-8K,37.67%,37.68%,37.68%
Avg,37.74%,37.28%,37.51%
baichuan4,36.30%,35.19%,35.75%
qwen-max,35.54%,35.53%,35.54%


In [26]:
# Accuracy per paper / model with chain-of-thought on or off, Chinese prompts only.
# `values` is expected to be [True, False], set in an earlier cell.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # Parse every result file once per (paper, model); the records are then
        # filtered per setting instead of re-reading the files each time.
        records = []
        for fpath in fpaths:
            records.extend(parse_data.read_jsonl(fpath))

        ans_data[paper][model] = {}

        for flag in values:
            matched = [rec for rec in records
                       if rec['prompt_parameter']['cot'] == flag
                       and rec['prompt_parameter']['lang'] == 'zh']

            total_cnt = len(matched)
            correct_cnt = sum(1 for rec in matched if rec['answer']['correct'])
            total_score = sum(rec['answer']['score'] for rec in matched)

            # With no matching record the ratios are undefined; store NaN
            # instead of aborting the whole cell with ZeroDivisionError.
            has_data = total_cnt > 0
            ans_data[paper][model][flag] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score / total_cnt, 4) if has_data else float('nan'),
                'accuracy': round(correct_cnt / total_cnt, 4) if has_data else float('nan'),
            }

tables = generate_table(ans_data, papers, 'CoT-ZH')
display_multi_table(tables)

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CoT,True,False,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
CoT,True,False,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
deepseek-coder,62.50%,61.79%,62.15%
deepseek-chat,57.50%,57.50%,57.50%
qwen2-72b-instruct,50.00%,54.64%,52.32%
gemini-1.5-pro-001,50.00%,48.93%,49.47%
gpt-4o-2024-05-13,44.29%,42.14%,43.22%
claude-3-5-sonnet-20240620,45.36%,40.71%,43.04%
yi-large,40.00%,43.93%,41.97%
baichuan4,39.64%,38.21%,38.92%
doubao-pro-32k-240615,36.07%,38.57%,37.32%
qwen-max,34.29%,33.93%,34.11%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,62.50%,61.79%,62.15%
deepseek-chat,57.50%,57.50%,57.50%
qwen2-72b-instruct,50.00%,54.64%,52.32%
gemini-1.5-pro-001,50.00%,48.93%,49.47%
gpt-4o-2024-05-13,44.29%,42.14%,43.22%
claude-3-5-sonnet-20240620,45.36%,40.71%,43.04%
yi-large,40.00%,43.93%,41.97%
baichuan4,39.64%,38.21%,38.92%
doubao-pro-32k-240615,36.07%,38.57%,37.32%
qwen-max,34.29%,33.93%,34.11%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,51.79%,53.21%,52.50%
deepseek-coder,52.50%,50.71%,51.61%
gpt-4o-2024-05-13,50.36%,46.79%,48.58%
claude-3-5-sonnet-20240620,46.07%,47.50%,46.78%
doubao-pro-32k-240615,43.93%,43.93%,43.93%
qwen2-72b-instruct,44.29%,40.71%,42.50%
gemini-1.5-pro-001,42.14%,42.50%,42.32%
qwen-max,41.07%,39.29%,40.18%
moonshot-v1-8k,40.71%,37.50%,39.11%
baichuan4,39.23%,37.69%,38.46%

CoT,True,False,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,57.50%,56.25%,56.88%
deepseek-chat,54.64%,55.35%,55.00%
qwen2-72b-instruct,47.15%,47.67%,47.41%
gpt-4o-2024-05-13,47.33%,44.46%,45.90%
gemini-1.5-pro-001,46.07%,45.72%,45.89%
claude-3-5-sonnet-20240620,45.72%,44.10%,44.91%
doubao-pro-32k-240615,40.00%,41.25%,40.62%
Avg,39.82%,39.69%,39.76%
baichuan4,39.43%,37.95%,38.69%
qwen-max,37.68%,36.61%,37.15%


## Sequential

```python
ans_data = {
    "paper_type": 
    {
        "model_name": {
            'answer-first':{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

Sequential values: ['answer-first', 'process-first']

In [27]:
def generate_table(ans_data, papers, paper_type):
    """Build styled accuracy tables comparing the two prompt orders.

    For every paper a Model x Sequential pivot of ``metrics["accuracy"]`` is
    built, extended with an ``Avg`` column (row mean), sorted by that column,
    and then given an ``Avg`` row (column mean).  A final table averages the
    per-paper cells for each (Model, Sequential) pair; the per-paper ``Avg``
    rows take part in that pooling, so the final table has an ``Avg`` row too.

    Args:
        ans_data: nested mapping ``{paper: {model: {sequential: metrics}}}``
            where each ``metrics`` has an ``"accuracy"`` entry.
        papers: keys of ``ans_data`` to tabulate, in display order.
        paper_type: label appended to every table caption.

    Returns:
        A list of pandas Stylers: one per paper, followed by the pooled table.
    """
    first_col, second_col = 'answer-first', 'process-first'
    frame_columns = ["Model", "Sequential", "Accuracy"]

    def underline_larger(row, col1, col2):
        # Underline whichever of the two cells is strictly larger; ties stay plain.
        css = [''] * len(row)
        left, right = row[col1], row[col2]
        if left > right:
            css[row.index.get_loc(col1)] = 'text-decoration: underline'
        elif left < right:
            css[row.index.get_loc(col2)] = 'text-decoration: underline'
        return css

    tables = []
    pooled_rows = []

    for paper in papers:
        records = [
            [model_name, sequential, metrics["accuracy"]]
            for model_name, per_order in ans_data[paper].items()
            for sequential, metrics in per_order.items()
        ]
        paper_pivot = pd.DataFrame(records, columns=frame_columns).pivot(
            index="Model", columns="Sequential", values="Accuracy"
        )
        paper_pivot['Avg'] = paper_pivot.mean(axis=1)
        paper_pivot = paper_pivot.sort_values(by='Avg', ascending=False)
        # Appended after sorting, so the summary row sits at the bottom
        paper_pivot.loc['Avg'] = paper_pivot.mean(axis=0)

        tables.append(
            paper_pivot.style
            .apply(highlight_top_three, subset=paper_pivot.columns.tolist())
            .apply(underline_larger, col1=first_col, col2=second_col, axis=1)
            .set_caption(f'{paper} Question Accuracy: {paper_type}')
            .format('{:.2%}')
        )

        # Pool every row (including 'Avg') but not the derived 'Avg' column
        order_columns = paper_pivot.columns.drop('Avg')
        pooled_rows.extend(
            [row_label, column, row[column]]
            for row_label, row in paper_pivot.iterrows()
            for column in order_columns
        )

    pooled_df = pd.DataFrame(pooled_rows, columns=frame_columns)
    mean_df = pooled_df.groupby(["Model", "Sequential"]).mean().reset_index()
    final_pivot = mean_df.pivot(index="Model", columns="Sequential", values="Accuracy")
    final_pivot['Avg'] = final_pivot.mean(axis=1)
    final_pivot = final_pivot.sort_values(by='Avg', ascending=False)

    final_styler = final_pivot.style.apply(highlight_top_three, subset=final_pivot.columns.tolist())
    final_styler = highlight_row_name(final_styler, 'Avg')
    final_styler = (
        final_styler
        .apply(underline_larger, col1=first_col, col2=second_col, axis=1)
        .set_caption(f'All Questions\' Accuracy: {paper_type}')
        .format('{:.2%}')
    )
    tables.append(final_styler)

    return tables

def display_multi_table(table_list):
    """Lay several tables out side by side.

    Each item's ``_repr_html_()`` output is wrapped in its own ``<td>`` of a
    single-row HTML table with a white row background.

    Args:
        table_list: iterable of objects exposing ``_repr_html_()``.

    Returns:
        An ``HTML`` display object containing the combined markup.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')

    row_markup = '<tr style="background-color:white;">' + ''.join(cells) + '</tr>'
    return HTML('<table>' + row_markup + '</table>')

In [28]:
# Prompt-order settings compared in this section; each result record is matched
# against these via its prompt_parameter['sequential'] field.
values = ['answer-first', 'process-first']

In [29]:
# Accuracy / score per paper, model and prompt order, over all languages.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # One running tally per prompt order. Every file is read once and each
        # record is routed to the tally of its own 'sequential' setting, rather
        # than re-reading all files for every entry of `values`.
        stats = {
            temp: {'total_cnt': 0, 'correct_cnt': 0, 'total_score': 0}
            for temp in values
        }

        for fpath in fpaths:
            for record in parse_data.read_jsonl(fpath):
                tally = stats.get(record['prompt_parameter']['sequential'])
                if tally is None:
                    # Setting not listed in `values`: not part of this comparison
                    continue
                tally['total_cnt'] += 1
                if record['answer']['correct']:
                    tally['correct_cnt'] += 1
                tally['total_score'] += record['answer']['score']

        for tally in stats.values():
            total_cnt = tally['total_cnt']
            # No matching records: report NaN instead of raising ZeroDivisionError
            tally['avg_score'] = round(tally['total_score'] / total_cnt, 4) if total_cnt else float('nan')
            tally['accuracy'] = round(tally['correct_cnt'] / total_cnt, 4) if total_cnt else float('nan')

        ans_data[paper][model] = stats

tables = generate_table(ans_data, papers, 'Sequential')
display_multi_table(tables)

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
deepseek-coder,42.50%,65.00%,53.75%
qwen2-72b-instruct,48.75%,55.71%,52.23%
deepseek-chat,41.79%,61.61%,51.70%
gemini-1.5-pro-001,43.93%,50.89%,47.41%
gpt-4o-2024-05-13,33.57%,54.29%,43.93%
claude-3-5-sonnet-20240620,26.25%,57.86%,42.05%
baichuan4,40.00%,39.46%,39.73%
yi-large,40.18%,39.11%,39.64%
ERNIE-4.0-Turbo-8K,22.14%,45.00%,33.57%
llama3-70b-instruct,30.71%,35.71%,33.21%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,42.50%,65.00%,53.75%
qwen2-72b-instruct,48.75%,55.71%,52.23%
deepseek-chat,41.79%,61.61%,51.70%
gemini-1.5-pro-001,43.93%,50.89%,47.41%
gpt-4o-2024-05-13,33.57%,54.29%,43.93%
claude-3-5-sonnet-20240620,26.25%,57.86%,42.05%
baichuan4,40.00%,39.46%,39.73%
yi-large,40.18%,39.11%,39.64%
ERNIE-4.0-Turbo-8K,22.14%,45.00%,33.57%
llama3-70b-instruct,30.71%,35.71%,33.21%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,40.18%,58.75%,49.47%
gpt-4o-2024-05-13,30.71%,63.57%,47.14%
deepseek-coder,36.07%,57.50%,46.78%
claude-3-5-sonnet-20240620,34.82%,57.68%,46.25%
doubao-pro-32k-240615,31.07%,56.96%,44.02%
gemini-1.5-pro-001,31.43%,53.04%,42.23%
qwen2-72b-instruct,29.11%,53.93%,41.52%
qwen-max,38.39%,42.68%,40.53%
moonshot-v1-8k,29.11%,46.61%,37.86%
ERNIE-4.0-Turbo-8K,28.21%,44.64%,36.43%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,40.98%,60.18%,50.58%
deepseek-coder,39.29%,61.25%,50.27%
qwen2-72b-instruct,38.93%,54.82%,46.88%
gpt-4o-2024-05-13,32.14%,58.93%,45.54%
gemini-1.5-pro-001,37.68%,51.96%,44.82%
claude-3-5-sonnet-20240620,30.54%,57.77%,44.15%
Avg,30.55%,46.71%,38.63%
doubao-pro-32k-240615,24.20%,50.35%,37.27%
baichuan4,31.93%,42.52%,37.22%
qwen-max,34.38%,38.30%,36.34%


In [30]:
# Accuracy / score per paper, model and prompt order, English prompts only.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # One running tally per prompt order. Every file is read once and each
        # record is routed to the tally of its own 'sequential' setting, rather
        # than re-reading all files for every entry of `values`.
        stats = {
            temp: {'total_cnt': 0, 'correct_cnt': 0, 'total_score': 0}
            for temp in values
        }

        for fpath in fpaths:
            for record in parse_data.read_jsonl(fpath):
                tally = stats.get(record['prompt_parameter']['sequential'])
                if tally is None:
                    # Setting not listed in `values`: not part of this comparison
                    continue
                if record['prompt_parameter']['lang'] != 'en':
                    continue
                tally['total_cnt'] += 1
                if record['answer']['correct']:
                    tally['correct_cnt'] += 1
                tally['total_score'] += record['answer']['score']

        for tally in stats.values():
            total_cnt = tally['total_cnt']
            # No matching records: report NaN instead of raising ZeroDivisionError
            tally['avg_score'] = round(tally['total_score'] / total_cnt, 4) if total_cnt else float('nan')
            tally['accuracy'] = round(tally['correct_cnt'] / total_cnt, 4) if total_cnt else float('nan')

        ans_data[paper][model] = stats

tables = generate_table(ans_data, papers, 'Sequential-EN')
display_multi_table(tables)

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
qwen2-72b-instruct,48.93%,55.36%,52.14%
deepseek-chat,39.29%,52.50%,45.90%
deepseek-coder,33.93%,56.79%,45.36%
gemini-1.5-pro-001,40.36%,50.36%,45.36%
gpt-4o-2024-05-13,33.93%,55.36%,44.65%
claude-3-5-sonnet-20240620,25.71%,56.43%,41.07%
baichuan4,44.29%,36.79%,40.54%
yi-large,37.14%,37.50%,37.32%
ERNIE-4.0-Turbo-8K,25.71%,44.64%,35.17%
llama3-70b-instruct,30.36%,36.79%,33.58%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qwen2-72b-instruct,48.93%,55.36%,52.14%
deepseek-chat,39.29%,52.50%,45.90%
deepseek-coder,33.93%,56.79%,45.36%
gemini-1.5-pro-001,40.36%,50.36%,45.36%
gpt-4o-2024-05-13,33.93%,55.36%,44.65%
claude-3-5-sonnet-20240620,25.71%,56.43%,41.07%
baichuan4,44.29%,36.79%,40.54%
yi-large,37.14%,37.50%,37.32%
ERNIE-4.0-Turbo-8K,25.71%,44.64%,35.17%
llama3-70b-instruct,30.36%,36.79%,33.58%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,37.50%,55.36%,46.43%
gpt-4o-2024-05-13,27.86%,63.57%,45.72%
claude-3-5-sonnet-20240620,34.29%,57.14%,45.72%
doubao-pro-32k-240615,31.07%,57.14%,44.10%
gemini-1.5-pro-001,29.64%,54.64%,42.14%
deepseek-coder,36.43%,47.50%,41.96%
qwen-max,38.93%,42.86%,40.89%
qwen2-72b-instruct,26.43%,54.64%,40.53%
ERNIE-4.0-Turbo-8K,29.64%,50.71%,40.17%
moonshot-v1-8k,26.79%,46.43%,36.61%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
qwen2-72b-instruct,37.68%,55.00%,46.34%
deepseek-chat,38.40%,53.93%,46.16%
gpt-4o-2024-05-13,30.89%,59.47%,45.18%
gemini-1.5-pro-001,35.00%,52.50%,43.75%
deepseek-coder,35.18%,52.14%,43.66%
claude-3-5-sonnet-20240620,30.00%,56.78%,43.39%
ERNIE-4.0-Turbo-8K,27.68%,47.67%,37.68%
Avg,29.79%,45.22%,37.51%
baichuan4,33.49%,38.01%,35.75%
qwen-max,33.75%,37.32%,35.54%


In [31]:
# Accuracy / score per paper, model and prompt order, Chinese prompts only.
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        # One running tally per prompt order. Every file is read once and each
        # record is routed to the tally of its own 'sequential' setting, rather
        # than re-reading all files for every entry of `values`.
        stats = {
            temp: {'total_cnt': 0, 'correct_cnt': 0, 'total_score': 0}
            for temp in values
        }

        for fpath in fpaths:
            for record in parse_data.read_jsonl(fpath):
                tally = stats.get(record['prompt_parameter']['sequential'])
                if tally is None:
                    # Setting not listed in `values`: not part of this comparison
                    continue
                if record['prompt_parameter']['lang'] != 'zh':
                    continue
                tally['total_cnt'] += 1
                if record['answer']['correct']:
                    tally['correct_cnt'] += 1
                tally['total_score'] += record['answer']['score']

        for tally in stats.values():
            total_cnt = tally['total_cnt']
            # No matching records: report NaN instead of raising ZeroDivisionError
            tally['avg_score'] = round(tally['total_score'] / total_cnt, 4) if total_cnt else float('nan')
            tally['accuracy'] = round(tally['correct_cnt'] / total_cnt, 4) if total_cnt else float('nan')

        ans_data[paper][model] = stats

tables = generate_table(ans_data, papers, 'Sequential-ZH')
display_multi_table(tables)

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
deepseek-coder,51.07%,73.21%,62.14%
deepseek-chat,44.29%,70.71%,57.50%
qwen2-72b-instruct,48.57%,56.07%,52.32%
gemini-1.5-pro-001,47.50%,51.43%,49.46%
gpt-4o-2024-05-13,33.21%,53.21%,43.21%
claude-3-5-sonnet-20240620,26.79%,59.29%,43.04%
yi-large,43.21%,40.71%,41.96%
baichuan4,35.71%,42.14%,38.92%
doubao-pro-32k-240615,20.00%,54.64%,37.32%
qwen-max,32.14%,36.07%,34.11%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,51.07%,73.21%,62.14%
deepseek-chat,44.29%,70.71%,57.50%
qwen2-72b-instruct,48.57%,56.07%,52.32%
gemini-1.5-pro-001,47.50%,51.43%,49.46%
gpt-4o-2024-05-13,33.21%,53.21%,43.21%
claude-3-5-sonnet-20240620,26.79%,59.29%,43.04%
yi-large,43.21%,40.71%,41.96%
baichuan4,35.71%,42.14%,38.92%
doubao-pro-32k-240615,20.00%,54.64%,37.32%
qwen-max,32.14%,36.07%,34.11%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-chat,42.86%,62.14%,52.50%
deepseek-coder,35.71%,67.50%,51.61%
gpt-4o-2024-05-13,33.57%,63.57%,48.57%
claude-3-5-sonnet-20240620,35.36%,58.21%,46.78%
doubao-pro-32k-240615,31.07%,56.79%,43.93%
qwen2-72b-instruct,31.79%,53.21%,42.50%
gemini-1.5-pro-001,33.21%,51.43%,42.32%
qwen-max,37.86%,42.50%,40.18%
moonshot-v1-8k,31.43%,46.79%,39.11%
baichuan4,25.00%,51.92%,38.46%

Sequential,answer-first,process-first,Avg
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
deepseek-coder,43.39%,70.36%,56.87%
deepseek-chat,43.57%,66.42%,55.00%
qwen2-72b-instruct,40.18%,54.64%,47.41%
gemini-1.5-pro-001,40.35%,51.43%,45.89%
gpt-4o-2024-05-13,33.39%,58.39%,45.89%
claude-3-5-sonnet-20240620,31.08%,58.75%,44.91%
doubao-pro-32k-240615,25.53%,55.72%,40.62%
Avg,31.31%,48.21%,39.76%
baichuan4,30.35%,47.03%,38.69%
qwen-max,35.00%,39.29%,37.14%


## Token Number

```json
ans_data = {
    "paper_type": 
    {
        "model_name": {
            'question_idx':{
                "correct_cnt": ..., 
                "incorrect_cnt": ....,
                "total_cnt": ...,
                "correct_tokens": ...,
                "incorrect_tokens": ...,
                "total_tokens": ...,
                "avg_correct_tokens": ...,
                "avg_incorrect_tokens": ...,
                "avg_tokens": ...,
            }
        }
    }
}
```

In [32]:
import pandas as pd

def generate_tables(ans_data, papers):
    tables = []

    for paper in papers:
        data = []
        # Ensure iterating over all models for the specified paper
        for model_name, questions in ans_data[paper].items():
            for question_idx, metrics in questions.items():
                question_num = int(question_idx.replace('question', ''))
                # Add data for each metric type under each question
                data.extend([
                    (model_name, question_num, 'True', metrics['avg_correct_tokens']),
                    (model_name, question_num, 'False', metrics['avg_incorrect_tokens']),
                    (model_name, question_num, 'Total', metrics['avg_tokens'])
                ])

        # Create DataFrame
        df = pd.DataFrame(data, columns=["Model", "Question Index", "Metric Type", "Value"])

        # Pivot the DataFrame to get the desired format with multi-level columns
        pivot_df = df.pivot_table(
            index="Model",
            columns=["Question Index", "Metric Type"],
            values="Value",
            aggfunc='first'
        )

        # Reorder the levels to sort by Metric Type within each Question Index
        metric_order = ["True", "False", "Total"]  # Define the desired order for Metric Type
        # Reindex the columns based on the new order for each Question Index
        new_columns = pd.MultiIndex.from_product([pivot_df.columns.levels[0], metric_order], names=pivot_df.columns.names)
        pivot_df = pivot_df.reindex(columns=new_columns)

        # Sort the index to ensure models are in alphabetical order
        pivot_df = pivot_df.sort_index()

        # Set the caption for the DataFrame
        styled_df = pivot_df.style.set_caption(f'{paper} Token Metrics')

        # Format the numbers to no decimal places
        styled_df = styled_df.format('{:.0f}')

        # Collect all tables for return
        tables.append(styled_df)

    return tables

In [33]:
# Token usage per paper, model and question, split by answer correctness.
ans_data = {}
# Stand-in denominator: an empty bucket averages to 0 instead of dividing by zero
tiny_number = 1e-10

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        model_res_dir = os.path.join(res_dir, model)
        paper_res_dir = os.path.join(model_res_dir, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file stem is the question key (generate_tables strips the
            # 'question' text from it to get the number)
            ques_idx = fpath.split(os.sep)[-1].split('.')[0]
            total_cnt, correct_cnt, incorrect_cnt = 0, 0, 0
            correct_tokens, incorrect_tokens = 0, 0

            for record in parse_data.read_jsonl(fpath):
                usage = record['token_usage']
                # Ignore records without usage data (None, empty, or no total)
                if not usage or 'total_tokens' not in usage:
                    continue

                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                    correct_tokens += usage['total_tokens']
                else:
                    incorrect_cnt += 1
                    incorrect_tokens += usage['total_tokens']

            total_tokens = incorrect_tokens + correct_tokens

            # Counts are stored as observed (0 stays 0); only the denominators
            # fall back to tiny_number, so an empty bucket yields an average of 0.
            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'incorrect_cnt': incorrect_cnt,
                'correct_tokens': correct_tokens,
                'incorrect_tokens': incorrect_tokens,
                'total_tokens': total_tokens,
                'avg_correct_tokens': correct_tokens / (correct_cnt or tiny_number),
                'avg_incorrect_tokens': incorrect_tokens / (incorrect_cnt or tiny_number),
                'avg_tokens': total_tokens / (total_cnt or tiny_number),
            }


tables = generate_tables(ans_data, papers)
# One table per paper, however many papers are configured
for table in tables:
    display(table)

Question Index,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14
Metric Type,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2
ERNIE-4.0-Turbo-8K,745,714,728,626,661,648,695,1338,864,902,1259,1170,768,810,787,869,916,875,696,745,714,1069,1045,1049,1172,1118,1127,1280,1120,1126,1195,1098,1128,780,975,968,874,842,845,0,1096,1096
abab6.5s-chat,832,884,863,1618,1900,1716,817,1250,1050,1169,2901,2706,1028,1291,1173,1092,1147,1125,1209,1185,1199,1094,1128,1107,1263,1293,1286,1352,1325,1327,1572,1500,1512,0,1831,1831,0,1070,1070,2630,1791,1865
baichuan4,906,908,907,1192,1319,1252,946,1228,1031,1096,1759,1536,975,1012,990,1144,1083,1126,963,1016,981,1198,1224,1213,1295,1329,1316,1261,1239,1239,1405,1353,1360,0,1641,1641,1199,1147,1154,1704,1350,1474
claude-3-5-sonnet-20240620,867,852,859,745,835,751,876,902,881,1066,1223,1178,808,837,815,970,1152,979,925,926,926,972,977,976,1163,1182,1174,1211,1213,1213,0,1269,1269,1255,1121,1135,883,872,875,1164,1141,1142
deepseek-chat,788,0,788,698,858,814,807,1106,956,888,1019,951,932,878,931,1078,1001,1068,933,0,933,921,968,950,1216,1154,1191,1355,1359,1358,1271,1302,1295,1362,1352,1354,940,1031,999,1042,1225,1221
deepseek-coder,801,4397,846,706,732,723,787,1908,1348,992,1016,1004,959,1273,986,1125,972,1089,984,0,984,966,947,959,1160,1127,1149,1549,1482,1498,1401,1441,1432,1565,1542,1548,1052,1331,1205,976,1284,1280
doubao-pro-32k-240615,605,645,631,598,728,676,604,658,619,684,739,723,666,658,662,740,736,738,652,583,648,1059,770,773,947,994,992,1018,910,913,0,1038,1038,823,793,800,718,658,669,0,839,839
gemini-1.5-pro-001,574,568,570,555,603,582,605,633,606,677,758,737,656,0,656,800,737,786,771,941,788,808,788,801,1007,901,968,1084,908,952,0,1141,1141,770,844,838,684,701,696,725,945,940
glm-4-0520,701,737,714,736,1319,1275,712,1910,1176,2332,1557,1586,843,1229,1080,942,882,936,893,972,956,921,922,922,1125,1074,1090,1263,1087,1089,0,1132,1132,1315,1931,1877,846,1204,1195,0,1895,1895
gpt-4o-2024-05-13,780,858,782,737,756,744,819,1017,906,860,1011,981,1045,1203,1100,1020,986,1006,899,880,898,1267,1020,1029,1157,1164,1161,1613,1536,1537,1444,1380,1416,1455,1619,1594,915,956,941,1211,1207,1207


Question Index,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14
Metric Type,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total,True,False,Total
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2
ERNIE-4.0-Turbo-8K,489,417,482,684,666,682,1043,1087,1074,998,1261,1238,912,882,900,1004,1084,1069,1098,1074,1078,985,871,939,936,847,918,1144,1212,1208,1245,1076,1078,976.0,855.0,888.0,815,913,897,0,954,954
abab6.5s-chat,0,0,0,0,0,0,0,0,0,1273,1382,1339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0
baichuan4,613,653,615,861,905,877,1368,1510,1448,1271,1409,1342,1216,1205,1211,1156,1270,1253,0,1727,1727,1295,1393,1369,1106,1047,1094,1418,1339,1341,0,1171,1171,,,,1124,1470,1405,0,1229,1229
claude-3-5-sonnet-20240620,611,0,611,777,0,777,1223,1233,1228,1191,1264,1207,1098,1149,1126,1050,1037,1040,1078,1140,1123,1006,919,981,1071,0,1071,1253,1275,1274,0,1115,1115,1017.0,1045.0,1033.0,1110,1179,1174,0,850,850
deepseek-chat,536,0,536,741,1006,745,1239,1318,1241,1242,1417,1343,1013,1042,1013,1020,1293,1265,1109,1152,1138,1007,1077,1057,1070,965,1066,1284,1268,1268,1127,1108,1111,1006.0,1125.0,1065.0,837,943,920,0,1076,1076
deepseek-coder,563,0,563,783,932,841,1207,984,1199,1489,1463,1476,1028,0,1028,1095,1167,1148,1171,1506,1427,1122,1303,1262,1110,1096,1108,0,1318,1318,1216,1264,1254,1069.0,1162.0,1117.0,912,1029,1009,0,1781,1781
doubao-pro-32k-240615,472,0,472,599,0,599,848,816,827,1155,1067,1130,903,808,901,1047,902,908,682,870,868,761,776,766,892,898,895,1321,1128,1137,916,866,872,795.0,756.0,771.0,683,740,722,620,664,662
gemini-1.5-pro-001,464,0,464,626,0,626,824,836,831,1047,1118,1065,866,892,880,797,906,878,1303,1159,1161,903,945,928,777,750,776,1218,1243,1238,907,999,992,546.0,593.0,589.0,841,718,745,805,757,759
glm-4-0520,485,0,485,656,905,659,1176,1525,1416,1045,1178,1108,961,938,955,930,1190,1164,1363,1287,1296,1768,1371,1391,827,826,827,0,1141,1141,0,960,960,1083.0,2099.0,1743.0,973,2069,1808,0,1565,1565
gpt-4o-2024-05-13,510,0,510,750,0,750,1242,1272,1255,1289,1298,1295,1010,1031,1016,1104,1069,1080,1231,1235,1234,1014,1118,1074,1021,0,1021,1286,1239,1243,1220,1140,1149,970.0,1216.0,1096.0,836,971,921,0,1466,1466


# Score

```json
ans_data = {
    "paper_type": 
    {
        "model_name": {
            "question_idx":{
                "accuracy": ...,
                "avg_score": ..., 
                "total_score": ..., 
                "correct_cnt": ..., 
                "total_cnt": ...
            }
        }
    }
}
```

In [34]:
def generate_acc_table(ans_data, papers, paper_type, full_score=73):
    """Build one styled per-question score table for each paper.

    Args:
        ans_data: Nested mapping ``paper -> model name -> question key -> metrics``.
            Every metrics dict must contain ``"avg_score"``; question keys are
            expected to look like ``"question<N>"`` so that ``<N>`` parses as int.
        papers: Iterable of paper keys to look up in ``ans_data``.
        paper_type: Label appended to each table caption (the notebook passes
            'ALL', 'ZH' or 'EN').
        full_score: Denominator of the ``Rate`` column. Defaults to 73, the value
            that used to be hard-coded (presumably the full mark of the scored
            questions -- confirm before changing).

    Returns:
        list: One pandas ``Styler`` per entry of ``papers``, in the same order.
    """
    tables = []

    for paper in papers:
        # Flatten the nested mapping into (model, question number, avg score) rows.
        rows = [
            [model_name, int(question_idx.replace('question', '')), metrics["avg_score"]]
            for model_name, questions in ans_data[paper].items()
            for question_idx, metrics in questions.items()
        ]
        df = pd.DataFrame(rows, columns=["Model", "Question Index", "Score"])

        # One row per model, one column per question number.
        pivot_df = df.pivot(index="Model", columns="Question Index", values="Score")

        # Remember the real question columns before summary columns are added, so
        # highlighting works for any number of questions instead of assuming 1..14.
        question_cols = list(pivot_df.columns)

        # Per-model total of the average scores and its share of the full score.
        pivot_df['Total'] = pivot_df.sum(axis=1)
        pivot_df['Rate'] = pivot_df['Total'] / full_score

        # Extra 'Avg' row: column-wise mean over all models.
        pivot_df.loc['Avg'] = pivot_df.mean(axis=0)

        # Rank rows (including the 'Avg' row) by total score, best first.
        pivot_df = pivot_df.sort_values(by='Total', ascending=False)

        # Highlight the best values per question column and in 'Total'.
        styled_df = pivot_df.style.apply(highlight_top_three, subset=question_cols + ['Total'])
        styled_df = highlight_row_name(styled_df, 'Avg')

        # Set the caption for the DataFrame
        styled_df = styled_df.set_caption(f'{paper} Question Score: {paper_type}')

        styled_df = styled_df.format({
            'Rate': '{:.2%}',   # 'Rate' is shown as a percentage with two decimals
            **{col: '{:.2f}' for col in pivot_df.columns if col != 'Rate'}  # all other columns: float with two decimals
        })

        tables.append(styled_df)

    return tables

In [35]:
# Aggregate per-question scoring metrics over every record (all prompt languages).
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        paper_res_dir = os.path.join(res_dir, model, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name without its extension is used as the question key.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            for record in parse_data.read_jsonl(fpath):
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            if total_cnt == 0:
                # An empty result file has no meaningful average; skip the question
                # rather than raising ZeroDivisionError.
                continue

            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': total_score/total_cnt,
                'accuracy': correct_cnt/total_cnt,
            }

tables = generate_acc_table(ans_data, papers, 'ALL')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
deepseek-coder,4.94,1.62,2.5,2.62,4.56,3.81,5.0,3.19,3.9,1.46,3.6,1.44,2.25,0.06,40.96,56.11%
qwen2-72b-instruct,4.94,2.88,2.94,1.19,4.88,4.75,4.31,2.81,4.28,1.12,3.94,0.25,0.94,1.0,40.21,55.09%
deepseek-chat,5.00,1.38,2.5,2.62,4.94,4.31,5.0,1.94,3.6,1.73,2.85,1.12,1.75,0.12,38.86,53.24%
gemini-1.5-pro-001,1.94,2.12,4.75,1.31,5.0,3.88,4.5,3.12,3.83,1.65,1.76,0.38,1.62,0.12,35.99,49.30%
gpt-4o-2024-05-13,4.88,3.19,2.81,1.0,3.25,2.88,4.62,0.19,2.55,0.11,3.86,0.75,1.81,0.38,32.28,44.21%
claude-3-5-sonnet-20240620,2.19,4.69,4.06,1.44,3.75,4.75,2.81,0.94,2.55,0.94,0.82,0.5,1.31,0.19,30.94,42.38%
baichuan4,2.88,2.62,3.5,1.69,3.0,3.5,3.38,2.0,2.51,0.45,2.55,0.0,0.69,1.75,30.51,41.80%
yi-large,2.81,3.0,3.75,1.62,3.12,2.62,3.38,2.12,2.74,0.34,2.7,0.0,0.94,0.69,29.84,40.87%
Avg,2.96,2.27,3.05,1.21,3.41,3.57,3.74,1.61,2.36,0.88,2.49,0.4,0.84,0.43,29.24,40.05%
mistral-large-2402,5.00,3.44,2.06,0.69,2.12,2.38,3.81,0.38,2.21,0.15,2.77,0.06,0.19,0.31,25.57,35.03%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
deepseek-coder,4.94,1.62,2.5,2.62,4.56,3.81,5.0,3.19,3.9,1.46,3.6,1.44,2.25,0.06,40.96,56.11%
qwen2-72b-instruct,4.94,2.88,2.94,1.19,4.88,4.75,4.31,2.81,4.28,1.12,3.94,0.25,0.94,1.0,40.21,55.09%
deepseek-chat,5.0,1.38,2.5,2.62,4.94,4.31,5.0,1.94,3.6,1.73,2.85,1.12,1.75,0.12,38.86,53.24%
gemini-1.5-pro-001,1.94,2.12,4.75,1.31,5.0,3.88,4.5,3.12,3.83,1.65,1.76,0.38,1.62,0.12,35.99,49.30%
gpt-4o-2024-05-13,4.88,3.19,2.81,1.0,3.25,2.88,4.62,0.19,2.55,0.11,3.86,0.75,1.81,0.38,32.28,44.21%
claude-3-5-sonnet-20240620,2.19,4.69,4.06,1.44,3.75,4.75,2.81,0.94,2.55,0.94,0.82,0.5,1.31,0.19,30.94,42.38%
baichuan4,2.88,2.62,3.5,1.69,3.0,3.5,3.38,2.0,2.51,0.45,2.55,0.0,0.69,1.75,30.51,41.80%
yi-large,2.81,3.0,3.75,1.62,3.12,2.62,3.38,2.12,2.74,0.34,2.7,0.0,0.94,0.69,29.84,40.87%
Avg,2.96,2.27,3.05,1.21,3.41,3.57,3.74,1.61,2.36,0.88,2.49,0.4,0.84,0.43,29.24,40.05%
mistral-large-2402,5.0,3.44,2.06,0.69,2.12,2.38,3.81,0.38,2.21,0.15,2.77,0.06,0.19,0.31,25.57,35.03%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
deepseek-chat,5.0,4.94,4.88,2.12,4.88,0.5,1.62,1.44,5.78,0.86,1.5,2.5,1.06,2.48,39.55,54.18%
deepseek-coder,5.0,3.06,4.81,2.56,5.0,1.31,1.19,1.12,5.17,1.12,1.8,2.44,0.88,2.77,38.25,52.40%
gpt-4o-2024-05-13,5.0,5.0,2.88,1.94,3.38,1.5,0.94,2.12,6.0,1.12,1.39,2.44,1.88,2.48,38.05,52.12%
claude-3-5-sonnet-20240620,5.0,5.0,2.44,3.94,2.25,1.0,1.38,3.56,6.0,1.24,0.53,2.19,0.38,3.0,37.89,51.90%
doubao-pro-32k-240615,5.0,5.0,1.69,3.56,4.94,0.19,0.06,3.31,4.09,0.41,0.94,2.0,1.56,2.42,35.17,48.18%
gemini-1.5-pro-001,5.0,5.0,2.12,3.69,2.31,1.31,0.06,2.06,5.92,2.17,0.94,0.44,1.06,2.74,34.84,47.72%
qwen2-72b-instruct,5.0,4.94,1.31,3.0,2.62,0.5,2.56,1.69,5.7,1.09,1.24,1.56,0.44,2.6,34.25,46.92%
qwen-max,5.0,5.0,2.0,4.94,4.94,0.06,0.0,0.62,5.03,1.61,0.0,0.75,0.56,2.35,32.86,45.02%
Avg,4.89,3.94,2.13,2.48,3.29,0.73,1.11,1.63,4.92,1.2,0.74,1.55,0.91,2.06,31.48,43.12%
moonshot-v1-8k,4.56,2.56,1.12,3.38,4.12,0.31,2.19,1.31,5.81,0.79,0.64,1.06,0.75,1.12,29.74,40.74%


In [36]:
# Aggregate per-question scoring metrics using only records prompted in Chinese ('zh').
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        paper_res_dir = os.path.join(res_dir, model, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name without its extension is used as the question key.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            for record in parse_data.read_jsonl(fpath):
                if record['prompt_parameter']['lang'] != 'zh':
                    continue
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            if total_cnt == 0:
                # No 'zh' records for this question: skip it rather than raising
                # ZeroDivisionError when computing the averages.
                continue

            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score/total_cnt, 4),
                'accuracy': round(correct_cnt/total_cnt, 4),
            }

tables = generate_acc_table(ans_data, papers, 'ZH')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
deepseek-coder,4.88,2.25,2.5,2.88,4.62,4.75,5.0,4.25,5.85,2.7,3.67,1.75,2.0,0.0,47.1,64.52%
deepseek-chat,5.00,2.12,2.5,3.0,4.88,5.0,5.0,2.25,4.5,2.55,2.02,2.0,2.0,0.0,42.82,58.66%
qwen2-72b-instruct,4.88,4.12,2.5,1.12,4.75,4.88,4.75,1.75,4.35,1.65,3.23,0.12,1.88,1.12,41.1,56.30%
gemini-1.5-pro-001,2.00,2.0,4.88,1.25,5.0,4.25,4.75,3.0,4.65,1.65,1.5,0.62,1.62,0.12,37.3,51.10%
gpt-4o-2024-05-13,4.75,3.62,2.88,0.75,2.88,3.0,4.38,0.38,2.7,0.23,3.75,0.5,1.88,0.25,31.93,43.73%
claude-3-5-sonnet-20240620,2.50,4.62,3.62,1.62,3.75,4.5,4.25,1.0,1.65,0.45,0.68,0.88,1.25,0.38,31.15,42.67%
yi-large,3.12,3.0,3.75,1.62,3.38,3.12,4.0,2.5,2.17,0.07,2.33,0.0,1.12,0.88,31.07,42.57%
Avg,2.91,2.51,3.12,1.26,3.3,3.86,3.91,1.7,2.62,1.0,2.32,0.59,0.9,0.46,30.46,41.73%
baichuan4,2.50,2.62,3.38,1.88,1.75,3.38,4.38,2.38,2.33,0.53,2.77,0.0,0.62,1.62,30.12,41.27%
qwen-max,1.25,1.75,2.88,0.62,4.12,3.88,3.88,2.62,2.25,0.68,2.17,0.12,0.0,0.0,26.23,35.92%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
deepseek-coder,4.88,2.25,2.5,2.88,4.62,4.75,5.0,4.25,5.85,2.7,3.67,1.75,2.0,0.0,47.1,64.52%
deepseek-chat,5.0,2.12,2.5,3.0,4.88,5.0,5.0,2.25,4.5,2.55,2.02,2.0,2.0,0.0,42.82,58.66%
qwen2-72b-instruct,4.88,4.12,2.5,1.12,4.75,4.88,4.75,1.75,4.35,1.65,3.23,0.12,1.88,1.12,41.1,56.30%
gemini-1.5-pro-001,2.0,2.0,4.88,1.25,5.0,4.25,4.75,3.0,4.65,1.65,1.5,0.62,1.62,0.12,37.3,51.10%
gpt-4o-2024-05-13,4.75,3.62,2.88,0.75,2.88,3.0,4.38,0.38,2.7,0.23,3.75,0.5,1.88,0.25,31.93,43.73%
claude-3-5-sonnet-20240620,2.5,4.62,3.62,1.62,3.75,4.5,4.25,1.0,1.65,0.45,0.68,0.88,1.25,0.38,31.15,42.67%
yi-large,3.12,3.0,3.75,1.62,3.38,3.12,4.0,2.5,2.17,0.07,2.33,0.0,1.12,0.88,31.07,42.57%
Avg,2.91,2.51,3.12,1.26,3.3,3.86,3.91,1.7,2.62,1.0,2.32,0.59,0.9,0.46,30.46,41.73%
baichuan4,2.5,2.62,3.38,1.88,1.75,3.38,4.38,2.38,2.33,0.53,2.77,0.0,0.62,1.62,30.12,41.27%
qwen-max,1.25,1.75,2.88,0.62,4.12,3.88,3.88,2.62,2.25,0.68,2.17,0.12,0.0,0.0,26.23,35.92%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
deepseek-coder,5.0,2.12,4.75,4.5,5.0,2.12,1.0,0.75,6.0,0.9,2.55,2.38,1.5,2.62,41.2,56.44%
deepseek-chat,5.0,4.88,5.0,4.12,5.0,0.62,1.62,1.25,6.0,0.6,1.43,2.5,0.88,1.95,40.85,55.96%
gpt-4o-2024-05-13,5.0,5.0,2.62,3.0,3.75,1.75,0.75,1.62,6.0,1.27,1.57,2.38,1.88,2.77,39.38,53.94%
claude-3-5-sonnet-20240620,5.0,5.0,2.38,4.88,1.75,1.12,1.0,3.62,6.0,1.27,1.05,2.0,0.75,3.0,38.82,53.18%
gemini-1.5-pro-001,5.0,5.0,2.25,4.62,2.25,0.88,0.0,2.62,6.0,2.4,0.6,0.0,0.5,3.02,35.15,48.15%
doubao-pro-32k-240615,5.0,5.0,1.88,4.0,5.0,0.38,0.0,2.75,3.75,0.68,1.05,2.0,1.12,1.77,34.38,47.09%
qwen2-72b-instruct,5.0,4.88,1.38,3.75,3.12,0.62,2.38,2.12,5.55,1.12,0.23,1.5,0.12,2.02,33.8,46.30%
qwen-max,5.0,5.0,1.25,5.0,5.0,0.0,0.0,0.75,4.95,1.65,0.0,0.62,0.75,2.6,32.57,44.62%
Avg,4.79,3.88,2.06,3.32,3.28,0.84,1.05,1.73,4.85,1.21,0.76,1.32,0.84,1.9,31.74,43.48%
glm-4-0520,5.0,5.0,1.75,3.75,3.62,0.62,0.5,0.25,4.42,1.05,0.15,1.25,1.62,1.43,30.43,41.68%


In [37]:
# Aggregate per-question scoring metrics using only records prompted in English ('en').
ans_data = {}

for paper in papers:
    ans_data[paper] = {}

    for model in models:

        ans_data[paper][model] = {}
        paper_res_dir = os.path.join(res_dir, model, paper)
        fpaths = parse_data.find_files_with_suffix(paper_res_dir, '.jsonl')

        for fpath in fpaths:

            # The file name without its extension is used as the question key.
            ques_idx = os.path.basename(fpath).split('.')[0]
            total_cnt, correct_cnt, total_score = 0, 0, 0

            for record in parse_data.read_jsonl(fpath):
                if record['prompt_parameter']['lang'] != 'en':
                    continue
                total_cnt += 1
                if record['answer']['correct']:
                    correct_cnt += 1
                total_score += record['answer']['score']

            if total_cnt == 0:
                # No 'en' records for this question: skip it rather than raising
                # ZeroDivisionError when computing the averages.
                continue

            ans_data[paper][model][ques_idx] = {
                'total_cnt': total_cnt,
                'correct_cnt': correct_cnt,
                'total_score': total_score,
                'avg_score': round(total_score/total_cnt, 4),
                'accuracy': round(correct_cnt/total_cnt, 4),
            }

tables = generate_acc_table(ans_data, papers, 'EN')
display_multi_table(tables)

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
qwen2-72b-instruct,5.00,1.62,3.38,1.25,5.0,4.62,3.88,3.88,4.2,0.6,4.65,0.38,0.0,0.88,39.33,53.87%
deepseek-chat,5.00,0.62,2.5,2.25,5.0,3.62,5.0,1.62,2.7,0.9,3.67,0.25,1.5,0.25,34.9,47.81%
deepseek-coder,5.00,1.0,2.5,2.38,4.5,2.88,5.0,2.12,1.95,0.23,3.52,1.12,2.5,0.12,34.83,47.71%
gemini-1.5-pro-001,1.88,2.25,4.62,1.38,5.0,3.5,4.25,3.25,3.0,1.65,2.02,0.12,1.62,0.12,34.67,47.50%
gpt-4o-2024-05-13,5.00,2.75,2.75,1.25,3.62,2.75,4.88,0.0,2.4,0.0,3.98,1.0,1.75,0.5,32.62,44.69%
baichuan4,3.25,2.62,3.62,1.5,4.25,3.62,2.38,1.62,2.7,0.38,2.33,0.0,0.75,1.88,30.9,42.33%
claude-3-5-sonnet-20240620,1.88,4.75,4.5,1.25,3.75,5.0,1.38,0.88,3.45,1.43,0.97,0.12,1.38,0.0,30.73,42.09%
yi-large,2.50,3.0,3.75,1.62,2.88,2.12,2.75,1.75,3.3,0.6,3.08,0.0,0.75,0.5,28.6,39.18%
Avg,3.02,2.04,2.99,1.16,3.51,3.28,3.58,1.52,2.1,0.75,2.67,0.21,0.78,0.39,28.01,38.37%
ERNIE-4.0-Turbo-8K,2.00,1.75,2.88,1.12,2.88,4.5,4.12,1.12,1.5,0.68,3.23,0.12,0.75,0.0,26.65,36.51%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
qwen2-72b-instruct,5.0,1.62,3.38,1.25,5.0,4.62,3.88,3.88,4.2,0.6,4.65,0.38,0.0,0.88,39.33,53.87%
deepseek-chat,5.0,0.62,2.5,2.25,5.0,3.62,5.0,1.62,2.7,0.9,3.67,0.25,1.5,0.25,34.9,47.81%
deepseek-coder,5.0,1.0,2.5,2.38,4.5,2.88,5.0,2.12,1.95,0.23,3.52,1.12,2.5,0.12,34.83,47.71%
gemini-1.5-pro-001,1.88,2.25,4.62,1.38,5.0,3.5,4.25,3.25,3.0,1.65,2.02,0.12,1.62,0.12,34.67,47.50%
gpt-4o-2024-05-13,5.0,2.75,2.75,1.25,3.62,2.75,4.88,0.0,2.4,0.0,3.98,1.0,1.75,0.5,32.62,44.69%
baichuan4,3.25,2.62,3.62,1.5,4.25,3.62,2.38,1.62,2.7,0.38,2.33,0.0,0.75,1.88,30.9,42.33%
claude-3-5-sonnet-20240620,1.88,4.75,4.5,1.25,3.75,5.0,1.38,0.88,3.45,1.43,0.97,0.12,1.38,0.0,30.73,42.09%
yi-large,2.5,3.0,3.75,1.62,2.88,2.12,2.75,1.75,3.3,0.6,3.08,0.0,0.75,0.5,28.6,39.18%
Avg,3.02,2.04,2.99,1.16,3.51,3.28,3.58,1.52,2.1,0.75,2.67,0.21,0.78,0.39,28.01,38.37%
ERNIE-4.0-Turbo-8K,2.0,1.75,2.88,1.12,2.88,4.5,4.12,1.12,1.5,0.68,3.23,0.12,0.75,0.0,26.65,36.51%

Question Index,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Total,Rate
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
deepseek-chat,5.0,5.0,4.75,0.12,4.75,0.38,1.62,1.62,5.55,1.12,1.57,2.5,1.25,3.0,38.25,52.40%
claude-3-5-sonnet-20240620,5.0,5.0,2.5,3.0,2.75,0.88,1.75,3.5,6.0,1.2,0.0,2.38,0.0,3.0,36.95,50.62%
gpt-4o-2024-05-13,5.0,5.0,3.12,0.88,3.0,1.25,1.12,2.62,6.0,0.97,1.2,2.5,1.88,2.17,36.72,50.31%
doubao-pro-32k-240615,5.0,5.0,1.5,3.12,4.88,0.0,0.12,3.88,4.42,0.15,0.82,2.0,2.0,3.08,35.98,49.28%
deepseek-coder,5.0,4.0,4.88,0.62,5.0,0.5,1.38,1.5,4.35,1.35,1.05,2.5,0.25,2.92,35.3,48.36%
qwen2-72b-instruct,5.0,5.0,1.25,2.25,2.12,0.38,2.75,1.25,5.85,1.05,2.25,1.62,0.75,3.17,34.7,47.53%
gemini-1.5-pro-001,5.0,5.0,2.0,2.75,2.38,1.75,0.12,1.5,5.85,1.95,1.27,0.88,1.62,2.45,34.53,47.29%
qwen-max,5.0,5.0,2.75,4.88,4.88,0.12,0.0,0.5,5.1,1.57,0.0,0.88,0.38,2.1,33.15,45.41%
ERNIE-4.0-Turbo-8K,4.88,4.5,1.88,0.0,3.88,1.0,1.0,2.62,4.95,1.27,0.68,2.5,1.5,1.43,32.07,43.94%
Avg,4.99,4.01,2.2,1.64,3.3,0.63,1.16,1.53,4.98,1.18,0.73,1.79,0.97,2.21,31.21,42.75%


# Repetition

In [57]:
# Per model, count the records whose answer carries a 'repetition' field
# ("bad cases") and how many of those have a truthy 'repetition' value.
res = dict()

for model in models:
    model_res_dir = os.path.join(res_dir, model)
    fpaths = parse_data.find_files_with_suffix(model_res_dir, '.jsonl')
    res[model] = Counter()

    for fpath in fpaths:
        content = parse_data.read_jsonl(fpath)

        for i in content:
            if 'repetition' in i['answer']:
                res[model]['bad_case'] += 1
                if i['answer']['repetition']:
                    res[model]['repetition'] += 1

# Convert the counters to a DataFrame; explicit columns keep the frame usable
# even when there are no models at all.
data = [{'Model': model, 'Bad Cases': counts['bad_case'], 'Repetitions': counts['repetition']} for model, counts in res.items()]
df = pd.DataFrame(data, columns=['Model', 'Bad Cases', 'Repetitions'])

# Numeric repetition rate; 0/0 (no bad cases) yields NaN, which is reported as 0.
df['Repetition Rate'] = (df['Repetitions'] / df['Bad Cases']).fillna(0)

# Sort while the rate is still numeric: sorting the formatted strings would be
# lexicographic (e.g. "100.00%" before "12.50%"). A stable sort keeps ties in
# their original model order.
df = df.sort_values(by='Repetition Rate', ascending=True, kind='stable')

# Only now format the rate as a percentage with two decimal places.
df['Repetition Rate'] = df['Repetition Rate'].map("{:.2%}".format)

# Rearrange columns and reset index
df = df[['Model', 'Repetitions', 'Bad Cases', 'Repetition Rate']].reset_index(drop=True)

df

Unnamed: 0,Model,Repetitions,Bad Cases,Repetition Rate
0,claude-3-5-sonnet-20240620,0,0,0.00%
1,doubao-pro-32k-240615,0,2,0.00%
2,gemini-1.5-pro-001,1,8,12.50%
3,mistral-large-2402,7,34,20.59%
4,ERNIE-4.0-Turbo-8K,124,597,20.77%
5,deepseek-chat,12,48,25.00%
6,deepseek-coder,14,55,25.45%
7,baichuan4,126,393,32.06%
8,qwen2-72b-instruct,2,6,33.33%
9,gpt-4o-2024-05-13,17,45,37.78%
