In [103]:
import numpy as np
import scipy.optimize as opt
from collections import Counter, defaultdict
import pickle
import pandas as pd


import plotly.graph_objects as go
import plotly.express as px
import os
import base64
import plotly.io as pio

In [104]:
# Every LLM that appears in the evaluation results.
models = [
    "chatglm3-6b-chat", "qwen2.5-7b-instruct", "baichuan2-7b-chat",
    "hunyuan", "mistral-7b-instruct-v0.3", "deepseek-v2-lite-chat",
    "gpt35", "gpt4o", "llama",
]

# The first six entries of `models` (gpt35, gpt4o and llama are left out).
# NOTE(review): presumably the models used for answer aggregation - confirm.
aggregate_models = [
    "chatglm3-6b-chat", "qwen2.5-7b-instruct", "baichuan2-7b-chat",
    "hunyuan", "mistral-7b-instruct-v0.3", "deepseek-v2-lite-chat",
]

# Chinese domain name (as stored in the benchmark JSON) -> English display name.
domain_mapping = {
    "安全生产": "Production Safety",
    "石油天然气": "Oil and Gas",
    "消防": "Fire Safety",
    "建筑工程": "Civil Engineering",
    "经济金融": "Economics and Finance",
    "银行保险": "Banking and Insurance",
}

# Chinese question-type name -> English display name.
type_mapping = {
    "多选题": "Multiple Choice",
    "单选题": "Single Choice",
    "判断题": "True/False",
}

# Order in which subdomains are laid out in tables and on chart axes.
subdomain_order = [
    "Production Safety", "Oil and Gas", "Fire Safety",
    "Civil Engineering", "Economics and Finance", "Banking and Insurance",
]

# Row order used when displaying the per-model score tables.
LLM_order = [
    "chatglm3-6b-chat", "baichuan2-7b-chat", "qwen2.5-7b-instruct",
    "hunyuan", "deepseek-v2-lite-chat", "mistral-7b-instruct-v0.3",
    "llama", "gpt35", "gpt4o",
]

In [None]:
import json

def load_data_list_from_json(file_name):
    """Parse the UTF-8 JSON file at ``file_name`` and return its contents.

    Args:
        file_name (str): Path of the JSON file to read.

    Returns:
        The Python object decoded from the file (whatever the JSON document holds).
    """
    with open(file_name, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    # Confirmation message so the notebook output records which file was used.
    print(f"data_list loaded from {file_name}")
    return parsed

# Benchmark questions; the path is relative to the notebook's working directory.
file_name = "../../data/QualBench.json"
data_list = load_data_list_from_json(file_name=file_name)

data_list loaded from ../data/QualBench.json


## Dataset Statistics

In [106]:
# Distinct domain names present in the benchmark.
unique_domain = {record["domain"] for record in data_list}
print("Unique Domains:", unique_domain)

# Question tally per domain, and the overall total derived from it.
domain_counts = Counter(record["domain"] for record in data_list)
print("Number of questions under each domain:", domain_counts)

total_questions = sum(domain_counts.values())
print("Total number of questions:", total_questions)

# Question tally per question type.
type_counts = Counter(record["question_type"] for record in data_list)
print("Number of questions under each type:", type_counts)

# Parallel label/value lists for the pie charts.
domain_labels = [*domain_counts.keys()]
domain_values = [*domain_counts.values()]

type_labels = [*type_counts.keys()]
type_values = [*type_counts.values()]

Unique Domains: {'银行保险', '建筑工程', '经济金融', '消防', '石油天然气', '安全生产'}
Number of questions under each domain: Counter({'安全生产': 6520, '消防': 3401, '经济金融': 2377, '建筑工程': 1978, '石油天然气': 1604, '银行保险': 1436})
Total number of questions: 17316
Number of questions under each type: Counter({'单选题': 9538, '判断题': 4068, '多选题': 3710})


In [107]:
# {English subdomain: {English question type: number of questions}}
questions_count_by_type_and_subdomain = {}

# Single pass over the benchmark; keys are created the first time they are seen,
# so the printed order follows the order of appearance in the data.
for entry in data_list:
    subdomain = domain_mapping[entry["domain"]]
    q_type = type_mapping[entry["question_type"]]
    per_type = questions_count_by_type_and_subdomain.setdefault(subdomain, {})
    per_type[q_type] = per_type.get(q_type, 0) + 1

# Report one line per subdomain.
print("Number of questions under each type for each subdomain:")
for subdomain, counts in questions_count_by_type_and_subdomain.items():
    print(f"{subdomain}: {counts}")

Number of questions under each type for each subdomain:
Production Safety: {'Single Choice': 4187, 'Multiple Choice': 844, 'True/False': 1489}
Oil and Gas: {'Single Choice': 733, 'Multiple Choice': 333, 'True/False': 538}
Fire Safety: {'Single Choice': 1444, 'Multiple Choice': 827, 'True/False': 1130}
Civil Engineering: {'Single Choice': 1420, 'Multiple Choice': 558}
Economics and Finance: {'Single Choice': 1052, 'Multiple Choice': 717, 'True/False': 608}
Banking and Insurance: {'Single Choice': 702, 'Multiple Choice': 431, 'True/False': 303}


In [108]:
# Translate the Chinese domain names (the Counter keys) to their English labels.
english_labels = [domain_mapping[name] for name in domain_counts]

# Donut chart of question counts per domain.
domain_pie = go.Pie(
    labels=english_labels,
    values=list(domain_counts.values()),
    hole=0.3,
    textinfo='value+percent',  # slices show count and percentage; names go in the legend
    insidetextfont={"color": 'white'},  # white text inside the slices
)
fig = go.Figure(data=[domain_pie])

# Compact layout with a "Domain" annotation (no arrow) and the legend enabled.
fig.update_layout(
    showlegend=True,
    annotations=[{"text": "Domain", "font_size": 15, "showarrow": False}],
    margin={"l": 20, "r": 40, "t": 20, "b": 20},
    width=500,
    height=300,
)
fig.show()
# pio.write_image(fig, './img/stats_by_domain.jpg',scale=6)

In [109]:
# Translate the Chinese question-type names to English via type_mapping.
english_type_labels = [type_mapping[name] for name in type_counts]

# Donut chart of question counts per question type.
type_pie = go.Pie(
    labels=english_type_labels,
    values=list(type_counts.values()),
    hole=0.3,
    textinfo='value+percent',  # slices show count and percentage
    insidetextfont={"color": 'white'},
)
fig = go.Figure(data=[type_pie])

# Compact layout with a "Type" annotation (no arrow) and the legend enabled.
fig.update_layout(
    showlegend=True,
    annotations=[{"text": "Type", "font_size": 15, "showarrow": False}],
    margin={"l": 20, "r": 40, "t": 20, "b": 20},
    width=450,
    height=300,
)
fig.show()
# pio.write_image(fig, './img/stats_by_type.jpg',scale=6)

## Dataset Evaluation with LLMs

In [None]:
def load_data_list(file_name):
    """Unpickle and return the object stored in ``file_name``.

    Args:
        file_name (str): Path of the pickle file to read.

    Returns:
        The object that was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    with open(file_name, mode='rb') as handle:
        loaded = pickle.load(handle)
    print(f"data_list loaded from {file_name}")
    return loaded

# Load the stored LLM evaluation results.
# NOTE: this rebinds `data_list` - earlier it held the benchmark entries iterated
# directly; from here on it is a mapping whose values are the evaluated entries.
file_name = "../llm_evaluation.pkl"
data_list = load_data_list(file_name=file_name)

# Domains present in the evaluation results.
unique_domain = {record["domain"] for record in data_list.values()}
print("Unique Domains:", unique_domain)

data_list loaded from ./llm_evaluation.pkl
Unique Domains: {'Civil Engineering', 'Fire Safety', 'Production Safety', 'Banking and Insurance', 'Oil and Gas', 'Economics and Finance'}


In [111]:
from sklearn.metrics import precision_score, recall_score

def custom_accuracy_score(y_true, y_pred):
    """
    Order-insensitive exact-match accuracy.

    A prediction counts as correct when it consists of exactly the same characters
    as the true label, regardless of their order (e.g. "BA" matches "AB").

    Args:
        y_true (list): List of true labels.
        y_pred (list): List of predicted labels; must have the same length as ``y_true``.

    Returns:
        float: Fraction of predictions that match, or 0.0 when there are no labels.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different lengths.
    """
    # zip() would silently truncate to the shorter input while the denominator
    # stays len(y_true), which yields a misleading score - fail loudly instead.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    # No samples: report 0.0 rather than dividing by zero (mirrors the
    # zero_division=0 convention used for the F1 score in this notebook).
    if len(y_true) == 0:
        return 0.0

    correct = sum(sorted(true) == sorted(pred) for true, pred in zip(y_true, y_pred))
    return correct / len(y_true)


def custom_f1_score(y_true, y_pred):
    """
    Order-insensitive F1 score over answer strings.

    Each label is canonicalised by sorting its characters, so "BA" and "AB" are
    treated as the same class. Precision and recall are then computed with
    weighted averaging (``zero_division=0``) and combined into their harmonic mean.
    Note that this is the harmonic mean of the two weighted averages, not a
    per-class F1 averaged across classes.

    Args:
        y_true (list): List of true labels.
        y_pred (list): List of predicted labels.

    Returns:
        float: The harmonic mean of weighted precision and weighted recall, or 0
        when both are zero.
    """
    def _canonical(label):
        # Sorting the characters makes the comparison independent of option order.
        return ''.join(sorted(label))

    truth = [_canonical(label) for label in y_true]
    guesses = [_canonical(label) for label in y_pred]

    weighted = dict(average='weighted', zero_division=0)
    precision = precision_score(truth, guesses, **weighted)
    recall = recall_score(truth, guesses, **weighted)

    # Guard the harmonic mean against a zero denominator.
    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0

In [112]:
def _group_answer_pairs(data_list, field):
    """Group (ground truth, model answer) pairs by model name and by ``entry[field]``."""
    grouped = defaultdict(lambda: defaultdict(list))
    for entry in data_list.values():
        bucket = entry[field]
        ground_truth = entry["answer"]
        # entry["results"] is iterated as a sequence of {model: answer} mappings.
        for model_result in entry["results"]:
            for model, answer in model_result.items():
                grouped[model][bucket].append((ground_truth, answer))
    return grouped


def _score_by_subdomain(data_list, metric):
    """Apply ``metric(true_labels, predicted_labels)`` per model and per subdomain."""
    scores = defaultdict(dict)
    for model, subdomains in _group_answer_pairs(data_list, "domain").items():
        for subdomain, entries in subdomains.items():
            true_labels = [ground_truth for ground_truth, _ in entries]
            predicted_labels = [answer for _, answer in entries]
            scores[model][subdomain] = metric(true_labels, predicted_labels)
    return scores


def compute_f1_scores_by_subdomain(data_list, models):
    """
    Compute F1 scores for each LLM under each subdomain.

    Args:
        data_list (dict): Mapping whose values are entries with "domain", "answer"
            and "results" keys; "results" holds {model name: answer} mappings.
        models (list): A list of model names. Accepted for interface compatibility;
            it is not used - every model found in the results is scored.

    Returns:
        defaultdict: A nested dictionary where the outer keys are model names, the inner keys are subdomains,
                     and the values are the F1 scores (``custom_f1_score``) for that subdomain.
    """
    return _score_by_subdomain(data_list, custom_f1_score)


def compute_acc_scores_by_subdomain(data_list, models):
    """
    Compute accuracy scores for each LLM under each subdomain.

    Args:
        data_list (dict): Mapping whose values are entries with "domain", "answer"
            and "results" keys; "results" holds {model name: answer} mappings.
        models (list): A list of model names. Accepted for interface compatibility;
            it is not used - every model found in the results is scored.

    Returns:
        defaultdict: A nested dictionary where the outer keys are model names, the inner keys are subdomains,
                     and the values are the accuracy scores (``custom_accuracy_score``) for that subdomain.
    """
    return _score_by_subdomain(data_list, custom_accuracy_score)

# Per-model, per-subdomain F1 and accuracy computed from the evaluation results.
f1_scores_by_subdomain = compute_f1_scores_by_subdomain(data_list=data_list, models=models)
llm_accuracy_by_subdomain = compute_acc_scores_by_subdomain(data_list=data_list, models=models)

In [113]:
def save_pivot_table_to_excel(pivot_table, output_file, sheet_name="Pivot Table"):
    """
    Save a pivot table to an Excel file with formatting.

    The frame is first written with ``DataFrame.to_excel`` (no pandas-generated
    header, starting at sheet row 1); header cells and data cells are then written
    again by hand through the xlsxwriter worksheet so they carry explicit formats.
    The order of these writes matters because several of them target the same cells.

    Args:
        pivot_table (pd.DataFrame): The pivot table to save. Each column label is
            indexed as ``value[0]`` / ``value[1]``, so two-level column labels are
            expected. NOTE(review): confirm callers always pass a 2-level column index.
        output_file (str): The path to the output Excel file.
        sheet_name (str): The name of the sheet in the Excel file.

    Returns:
        None. A confirmation message is printed once the writer has been closed.
    """
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        # Initial dump without header; this also creates the sheet that is
        # looked up from writer.sheets below.
        pivot_table.to_excel(writer, sheet_name=sheet_name, startrow=1, header=False)

        # Get the workbook and worksheet objects
        workbook = writer.book
        worksheet = writer.sheets[sheet_name]

        # Write the column headers with formatting:
        # first label element on sheet row 0, second on sheet row 1.
        # Column 0 is reserved for the index, hence the +1 offset.
        header_format = workbook.add_format({'bold': True, 'text_wrap': True, 'valign': 'top', 'align': 'center', 'border': 1})
        for col_num, value in enumerate(pivot_table.columns.values):
            worksheet.write(0, col_num + 1, value[0], header_format)
            worksheet.write(1, col_num + 1, value[1], header_format)

        # Write the index header (the index name goes next to the second header row)
        worksheet.write(1, 0, pivot_table.index.name, header_format)

        # Apply formatting to the data: every row is written again starting at
        # sheet row 2, below the two header rows.
        # NOTE(review): presumably this rewrite is what keeps the first data row,
        # since the initial to_excel call also started at sheet row 1 - verify.
        data_format = workbook.add_format({'border': 1, 'align': 'center'})
        for row_num, row_data in enumerate(pivot_table.itertuples(), start=2):
            # Index label in column 0, then the row's values from column 1 onwards.
            worksheet.write(row_num, 0, row_data.Index, data_format)
            for col_num, cell_value in enumerate(row_data[1:], start=1):
                worksheet.write(row_num, col_num, cell_value, data_format)

    print(f"Table saved to {output_file}")

In [114]:
# {model: {subdomain: {"Accuracy": ..., "F1": ...}}}; a missing score defaults to 0.
llm_scores = {
    model: {
        subdomain: {
            "Accuracy": llm_accuracy_by_subdomain[model].get(subdomain, 0),
            "F1": f1_scores_by_subdomain[model].get(subdomain, 0),
        }
        for subdomain in unique_domain
    }
    for model in models
}

# Flatten into one record per (model, subdomain) pair.
rows = [
    {
        "LLM": model,
        "Subdomain": subdomain,
        "Accuracy": scores["Accuracy"],
        "F1": scores["F1"],
    }
    for model, subdomains in llm_scores.items()
    for subdomain, scores in subdomains.items()
]

llm_scores_df = pd.DataFrame(rows)

# Wide table: one row per LLM, columns keyed by (metric, subdomain).
llm_scores_pivot = llm_scores_df.pivot(index="LLM", columns="Subdomain", values=["Accuracy", "F1"])

# Make the subdomain the outer column level, sort the columns, then apply the
# fixed row order (LLM_order) and column order (subdomain_order).
llm_scores_pivot_reordered = (
    llm_scores_pivot
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1, level=0)
    .loc[LLM_order][subdomain_order]
)

# Show the final table.
print("Reordered LLM Scores by Subdomain with LLM Order:")
display(llm_scores_pivot_reordered)
# save_pivot_table_to_excel(llm_scores_pivot_reordered, "llm_evaluation_results_by_domains.xlsx", sheet_name="LLM Scores")

Reordered LLM Scores by Subdomain with LLM Order:


Subdomain,Production Safety,Production Safety,Oil and Gas,Oil and Gas,Fire Safety,Fire Safety,Civil Engineering,Civil Engineering,Economics and Finance,Economics and Finance,Banking and Insurance,Banking and Insurance
Unnamed: 0_level_1,Accuracy,F1,Accuracy,F1,Accuracy,F1,Accuracy,F1,Accuracy,F1,Accuracy,F1
LLM,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
chatglm3-6b-chat,0.414198,0.453959,0.488778,0.453555,0.535321,0.545194,0.394455,0.399058,0.387178,0.403103,0.437456,0.475614
baichuan2-7b-chat,0.496947,0.507134,0.608479,0.616843,0.521122,0.52945,0.404356,0.420607,0.421341,0.436052,0.484976,0.499332
qwen2.5-7b-instruct,0.769771,0.783861,0.718204,0.751284,0.785588,0.791047,0.630495,0.645852,0.7752,0.778094,0.825297,0.832035
hunyuan,0.501069,0.539022,0.506234,0.58567,0.557685,0.577885,0.435644,0.464771,0.504007,0.526758,0.55905,0.597663
deepseek-v2-lite-chat,0.487176,0.527375,0.579177,0.538696,0.586084,0.592796,0.456634,0.458361,0.499367,0.5004,0.590496,0.60578
mistral-7b-instruct-v0.3,0.432519,0.435359,0.514339,0.506857,0.480653,0.491307,0.368317,0.357247,0.382117,0.386618,0.415793,0.41633
llama,0.321985,0.323877,0.339152,0.332042,0.356763,0.361874,0.226535,0.234279,0.277098,0.271594,0.266247,0.273734
gpt35,0.589313,0.597572,0.693267,0.704318,0.533191,0.551275,0.474059,0.493919,0.542809,0.543395,0.625437,0.632283
gpt4o,0.634962,0.643725,0.737531,0.748442,0.582535,0.601575,0.532277,0.548374,0.582033,0.586108,0.66457,0.671618


In [115]:
# Icon file shown above each model's bars.
icon_files = {
    "chatglm3-6b-chat": "chatglm-color.png",
    "qwen2.5-7b-instruct": "qwen-color.png",
    "baichuan2-7b-chat": "baichuan-color.png",
    "hunyuan": "hunyuan-color.png",
    "mistral-7b-instruct-v0.3": "mistral-color.png",
    "deepseek-v2-lite-chat": "deepseek-color.png",
    "gpt35": "gpt-35.webp",
    "gpt4o": "gpt-4.webp",
    "llama": "meta-color.png"
}

# Directory containing the icons
icon_dir = "../img/icons"

# MIME type declared in the data URI, chosen from the icon's file extension.
# The icon set mixes PNG and WebP files, so a single hard-coded "image/png"
# would mislabel the WebP icons.
icon_mime_types = {
    ".png": "image/png",
    ".webp": "image/webp",
}

# Encode images as base64 data URIs for embedding in Plotly
encoded_images = {}
for model in models:
    img_path = os.path.join(icon_dir, icon_files[model])
    extension = os.path.splitext(img_path)[1].lower()
    # Unknown extensions fall back to the previous behaviour (image/png).
    mime_type = icon_mime_types.get(extension, "image/png")
    with open(img_path, "rb") as img_file:
        encoded_payload = base64.b64encode(img_file.read()).decode()
    encoded_images[model] = f"data:{mime_type};base64,{encoded_payload}"

# Per-model accuracy list aligned with subdomain_order (0 where a score is missing).
accuracy_by_domain = {}
for model in models:
    accuracy_by_domain[model] = [
        llm_accuracy_by_subdomain[model].get(domain, 0) for domain in subdomain_order
    ]

# Unweighted mean over the domains, per model.
average_accuracy = {
    model: np.mean(domain_accuracies)
    for model, domain_accuracies in accuracy_by_domain.items()
}

# Best-performing model first.
sorted_models = sorted(average_accuracy, key=lambda name: average_accuracy[name], reverse=True)

# Same accuracy lists, keyed in ranking order.
sorted_accuracy_by_domain = {name: accuracy_by_domain[name] for name in sorted_models}

# Numeric x positions, one per domain, so icons can be offset within a group.
domain_indices = list(range(len(subdomain_order)))

# Qualitative Plotly palette: 10 distinct colors, sufficient for 9 models.
colors = px.colors.qualitative.Plotly

# Grouped bar chart: one trace per model, one bar group per domain.
fig = go.Figure()

N = len(sorted_models)  # Number of models, i.e. bars per group
for i, model in enumerate(sorted_models):
    model_accuracies = sorted_accuracy_by_domain[model]

    # Bars for this model across all domains, colored by rank position.
    fig.add_trace(go.Bar(
        x=domain_indices,
        y=model_accuracies,
        name=model,
        marker_color=colors[i],
    ))

    # Icon above each of this model's bars.
    for j, accuracy in enumerate(model_accuracies):
        # Horizontal center of bar i inside group j: a group spans 0.85 x-units
        # (matching the bargap of 0.15 set in the layout), split into N slots.
        x_pos = domain_indices[j] + (i - (N-1)/2) * (0.85 / N)
        # Small gap between the top of the bar and the bottom of the icon.
        y_pos = accuracy + 0.02
        fig.add_layout_image(
            {
                "source": encoded_images[model],
                "x": x_pos,
                "y": y_pos,
                "xref": "x",
                "yref": "y",
                "sizex": 0.05,  # kept small so neighbouring icons do not overlap
                "sizey": 0.05,
                "xanchor": "center",
                "yanchor": "bottom",
                "layer": "above",
            }
        )


# Highest accuracy over all models and domains. The axis limit below is a fixed
# value and does not use it.
max_accuracy = max(max(accuracies) for accuracies in sorted_accuracy_by_domain.values())
max_y = 0.88

# Domain names as tick labels at the numeric group positions.
xaxis_settings = dict(
    tickmode='array',
    tickvals=domain_indices,
    ticktext=subdomain_order,
    title_font=dict(size=14),
    tickfont=dict(size=12),
)

# The y-axis runs from 0.1 to max_y with a tick every 0.1.
yaxis_settings = dict(
    title_font=dict(size=14),
    tickfont=dict(size=12),
    range=[0.1, max_y],
    tickvals=np.arange(0.1, max_y + 0.01, 0.1).tolist(),
)

# Compact, legend-free layout (the icons identify the models).
fig.update_layout(
    title=None,
    yaxis_title="Accuracy",
    barmode="group",
    showlegend=False,
    xaxis=xaxis_settings,
    yaxis=yaxis_settings,
    margin=dict(l=10, r=0, t=10, b=5),  # Reduced margins
    bargap=0.15,  # Gap between neighbouring domain groups
    bargroupgap=0.05,  # Gap between bars inside one group
    width=1000,
    height=340,
    template="plotly_white",
)
# Show the plot
fig.show()
# pio.write_image(fig, '../img/fig_evaluation_result.pdf',scale=6)

In [116]:
# Column order for the per-question-type score table.
type_order = ["Single Choice", "Multiple Choice", "True/False"]

def compute_scores_by_q_type(data_list, models):
    """
    Score every LLM separately for each question type.

    Args:
        data_list (dict): Mapping whose values are entries with "q_type", "answer"
            and "results" keys; "results" holds {model name: answer} mappings.
        models (list): A list of model names (not consulted by this function;
            every model found in the results is scored).

    Returns:
        defaultdict: Outer keys are model names, inner keys are question types, and
                     each value is ``{"Accuracy": ..., "F1": ...}`` for that pair.
    """
    # (ground truth, answer) pairs bucketed by model, then by question type.
    pairs_by_model = defaultdict(lambda: defaultdict(list))
    for record in data_list.values():
        question_type, expected = record["q_type"], record["answer"]
        for result in record["results"]:
            for model_name, predicted in result.items():
                pairs_by_model[model_name][question_type].append((expected, predicted))

    # Score each bucket with both metrics.
    scores_by_q_type = defaultdict(dict)
    for model_name, per_type in pairs_by_model.items():
        for question_type, pairs in per_type.items():
            expected_labels = [expected for expected, _ in pairs]
            predicted_labels = [predicted for _, predicted in pairs]
            bucket_accuracy = custom_accuracy_score(expected_labels, predicted_labels)
            bucket_f1 = custom_f1_score(expected_labels, predicted_labels)
            scores_by_q_type[model_name][question_type] = {
                "Accuracy": bucket_accuracy,
                "F1": bucket_f1,
            }

    return scores_by_q_type

# Accuracy and F1 for every (model, question type) pair.
scores_by_q_type = compute_scores_by_q_type(data_list, models)

# Flatten into one record per (model, question type) pair.
rows = [
    {
        "LLM": model,
        "Question Type": q_type,
        "Accuracy": scores["Accuracy"],
        "F1": scores["F1"],
    }
    for model, q_types in scores_by_q_type.items()
    for q_type, scores in q_types.items()
]

llm_scores_q_type_df = pd.DataFrame(rows)

# Wide table: one row per LLM, columns keyed by (metric, question type).
llm_scores_q_type_pivot = llm_scores_q_type_df.pivot(index="LLM", columns="Question Type", values=["Accuracy", "F1"])

# Make the question type the outer column level and sort the columns.
llm_scores_q_type_pivot_reordered = (
    llm_scores_q_type_pivot
    .reorder_levels([1, 0], axis=1)
    .sort_index(axis=1, level=0)
)

# Apply the fixed row order (LLM_order) and column order (type_order).
llm_scores_q_type_pivot_reordered = llm_scores_q_type_pivot_reordered.loc[LLM_order][type_order]


# Display the table
print("LLM Scores by Question Type (Accuracy and F1):")
display(llm_scores_q_type_pivot_reordered)
# save_pivot_table_to_excel(llm_scores_q_type_pivot_reordered, "llm_evaluation_results_by_types.xlsx", sheet_name="LLM Scores")

LLM Scores by Question Type (Accuracy and F1):


Question Type,Single Choice,Single Choice,Multiple Choice,Multiple Choice,True/False,True/False
Unnamed: 0_level_1,Accuracy,F1,Accuracy,F1,Accuracy,F1
LLM,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
chatglm3-6b-chat,0.477333,0.479964,0.167649,0.211163,0.552954,0.555249
baichuan2-7b-chat,0.506105,0.507216,0.145256,0.198574,0.70301,0.703588
qwen2.5-7b-instruct,0.808897,0.809444,0.611668,0.626222,0.740914,0.747048
hunyuan,0.564285,0.581659,0.253388,0.317885,0.576366,0.599361
deepseek-v2-lite-chat,0.556747,0.558855,0.336181,0.349995,0.572575,0.579811
mistral-7b-instruct-v0.3,0.437308,0.439227,0.151149,0.184489,0.626756,0.630962
llama,0.280285,0.285233,0.088097,0.10701,0.519287,0.51881
gpt35,0.602293,0.606248,0.333824,0.359116,0.679376,0.703412
gpt4o,0.651343,0.657528,0.35386,0.403915,0.740691,0.751647
