## 两个模型得分比较

### Prepare

In [39]:
# Path of the JSONL battle-review file; passed to read_jsonl in the cells below.
review_file = '../llmuses/registry/data/qa_browser/battle.jsonl'
# Path of the YAML category-mapping file; passed to get_category_map below.
category_map_file = '../llmuses/registry/data/qa_browser/category_mapping.yaml'

In [40]:
import pandas as pd
import json
import yaml

def read_yaml(yaml_file) -> dict:
    """
    Read a YAML file and return its parsed content.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` (expected to be a dict).

    Raises:
        yaml.YAMLError: If parsing fails. The error is printed before being
            re-raised.
    """
    # Decode explicitly as UTF-8 so non-ASCII content is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(yaml_file, "r", encoding="utf-8") as f:
        try:
            stream = yaml.safe_load(f)
        except yaml.YAMLError as e:
            print(e)
            # Bare raise keeps the original traceback intact.
            raise

    return stream

def read_jsonl(input_file):
    """
    Read a JSON Lines file into a list of parsed records.

    Args:
        input_file: Path of the file to read; one JSON document per line.

    Returns:
        list: The decoded object of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
    # locale's default encoding.
    with open(input_file, 'r', encoding='utf-8') as input_jsonl:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on them.
        return [json.loads(line) for line in input_jsonl if line.strip()]

def cat_view(df, category, model_a, model_b):
    """
    Sum both models' scores over the rows of one category.

    Rows whose 'model_a' column equals ``model_a`` contribute their score
    pair as stored; every other row contributes the pair with its two
    entries swapped, so pair index 0 always feeds the ``model_a`` total.

    Args:
        df: DataFrame with 'category', 'model_a' and 'scores' columns, where
            each 'scores' cell is indexable at positions 0 and 1.
        category: Category value to select.
        model_a: Model name whose score is taken from pair index 0.
        model_b: Not used by this function.

    Returns:
        dict with keys 'category', 'count' (number of selected rows),
        'model_a' and 'model_b' (score totals).
    """
    in_category = df[df['category'] == category]
    same_order = in_category[in_category['model_a'] == model_a]['scores']
    flipped_order = in_category[in_category['model_a'] != model_a]['scores']

    # Normalise every pair to [model_a score, model_b score]. Same-order rows
    # come first, then the flipped ones, so the totals are accumulated in the
    # same sequence as a two-pass loop would.
    ordered_pairs = [[pair[0], pair[1]] for pair in same_order]
    ordered_pairs += [[pair[1], pair[0]] for pair in flipped_order]

    total_a = 0
    total_b = 0
    for first, second in ordered_pairs:
        total_a += first
        total_b += second

    return {
        'category': category,
        'count': len(in_category),
        'model_a': total_a,
        'model_b': total_b,
    }

def get_color(value):
    """
    Map a numeric value to a CSS ``background-color`` declaration.

    Values above 0.2 / 0.1 / 0.05 get progressively lighter greens; values
    below -0.2 / -0.1 / -0.05 get progressively lighter reds. Anything in
    between (bounds included) gets no styling.

    Args:
        value: Number to classify.

    Returns:
        str: 'background-color: <hex>' for a matching band, else ''.
    """
    # Bands are ordered from strongest to weakest so the first match wins.
    green_bands = ((0.2, '#32CD32'), (0.1, '#98FF98'), (0.05, '#D0F0C0'))
    red_bands = ((-0.2, '#FF6347'), (-0.1, '#FA8072'), (-0.05, '#ffcccb'))

    for limit, shade in green_bands:
        if value > limit:
            return f'background-color: {shade}'

    for limit, shade in red_bands:
        if value < limit:
            return f'background-color: {shade}'

    return ''

def get_table_view(df):
    """
    Build a styled per-category score comparison of the two models in ``df``.

    The model names are taken from the first row of ``df``. For every
    distinct category the per-row average score of each model is computed
    (via ``cat_view``), together with the relative difference
    ``(model_b - model_a) / model_a``. Rows are sorted by the first model's
    average, descending, and a final '总计' (total) row is appended.

    Args:
        df: DataFrame with 'model_a', 'model_b', 'category' and 'scores'
            columns.

    Returns:
        pandas Styler: the table with two-decimal score formatting, a
        percentage 'score diff' column, and ``get_color`` backgrounds on it.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    # Positional access: take the first row whatever the index labels are
    # (a filtered or re-indexed frame may not contain the label 0).
    model_a = df['model_a'].iloc[0]
    model_b = df['model_b'].iloc[0]

    cat_data = [cat_view(df, category, model_a, model_b) for category in df['category'].unique().tolist()]
    cat_df = pd.DataFrame(cat_data)
    # Keep the raw totals for the summary row before the columns become averages.
    score_a = cat_df['model_a'].sum()
    score_b = cat_df['model_b'].sum()

    # Column arithmetic instead of row-wise apply: same values, and a zero
    # baseline average produces inf/NaN in 'score diff' rather than an error.
    cat_df['model_a'] = cat_df['model_a'] / cat_df['count']
    cat_df['model_b'] = cat_df['model_b'] / cat_df['count']
    # The displayed count is half the number of rows.
    # NOTE(review): presumably every question appears twice in the battle
    # file (once per model order) -- confirm against the data.
    cat_df['count'] = cat_df['count'] // 2
    cat_df['score diff'] = (cat_df['model_b'] - cat_df['model_a']) / cat_df['model_a']
    cat_df.sort_values(by='model_a', ascending=False, inplace=True, ignore_index=True)

    # The index is 0..n-1 after the sort, so label n appends the summary row.
    cat_df.loc[cat_df.shape[0]] = ['总计', int(len(df) / 2), score_a / len(df), score_b / len(df), (score_b - score_a) / score_a]

    cat_df.rename(columns={'model_a': model_a, 'model_b': model_b}, inplace=True)
    styler = cat_df.style.format({
        model_a: "{:.2f}",
        model_b: "{:.2f}",
        'score diff': "{:.2%}",
    })
    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name
    # is deprecated; use the new one when present, else fall back.
    colorize = getattr(styler, 'map', None) or styler.applymap
    return colorize(get_color, subset=['score diff'])

### 分类别比较

In [41]:
import os

def get_category_map(category_file):
    """
    Load the category mapping from a YAML file when one is available.

    Args:
        category_file: Path of the mapping file; may be empty or None.

    Returns:
        The content returned by ``read_yaml`` for an existing file, or an
        empty dict when no path is given or the path does not exist.
    """
    if category_file and os.path.exists(category_file):
        return read_yaml(category_file)

    # No usable mapping file: fall back to an empty mapping.
    return {}


# Module-level mapping loaded once here; get_category_group reads it on every call.
category_map = get_category_map(category_map_file)

def get_category_group(category):
    """
    Return the group that a raw category belongs to.

    Looks through the module-level ``category_map`` and returns the first key
    whose value contains ``category``.

    Args:
        category: Raw category value to look up.

    Returns:
        The matching key of ``category_map``, or '其他' ("other") when no
        entry contains the category.
    """
    matching_groups = (
        group for group, members in category_map.items() if category in members
    )
    return next(matching_groups, '其他')

In [42]:
# Load the battle reviews and keep only the columns used for the comparison.
data = read_jsonl(review_file)
df = pd.DataFrame(data)[
    ['model_a', 'model_b', 'scores', 'category', 'question', 'output_a', 'output_b']
]

# Replace each raw category with its group from the category mapping.
df['category'] = df['category'].map(get_category_group)

# Trailing bare expression so the notebook renders the styled table.
table_df = get_table_view(df)
table_df


Unnamed: 0,category,count,50b.v10.15.11,50b.v10.15.13,score diff
0,安全风险,10,8.35,8.75,4.79%
1,文本生成,94,8.06,8.04,-0.33%
2,其他,19,7.72,7.58,-1.87%
3,知识问答,66,7.55,7.81,3.46%
4,文本理解,57,7.48,7.53,0.70%
5,翻译,11,7.05,6.77,-3.87%
6,CODING,16,5.44,7.41,36.21%
7,逻辑推理,24,5.4,6.42,18.92%
8,数学解题,20,5.25,5.88,11.90%
9,总计,317,7.29,7.56,3.66%


In [43]:
import plotly.graph_objects as go

# Flatten every battle row into two records, one per model: pair index 0
# belongs to the row's 'model_a', index 1 to its 'model_b'.
score_list = []
for index, row in df.iterrows():
    for side, model_column in enumerate(('model_a', 'model_b')):
        score_list.append(
            dict(model=row[model_column], category=row['category'], score=row['scores'][side])
        )

score_df = pd.DataFrame(score_list)
# Mean score of each (model, category) pair.
df_agg = score_df.groupby(["model", "category"])["score"].mean().reset_index()

# One row per model, one column per category; combinations missing from the
# data are plotted as 0.
pivot_df = df_agg.pivot(index="model", columns="category", values="score")
pivot_df = pivot_df.fillna(0)

categories = pivot_df.columns.tolist()

fig = go.Figure()

# One radar trace per model, using its row of per-category means.
for model in pivot_df.index.tolist():
    fig.add_trace(
        go.Scatterpolar(
            name=model,
            theta=categories,
            r=pivot_df.loc[model].tolist(),
            fill="toself",
        )
    )

fig.update_layout(
    showlegend=True,
    polar=dict(radialaxis=dict(visible=True, range=[0, 10])),
)
fig.show()

### 原始类别比较

In [44]:
# Reload the battle reviews and keep the raw (unmapped) categories.
data = read_jsonl(review_file)
df_1 = pd.DataFrame(data)[
    ['model_a', 'model_b', 'scores', 'category', 'question', 'output_a', 'output_b']
]

# Trailing bare expression so the notebook renders the styled table.
table_df_1 = get_table_view(df_1)
table_df_1


Unnamed: 0,category,count,50b.v10.15.11,50b.v10.15.13,score diff
0,续写生成,9,8.61,6.33,-26.45%
1,图像,1,8.5,7.5,-11.76%
2,考试教育,2,8.5,9.0,5.88%
3,有伤害性,10,8.35,8.75,4.79%
4,通用写作,19,8.29,8.3,0.16%
5,文本分类,7,8.29,7.79,-6.03%
6,角色扮演,4,8.25,7.0,-15.15%
7,情感分析,11,8.23,8.09,-1.66%
8,AI个人信息,11,8.16,7.77,-4.74%
9,润色/纠错,6,8.08,7.58,-6.19%
