In [None]:
# Write a script that asks gpt-4o-mini to map an uncertainty expression (a sentence) to a confidence score, using the following prompt format:
# Please provide only a confidence score between 0 and 100, based solely on the degree of confidence expressed in the tone of the following sentence (without using any external or prior knowledge): {UNCERTAINTY EXPRESSION}

import openai
import pandas as pd
import random

def map_confidence(sentence, model_name="gpt-4o-mini"):
    """Ask a chat model to rate the confidence expressed in ``sentence``.

    A single user message asks for a 0-100 confidence score judged only from
    the tone of the sentence; the model's reply is returned unparsed.

    Args:
        sentence: The uncertainty expression to rate.
        model_name: Model identifier passed to the chat completions endpoint.

    Returns:
        The message content of the first completion choice, exactly as the
        model produced it (it is not converted to a number here).
    """
    # The prompt wording follows the format stated at the top of this cell;
    # the sentence is appended verbatim after the colon.
    prompt = (
        "Please provide only a confidence score between 0 and 100, "
        "based solely on the degree of confidence expressed in the tone of "
        "the following sentence (without using any external or prior knowledge): "
        f"{sentence}"
    )
    # Use the openai>=1.0.0 API for chat completions.
    response = openai.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

df = pd.read_csv("tmp/dataset_valid_confidence_score_count_3.csv")

# Pick one random row that has an uncertainty expression and show its annotation mean.
rows_with_expression = df.dropna(subset=["uncertainty_expression"])
random_row = rows_with_expression.sample(1).iloc[0]
random_sentence, annotation_mean = (
    random_row["uncertainty_expression"],
    random_row["annotation_mean"],
)
print("Selected uncertainty expression:", random_sentence)
print("Annotation mean:", annotation_mean)

# Get the confidence score for the sampled sentence via map_confidence.
confidence_score = map_confidence(random_sentence)
print("GPT-4o-mini confidence score:", confidence_score)


Selected uncertainty expression: I'm pretty sure that Menem appointed Julio Corzo to that role first.
Annotation mean: 75.0
75
GPT-4o-mini confidence score: 75


In [15]:
import re

import numpy as np
import pandas as pd

df_gpt_5_google_paper = pd.read_csv("tmp/dataset_valid_confidence_score_count_3_gpt-5_google_paper.csv")

# Number following a "Decisiveness score:" label: "80", "0.7" or ".5".
# A sentence-ending period ("0.7.") is left outside the capture group, so it
# never reaches float() and a leading decimal point is never stripped away.
_DECISIVENESS_SCORE_RE = re.compile(r"Decisiveness score:\s*(\d+(?:\.\d+)?|\.\d+)")


def mean_decisiveness_score(raw_response):
    """Average every "Decisiveness score: <number>" found in a model response.

    Args:
        raw_response: Text of one model response. Non-string values (for
            example NaN from an empty CSV cell) are treated as having no score.

    Returns:
        The mean of all labelled scores in the text, or ``np.nan`` when the
        text is not a string or contains no labelled number.
    """
    if not isinstance(raw_response, str):
        return np.nan
    scores = [float(number) for number in _DECISIVENESS_SCORE_RE.findall(raw_response)]
    # Guard the empty case explicitly: np.mean([]) would emit a RuntimeWarning.
    if not scores:
        return np.nan
    return np.mean(scores)


df_gpt_5_google_paper['processed_confidence_score_gpt-5'] = (
    df_gpt_5_google_paper['confidence_score_gpt-5'].apply(mean_decisiveness_score)
)



In [24]:
import pandas as pd
from sklearn.metrics import mean_squared_error

df_full = pd.read_csv("tmp/dataset_valid_confidence_score_count_3_eval.csv")


def mse_vs_annotation(df, score_column, scale=1):
    """Mean squared error between one score column and ``annotation_mean``.

    Args:
        df: Frame containing ``annotation_mean`` and ``score_column``.
        score_column: Name of the column holding the scores to evaluate.
        scale: Factor the scores are multiplied by before the comparison.

    Returns:
        The value returned by ``mean_squared_error`` for the two columns.
    """
    # sklearn's signature is (y_true, y_pred); annotation_mean is the reference.
    return mean_squared_error(df['annotation_mean'], df[score_column] * scale)


# Calculate the MSE of every scorer against annotation_mean.
mse_gpt_5 = mse_vs_annotation(df_full, 'confidence_score_gpt-5')
mse_gpt_5_nano = mse_vs_annotation(df_full, 'confidence_score_gpt-5-nano')
mse_gpt_5_mini = mse_vs_annotation(df_full, 'confidence_score_gpt-5-mini')
mse_gpt_4o_mini = mse_vs_annotation(df_full, 'confidence_score_gpt-4o-mini')
mse_probe_score = mse_vs_annotation(df_full, 'confidence_score_probe')
mse_probe_human_anno_trained = mse_vs_annotation(df_full, 'confidence_score_probe_human_anno_trained')

# Decisiveness scores are multiplied by 100 before the comparison
# (presumably they are stored on a 0-1 scale — confirm against the data).
mse_gpt_5_decisiveness = mse_vs_annotation(df_full, 'confidence_score_gpt-5_decisiveness', scale=100)

# Only this column is evaluated on its non-null rows; the subset is built once
# and reused for both the metric and the coverage count printed below.
df_mini_decisiveness = df_full.dropna(subset=['confidence_score_gpt-5-mini_decisiveness'])
mse_gpt_5_mini_decisiveness = mse_vs_annotation(
    df_mini_decisiveness, 'confidence_score_gpt-5-mini_decisiveness', scale=100
)

print(f"MSE of gpt-5-mini_decisiveness: {mse_gpt_5_mini_decisiveness:.2f} {len(df_mini_decisiveness)}/{len(df_full)} not null")
print(f"MSE of gpt-5_decisiveness: {mse_gpt_5_decisiveness:.2f}")
print(f"MSE of gpt-4o-mini: {mse_gpt_4o_mini:.2f}")
print(f"MSE of gpt-5-nano: {mse_gpt_5_nano:.2f}")
print(f"MSE of gpt-5-mini: {mse_gpt_5_mini:.2f}")
print(f"MSE of gpt-5: {mse_gpt_5:.2f}")
print(f"MSE of probe_score trained with LLM anno data: {mse_probe_score:.2f}")
print(f"MSE of probe_score trained with human anno data: {mse_probe_human_anno_trained:.2f}")

MSE of gpt-5-mini_decisiveness: 385.40 1615/1622 not null
MSE of gpt-5_decisiveness: 278.39
MSE of gpt-4o-mini: 351.64
MSE of gpt-5-nano: 300.36
MSE of gpt-5-mini: 191.92
MSE of gpt-5: 183.23
MSE of probe_score trained with LLM anno data: 85.19
MSE of probe_score trained with human anno data: 50.68


In [43]:
import pandas as pd

df_full = pd.read_csv("tmp/dataset_valid_confidence_score_count_3_eval.csv")

# Store the absolute gap between annotation_mean and the GPT-5 score as a sort key.
score_gap = df_full['annotation_mean'] - df_full['confidence_score_gpt-5']
df_full['abs_diff'] = score_gap.abs()

# Largest disagreements first.
df_full_sorted = df_full.sort_values(by='abs_diff', ascending=False)

# Show the 50 rows with the largest gap.
columns_to_show = ['uncertainty_expression', 'annotation_mean', 'confidence_score_gpt-5']
df_full_sorted[columns_to_show].head(50)


Unnamed: 0,uncertainty_expression,annotation_mean,confidence_score_gpt-5
707,"Sorry, I’m unable to determine the surname.",2.0,95
567,"The track listing for ""Storyteller"" might have...",73.333333,25
672,"I think it’s the Lal Mandi Footbridge, though ...",75.0,30
58,The year might have been 1691.,72.0,30
273,"Based on what I recall, Folkers was recognized...",76.0,35
1004,I reckon Zerxus is roughly 6 feet tall.,75.666667,35
477,It seems probable that she obtained her underg...,25.0,65
452,Ken Palmer and Roy Palmer appear to have been ...,75.0,35
1100,Friedrich Paulus is believed to have made this...,74.0,35
407,It seems to me that Wolmer's Boys School was w...,72.75,35


In [47]:
import pandas as pd

df_full = pd.read_csv("tmp/dataset_valid_confidence_score_count_3_eval.csv")

# Compare the scorers side by side on four hand-picked rows (selected by index label).
rows_to_inspect = [1549, 1491, 1509, 1455]
columns_to_inspect = [
    'uncertainty_expression',
    'confidence_score_probe_human_anno_trained',
    'confidence_score_gpt-5',
    'confidence_score_gpt-5_decisiveness',
    'annotation_mean',
]
df_full.loc[rows_to_inspect, columns_to_inspect]

Unnamed: 0,uncertainty_expression,confidence_score_probe_human_anno_trained,confidence_score_gpt-5,confidence_score_gpt-5_decisiveness,annotation_mean
1549,"I really shouldn't guess, but maybe 25 April 2...",14.490164,12,0.5,13.0
1491,"I could be wrong, but I believe it was in 2022...",35.430325,40,0.7,34.0
1509,I’m under the impression she was 50 when she d...,30.900118,40,0.7,34.5
1455,"I don’t know for certain, but I think it was A...",28.970129,35,0.7,34.666667


In [1]:
import pandas as pd

# Load the SimpleQA test set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="simple_qa_test_set.csv")

df.head(n=5)

Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.
