In [1]:
%cd ~/src/second-best-bench/
import pandas as pd
import numpy as np
from src.partition import bleu, rouge1, bertscore, classifier_score, gpt4o_mini_score, gpt35_turbo_score
from src.plots.common import palatino
import numpy as np

/home/yimingz3/src/second-best-bench


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Validation set in JSON Lines format (one record per line).
val_path = "data/classifier/val.jsonl"
val_df = pd.read_json(val_path, lines=True)

In [3]:
def truncate(s, n_lines=5):
    """Return ``s`` with its last ``n_lines`` newline-separated lines removed.

    Args:
        s: Text to shorten. Lines are delimited by ``'\\n'`` only.
        n_lines: Number of trailing lines to drop (default 5). Must be
            non-negative.

    Returns:
        The remaining leading lines joined by ``'\\n'``. The result is the
        empty string when ``s`` has ``n_lines`` lines or fewer.

    Raises:
        ValueError: If ``n_lines`` is negative.
    """
    if n_lines < 0:
        raise ValueError("n_lines must be non-negative")
    lines = s.split('\n')
    # Compute the end index explicitly: a plain ``lines[:-n_lines]`` slice
    # would return an empty list for n_lines == 0 instead of every line.
    keep = max(len(lines) - n_lines, 0)
    return '\n'.join(lines[:keep])

# Shorten both generations of every pair in place.
# NOTE: re-running this cell truncates the already-truncated text again.
for column in ("generation_0", "generation_1"):
    val_df[column] = val_df[column].map(truncate)

In [4]:
bleu_scores = []
rouge_scores = []
bertscore_scores = []
classifier_scores = []
gpt4o_mini_scores = []
gpt35_turbo_scores = []

for _, row in val_df.iterrows():
    prompt, g0, g1 = row["prompt"], row["generation_0"], row["generation_1"]
    bleu_scores.append(await bleu(prompt, g0, g1))
    rouge_scores.append(await rouge1(prompt, g0, g1))
    bertscore_scores.append(await bertscore(prompt, g0, g1))
    classifier_scores.append(await classifier_score(prompt, g0, g1))
    gpt4o_mini_scores.append(await gpt4o_mini_score(prompt, g0, g1))
    gpt35_turbo_scores.append(await gpt35_turbo_score(prompt, g0, g1))




In [5]:
# Store the two GPT score lists as columns of the validation frame. The BLEU,
# ROUGE, BERTScore and classifier scores stay as plain lists and are not
# written to the frame.
gpt_columns = {
    "gpt-3.5-turbo": gpt35_turbo_scores,
    "gpt-4o-mini": gpt4o_mini_scores,
}
for column_name, column_scores in gpt_columns.items():
    val_df[column_name] = column_scores

In [8]:

import altair as alt
from sklearn.metrics import roc_curve, auc
import pandas as pd

# Ground-truth labels shared by every ROC computation in this cell.
similar_labels = val_df["similar"]

# Per metric: ROC curve (thresholds discarded) followed by the area under it.
fpr_bleu, tpr_bleu, _ = roc_curve(similar_labels, bleu_scores)
auc_bleu = auc(fpr_bleu, tpr_bleu)

fpr_rouge, tpr_rouge, _ = roc_curve(similar_labels, rouge_scores)
auc_rouge = auc(fpr_rouge, tpr_rouge)

fpr_bert, tpr_bert, _ = roc_curve(similar_labels, bertscore_scores)
auc_bert = auc(fpr_bert, tpr_bert)

fpr_classifier, tpr_classifier, _ = roc_curve(similar_labels, classifier_scores)
auc_classifier = auc(fpr_classifier, tpr_classifier)

fpr_gpt4o, tpr_gpt4o, _ = roc_curve(similar_labels, gpt4o_mini_scores)
auc_gpt4o = auc(fpr_gpt4o, tpr_gpt4o)

fpr_gpt35, tpr_gpt35, _ = roc_curve(similar_labels, gpt35_turbo_scores)
auc_gpt35 = auc(fpr_gpt35, tpr_gpt35)

# (legend name, fpr, tpr, auc) for every line in the plot, in plotting order.
# The final entry is the diagonal chance line, whose AUC is 0.5.
curves = [
    ('BLEU', fpr_bleu, tpr_bleu, auc_bleu),
    ('ROUGE-1', fpr_rouge, tpr_rouge, auc_rouge),
    ('BERTScore', fpr_bert, tpr_bert, auc_bert),
    ('DeBERTa', fpr_classifier, tpr_classifier, auc_classifier),
    ('GPT-4o-mini', fpr_gpt4o, tpr_gpt4o, auc_gpt4o),
    ('GPT-3.5-turbo', fpr_gpt35, tpr_gpt35, auc_gpt35),
    ('Random', [0, 1], [0, 1], 0.5),
]

# Long-format table for Altair: one row per ROC point, tagged with the metric
# name and that metric's AUC.
roc_data = pd.DataFrame({
    'fpr': np.concatenate([fpr for name, fpr, tpr, area in curves]),
    'tpr': np.concatenate([tpr for name, fpr, tpr, area in curves]),
    'metric': [name for name, fpr, tpr, area in curves for _point in fpr],
    'auc': [area for name, fpr, tpr, area in curves for _point in fpr],
})

# Legend entry: the metric name followed by its AUC rounded to two decimals.
legend_label = alt.datum.metric + ' (AUC=' + alt.expr.round(alt.datum.auc * 100) / 100 + ')'

# One line per metric, coloured by the combined "name (AUC=...)" label.
chart = (
    alt.Chart(roc_data)
    .transform_calculate(metric_auc=legend_label)
    .mark_line(point=False)
    .encode(
        x=alt.X('fpr:Q', title='False Positive Rate', axis=alt.Axis(format='%')),
        y=alt.Y('tpr:Q', title='True Positive Rate', axis=alt.Axis(format='%')),
        color=alt.Color('metric_auc:N', title='Metric'),
        tooltip=['metric', 'auc'],
    )
    .properties(
        title='ROC Curves for Similarity Metrics',
        width=400,
        height=400,
    )
)

# Plain-text summary of the AUC values.
print("\nAUC Scores:")
print(
    f"BLEU: {auc_bleu:.3f}",
    f"ROUGE-1: {auc_rouge:.3f}",
    f"BERTScore: {auc_bert:.3f}",
    f"Classifier: {auc_classifier:.3f}",
    f"GPT-4o-mini: {auc_gpt4o:.3f}",
    f"GPT-3.5-turbo: {auc_gpt35:.3f}",
    sep="\n",
)

# Persist the chart spec, then show the chart as the cell output.
chart.save("plots/roc.json")
chart


AUC Scores:
BLEU: 0.660
ROUGE-1: 0.677
BERTScore: 0.677
Classifier: 0.812
GPT-4o-mini: 0.764
GPT-3.5-turbo: 0.740


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def find_best_threshold(y_true, scores, thresholds=None):
    """Pick the decision threshold that maximizes F1 for a score-based classifier.

    An item is predicted positive when its score is ``>= threshold``. The
    candidates are tried in order and the first one that reaches the highest
    F1 is kept (a later tie does not replace an earlier winner).

    Args:
        y_true: Binary ground-truth labels, one per score.
        scores: Sequence of numeric scores aligned with ``y_true``.
        thresholds: Optional iterable of candidate thresholds. Defaults to
            1000 evenly spaced values in [0, 1], which only suits scores that
            live on that scale; pass explicit candidates otherwise.

    Returns:
        Tuple ``(threshold, precision, recall, f1, accuracy)`` measured at the
        selected threshold. All five values are 0 when no candidate achieves
        an F1 above 0.
    """
    if thresholds is None:
        thresholds = np.linspace(0, 1, 1000)  # Test 1000 threshold values
    # Convert once instead of rebuilding the array for every candidate.
    scores = np.asarray(scores)

    best_f1 = 0
    best_threshold = 0
    best_precision = 0
    best_recall = 0
    best_accuracy = 0

    for threshold in thresholds:
        y_pred = (scores >= threshold).astype(int)
        # zero_division=0 returns the same value (0) as the default but
        # without the UndefinedMetricWarning scikit-learn can emit when F1 is
        # ill-defined, e.g. for a threshold that predicts no positives.
        f1 = f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            # The remaining metrics are only needed for a new best candidate.
            best_precision = precision_score(y_true, y_pred)
            best_recall = recall_score(y_true, y_pred)
            best_accuracy = accuracy_score(y_true, y_pred)

    return best_threshold, best_precision, best_recall, best_f1, best_accuracy

# Find best thresholds and scores
# F1-optimal threshold for each metric, plus precision / recall / F1 /
# accuracy measured at that threshold.
(bleu_threshold, bleu_precision, bleu_recall, bleu_f1, bleu_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=bleu_scores
)
(rouge_threshold, rouge_precision, rouge_recall, rouge_f1, rouge_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=rouge_scores
)
(bert_threshold, bert_precision, bert_recall, bert_f1, bert_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=bertscore_scores
)
(classifier_threshold, classifier_precision, classifier_recall, classifier_f1, classifier_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=classifier_scores
)
(gpt4o_threshold, gpt4o_precision, gpt4o_recall, gpt4o_f1, gpt4o_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=gpt4o_mini_scores
)
(gpt35_threshold, gpt35_precision, gpt35_recall, gpt35_f1, gpt35_accuracy) = find_best_threshold(
    y_true=val_df["similar"], scores=gpt35_turbo_scores
)

# Text report: one print call per metric, one line per argument.
print("\nOptimal Thresholds and Metrics:")
print(
    f"BLEU: threshold = {bleu_threshold:.3f}",
    f"  Precision: {bleu_precision:.3f}",
    f"  Recall: {bleu_recall:.3f}",
    f"  F1: {bleu_f1:.3f}",
    f"  Accuracy: {bleu_accuracy:.3f}",
    sep="\n",
)

print(
    f"\nROUGE-1: threshold = {rouge_threshold:.3f}",
    f"  Precision: {rouge_precision:.3f}",
    f"  Recall: {rouge_recall:.3f}",
    f"  F1: {rouge_f1:.3f}",
    f"  Accuracy: {rouge_accuracy:.3f}",
    sep="\n",
)

print(
    f"\nBERTScore: threshold = {bert_threshold:.3f}",
    f"  Precision: {bert_precision:.3f}",
    f"  Recall: {bert_recall:.3f}",
    f"  F1: {bert_f1:.3f}",
    f"  Accuracy: {bert_accuracy:.3f}",
    sep="\n",
)

print(
    f"\nClassifier: threshold = {classifier_threshold:.3f}",
    f"  Precision: {classifier_precision:.3f}",
    f"  Recall: {classifier_recall:.3f}",
    f"  F1: {classifier_f1:.3f}",
    f"  Accuracy: {classifier_accuracy:.3f}",
    sep="\n",
)

print(
    f"\nGPT-4o-mini: threshold = {gpt4o_threshold:.3f}",
    f"  Precision: {gpt4o_precision:.3f}",
    f"  Recall: {gpt4o_recall:.3f}",
    f"  F1: {gpt4o_f1:.3f}",
    f"  Accuracy: {gpt4o_accuracy:.3f}",
    sep="\n",
)

print(
    f"\nGPT-3.5-turbo: threshold = {gpt35_threshold:.3f}",
    f"  Precision: {gpt35_precision:.3f}",
    f"  Recall: {gpt35_recall:.3f}",
    f"  F1: {gpt35_f1:.3f}",
    f"  Accuracy: {gpt35_accuracy:.3f}",
    sep="\n",
)


GPT-4o-mini: threshold = 0.500
  Precision: 0.826
  Recall: 0.380
  F1: 0.521
  Accuracy: 0.650

GPT-3.5-turbo: threshold = 0.500
  Precision: 0.826
  Recall: 0.380
  F1: 0.521
  Accuracy: 0.650
