In [1]:
import pandas as pd
from utils_analysis import prepare_scores
import matplotlib.pyplot as plt
import numpy as np
from utils_tail_probs import softmax, tail_index
from scipy.stats import spearmanr
from scipy.spatial.distance import jensenshannon
import json
from src.eval import calculate_bleu, calculate_rouge_single, calculate_f1_score

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
# Test split of HotpotQA; the "target" column is passed to prepare_scores below.
df = pd.read_csv("data/hotpotqa/test.csv")

# Result files from other runs that can be swapped into the prepare_scores call:
# "results/bart/r1/bart-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
# "results/bart/r2/bart-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
# "results/bart/r3/bart-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
# "results/t5/r1/flan-t5-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
# "results/t5/r2/flan-t5-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
# "results/t5/r3/flan-t5-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",

# prepare_scores is a project helper (utils_analysis). Later cells read
# `scores_dataframe`, `scores_by_beam` and `sentences` from its return value.
# NOTE(review): presumably the "f1" argument selects the quality metric that
# ends up in the "f1" column of scores_dataframe — confirm in utils_analysis.
# NOTE(review): both JSON paths below are absolute and user-specific; consider
# a configurable results directory so the notebook runs on other machines.
results = prepare_scores(
    "/home/mila/f/floresl/beam-search/results/flan-t5-base_data_hotpotqa_checkpoint-26835_hotpotqa.json",
    df["target"],
    "f1",
)

# Read the baseline file inside a context manager so the handle is closed as
# soon as parsing finishes (the bare open() previously leaked it).
with open(
    "/home/mila/f/floresl/beam-search/results/t5_baseline/google_flan-t5-base_hotpotqa.json",
    "r",
) as baseline_file:
    baseline_probs = json.load(baseline_file)

  abs(spearmanr(ground_truth_score, confidence_score_dict[str(k)]).statistic)  # type: ignore


In [15]:
# Spearman rank correlation of every remaining score column with "f1".
# The two sentence columns are removed before correlating.
# NOTE(review): presumably they hold text rather than numbers — confirm.
confidence_scores = results.scores_dataframe.drop(
    columns=["sentences", "dropout_sentences"]
)
confidence_scores.corr(method="spearman")["f1"]

length_normalized_log_probs    0.301710
mean_token_entropy            -0.152407
dropout_bleu_variance         -0.000331
dropout_meteor_score           0.269517
dropout_entropy               -0.367446
dropout_disagreement          -0.232327
f1                             1.000000
beam_score_ratios_90           0.429086
beam_score_log_probs_90       -0.233861
beam_score_top_k_90           -0.230181
beam_score_impt_wt_90          0.229047
Name: f1, dtype: float64

### Tail Index

In [16]:
# Per-beam log-probability scores from the results object, wrapped in a
# DataFrame so rows can be addressed with .iloc in the next cell.
# NOTE(review): presumably one row per test example and one column per beam —
# confirm against prepare_scores.
beam_score_log_probs = pd.DataFrame(results.scores_by_beam["beam_score_log_probs"])

In [17]:
# For every row of beam log-probs: apply the project softmax at temperature
# 0.05, then compute the project tail_index of the resulting values.
# NOTE(review): softmax/tail_index come from utils_tail_probs; their exact
# definitions are not visible here.
tail_indices = [
    tail_index(
        softmax(beam_score_log_probs.iloc[row_pos].to_numpy(), temperature=0.05)
    )
    for row_pos in range(len(beam_score_log_probs))
]

# Rank correlation between the per-row tail index and the "f1" column.
spearmanr(tail_indices, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.45084327032516935, pvalue=3.188911085633083e-51)

### Baseline Metrics

In [18]:
# Baseline per-beam log-probs as a 2-D NumPy array (DataFrame.values).
baseline_log_probs = pd.DataFrame(baseline_probs["beam_score_log_probs"]).values

# Rebind the model's beam log-probs to a plain array for row indexing below.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is harmless; the previous `.values` call raised AttributeError on a second
# run because the name already pointed at an ndarray.
# NOTE(review): after this rebinding, the earlier `.iloc`-based cell needs the
# DataFrame to be recreated before it can be re-run.
beam_score_log_probs = np.asarray(beam_score_log_probs)

In [19]:
# Jensen-Shannon distance between the model's and the baseline's beam
# distributions, one value per row. Both inputs are expected to be 2-D arrays
# at this point, so iterating them yields rows.
# Rows are paired by iteration instead of a hard-coded range(1000), so the
# cell follows the size of the data; a length mismatch with the "f1" column
# still surfaces as an error in spearmanr below.
js_distances = [
    jensenshannon(
        softmax(model_row, temperature=0.05),
        softmax(baseline_row, temperature=0.05),
    )
    for model_row, baseline_row in zip(beam_score_log_probs, baseline_log_probs)
]

# Rank correlation between the distance to the baseline and the "f1" column.
spearmanr(js_distances, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.3658134678021087, pvalue=5.0446810980216386e-33)

In [20]:
# Jensen-Shannon distance between each row's beam distribution and a uniform
# distribution of the same length.
# The row count and the size of the uniform reference are taken from the data
# rather than the previous hard-coded 1000 rows / 100 beams.
js_distances = []
for row_log_probs in beam_score_log_probs:
    beam_probs = softmax(row_log_probs, temperature=0.05)
    n_beams = len(beam_probs)
    uniform_probs = np.full(n_beams, 1 / n_beams)
    js_distances.append(jensenshannon(beam_probs, uniform_probs))

# Rank correlation between the distance from uniform and the "f1" column.
spearmanr(js_distances, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.4450505436905848, pvalue=8.245428996916145e-50)

### Oracle Methods

In [21]:
# Alias for the sentences held by the results object.
# NOTE(review): no cell in view reads this name (later code uses
# results.sentences directly), so it may be removable — confirm.
beam_search_sentences = results.sentences

In [22]:
def get_list_of_scores(target: str, predictions: list[str]):
    """Score every prediction against a single reference.

    Args:
        target: Reference text passed as the second argument to
            ``calculate_f1_score``.
        predictions: Candidate texts, each passed as the first argument.

    Returns:
        A list with one ``calculate_f1_score`` result per prediction, in the
        same order as ``predictions`` (empty when ``predictions`` is empty).
    """
    return [calculate_f1_score(candidate, target) for candidate in predictions]

In [23]:
# Oracle signals, one per row of beam_score_log_probs:
#   weighted_avgs  - average beam log-prob, weighted by each beam's quality score
#   corr_w_scores  - Spearman correlation between beam log-probs and quality scores
weighted_avgs: list[float] = []
corr_w_scores: list[float] = []

# Iterate over the rows actually present instead of a hard-coded 1000.
for i in range(len(beam_score_log_probs)):
    log_probs = beam_score_log_probs[i]
    # .iloc keeps the lookup positional, matching how the rows are indexed.
    quality_scores = get_list_of_scores(df["target"].iloc[i], results.sentences[i])

    # np.average raises ZeroDivisionError when the weights sum to zero, so rows
    # where no beam scores above zero fall back to 0.0.
    # NOTE(review): 0.0 is larger than any negative log-prob, so these rows rank
    # highest in the later correlation — confirm this sentinel is intended.
    if sum(quality_scores) > 0:
        weighted_avg = float(np.average(log_probs, weights=quality_scores))
    else:
        weighted_avg = 0.0

    # Spearman is undefined (NaN plus a ConstantInputWarning) when either input
    # is constant; short-circuit to the same 0.0 fallback without the warning.
    if len(set(quality_scores)) <= 1 or len(set(np.asarray(log_probs).tolist())) <= 1:
        corr_w_score = 0.0
    else:
        corr_w_score = float(spearmanr(log_probs, quality_scores).statistic)
        # Any remaining NaN (e.g. NaN inputs) keeps the original 0.0 fallback.
        if np.isnan(corr_w_score):
            corr_w_score = 0.0

    weighted_avgs.append(weighted_avg)
    corr_w_scores.append(corr_w_score)

  corr_w_score = spearmanr(beam_score_log_probs[i], quality_scores).statistic


In [24]:
# Spearman rank correlation between the quality-weighted average beam
# log-prob (weighted_avgs) and the "f1" column.
spearmanr(weighted_avgs, results.scores_dataframe["f1"])

SignificanceResult(statistic=-0.46469880589198126, pvalue=1.0290937237817967e-54)

In [25]:
# Spearman rank correlation between the per-row log-prob/quality correlation
# (corr_w_scores) and the "f1" column.
spearmanr(corr_w_scores, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.24082241522347897, pvalue=1.1647447611793204e-14)