In [1]:
import pandas as pd
from utils_analysis import prepare_scores
import matplotlib.pyplot as plt
import numpy as np
from utils_tail_probs import softmax, tail_index
from scipy.stats import spearmanr
from scipy.spatial.distance import jensenshannon
import json
from src.eval import calculate_f1_score

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# Directory holding the beam-search result files; change it here to point the
# notebook at another checkout instead of editing every path below.
RESULTS_DIR = "/home/mila/f/floresl/beam-search/results"

# Test split; its "target" column is handed to prepare_scores below.
df = pd.read_csv("data/squad/test.csv")

# Other result files kept for reference
# (presumably alternatives to the file scored below -- TODO confirm):
# "results/bart/r1/bart-base_data_squad_checkpoint-26280_squad.json",
# "results/bart/r2/bart-base_data_squad_checkpoint-26280_squad.json",
# "results/bart/r3/bart-base_data_squad_checkpoint-26280_squad.json",
# "results/t5/r1/flan-t5-base_data_squad_checkpoint-26280_squad.json",
# "results/t5/r2/flan-t5-base_data_squad_checkpoint-26280_squad.json",
# "results/t5/r3/flan-t5-base_data_squad_checkpoint-26280_squad.json",

# NOTE(review): prepare_scores is a project helper; it is called with the
# results file, the target column and the metric name "f1" -- confirm its
# exact contract in utils_analysis.
results = prepare_scores(
    f"{RESULTS_DIR}/flan-t5-base_data_squad_checkpoint-26280_squad.json",
    df["target"],
    "f1",
)

# Read the baseline scores inside a context manager so the file handle is
# closed as soon as the JSON has been parsed (the bare open() leaked it).
with open(
    f"{RESULTS_DIR}/t5_baseline/google_flan-t5-base_squad.json",
    "r",
) as baseline_file:
    baseline_probs = json.load(baseline_file)

  abs(spearmanr(ground_truth_score, confidence_score_dict[str(k)]).statistic)  # type: ignore


In [23]:
# Spearman rank correlation of every remaining score column with "f1".
# The "sentences" and "dropout_sentences" columns are removed before
# correlating.
(
    results.scores_dataframe
    .drop(columns=["sentences", "dropout_sentences"])
    .corr(method="spearman")
    .loc[:, "f1"]
)

length_normalized_log_probs    0.444738
mean_token_entropy             0.007599
dropout_bleu_variance         -0.244893
dropout_meteor_score           0.209157
dropout_entropy               -0.236142
dropout_disagreement          -0.378404
f1                             1.000000
beam_score_ratios_1            0.409458
beam_score_log_probs_1        -0.241033
beam_score_top_k_1             0.072919
beam_score_impt_wt_1          -0.149944
Name: f1, dtype: float64

### Tail Index

In [24]:
# Build a DataFrame from the "beam_score_log_probs" entry of
# results.scores_by_beam.
# NOTE(review): presumably one row per example and one column per beam -- confirm.
beam_score_log_probs = pd.DataFrame(results.scores_by_beam["beam_score_log_probs"])

In [25]:
# Tail index of each example's beam distribution, then its Spearman
# correlation with F1.
#
# np.asarray accepts both the DataFrame built in the previous cell and the
# ndarray that the baseline-metrics cell later rebinds to the same name, so
# this cell can be re-run after that cell without failing on `.iloc`.
beam_log_prob_rows = np.asarray(beam_score_log_probs)

tail_indices = [
    # Each row of log-probabilities is turned into a distribution by the
    # project softmax helper (temperature 0.005) before taking its tail index.
    tail_index(softmax(row, temperature=0.005))
    for row in beam_log_prob_rows
]

spearmanr(tail_indices, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.42716153394871137, pvalue=1.2879423468677025e-45)

### Baseline Metrics

In [26]:
# Baseline log-probabilities as a 2-D array; routing the nested JSON value
# through a DataFrame is kept from the original conversion.
baseline_log_probs = pd.DataFrame(baseline_probs["beam_score_log_probs"]).values

# Convert the beam log-probabilities to an ndarray for positional row access
# in the cells below. np.asarray is idempotent: it works on the DataFrame the
# first time and on the resulting ndarray if this cell is run again, whereas
# `.values` raised AttributeError on a second run.
beam_score_log_probs = np.asarray(beam_score_log_probs)

In [27]:
# Jensen-Shannon distance between the evaluated beam distribution and the
# baseline's beam distribution for each example, correlated with F1.
f1_scores = results.scores_dataframe["f1"]

# Iterate over exactly as many examples as there are F1 scores (instead of a
# hard-coded 1000) so both inputs to spearmanr always have the same length.
js_distances = [
    jensenshannon(
        softmax(beam_score_log_probs[i], temperature=0.05),
        softmax(baseline_log_probs[i], temperature=0.05),
    )
    for i in range(len(f1_scores))
]

spearmanr(js_distances, f1_scores)

SignificanceResult(statistic=0.26323176066995724, pvalue=2.5967501513187192e-17)

In [28]:
# Jensen-Shannon distance between each example's beam distribution and a
# uniform distribution over the same number of beams, correlated with F1.
f1_scores = results.scores_dataframe["f1"]

js_distances = []
# One iteration per F1 score (instead of a hard-coded 1000) keeps the two
# inputs to spearmanr the same length.
for i in range(len(f1_scores)):
    beam_probs = softmax(beam_score_log_probs[i], temperature=0.05)
    # Size the uniform reference from the data rather than assuming 100 beams.
    n_beams = len(beam_probs)
    uniform_probs = np.array([1 / n_beams] * n_beams)
    js_distances.append(jensenshannon(beam_probs, uniform_probs))

spearmanr(js_distances, f1_scores)

SignificanceResult(statistic=0.3494262268209138, pvalue=4.317629104247729e-30)

### Oracle Methods

In [29]:
# Alias for results.sentences.
# NOTE(review): the oracle loop in this notebook reads results.sentences
# directly, so this name is not referenced in the visible cells -- confirm it
# is still needed.
beam_search_sentences = results.sentences

In [30]:
def get_list_of_scores(target: str, predictions: list[str]):
    """Score every prediction against a single target.

    Args:
        target: Reference string passed as the second argument to
            ``calculate_f1_score``.
        predictions: Candidate strings, each passed as the first argument.

    Returns:
        A list with one ``calculate_f1_score`` result per prediction, in the
        same order as ``predictions``.
    """
    return [calculate_f1_score(candidate, target) for candidate in predictions]

In [31]:
# Oracle signals per example, built from the quality score of every beam
# candidate against the reference target:
#   * weighted_avgs  -- average beam log-probability weighted by candidate quality
#   * corr_w_scores  -- Spearman correlation between beam log-probabilities
#                       and candidate quality
weighted_avgs: list[float] = []
corr_w_scores: list[float] = []

# One entry per row of the scores dataframe (instead of a hard-coded 1000) so
# the lists line up with the "f1" column they are correlated with afterwards.
n_examples = len(results.scores_dataframe)

for i in range(n_examples):
    quality_scores = get_list_of_scores(df["target"][i], results.sentences[i])
    example_log_probs = beam_score_log_probs[i]

    # np.average raises ZeroDivisionError when the weights sum to zero, so
    # examples where every candidate scored 0 fall back to 0.0.
    if sum(quality_scores) > 0:
        weighted_avg = float(np.average(example_log_probs, weights=quality_scores))
    else:
        weighted_avg = 0.0

    if len(set(quality_scores)) < 2:
        # Rank correlation is undefined for a constant input; spearmanr would
        # warn and return NaN, which was mapped to 0.0 anyway.
        corr_w_score = 0.0
    else:
        corr_w_score = float(spearmanr(example_log_probs, quality_scores).statistic)
        # Any remaining NaN (e.g. constant log-probabilities) is also mapped
        # to 0.0; np.isnan replaces the string comparison against "nan".
        if np.isnan(corr_w_score):
            corr_w_score = 0.0

    weighted_avgs.append(weighted_avg)
    corr_w_scores.append(corr_w_score)

  corr_w_score = spearmanr(beam_score_log_probs[i], quality_scores).statistic


In [32]:
# Spearman rank correlation between the per-example quality-weighted average
# beam log-probability and the "f1" column.
spearmanr(weighted_avgs, results.scores_dataframe["f1"])

SignificanceResult(statistic=0.16066369159199695, pvalue=3.2646346373080996e-07)

In [33]:
# Spearman rank correlation between the per-example (log-probability vs.
# candidate quality) correlation and the "f1" column.
spearmanr(corr_w_scores, results.scores_dataframe["f1"])

SignificanceResult(statistic=-0.0023158321724892706, pvalue=0.941693480040375)