In [1]:
import pandas as pd
from utils_analysis import prepare_scores
import matplotlib.pyplot as plt
import numpy as np
from utils_tail_probs import softmax, tail_index
from scipy.stats import spearmanr
from scipy.spatial.distance import jensenshannon
import json
from src.eval import calculate_bleu
from src.eval import calculate_rouge_single

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mila/f/floresl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Test split; its "target" column is handed to prepare_scores below.
df = pd.read_csv("data/debatesum/test.csv")

# Other result files that have been analysed with this notebook
# (swap one into the prepare_scores call below to reproduce those runs):
# "results/bart/r1/bart-base_data_debatesum_checkpoint-1500_debatesum.json",
# "results/bart/r2/bart-base_data_debatesum_checkpoint-1500_debatesum.json",
# "results/bart/r3/bart-base_data_debatesum_checkpoint-1500_debatesum.json",
# "results/t5/r1/flan-t5-base_data_debatesum_checkpoint-1500_debatesum.json",
# "results/t5/r2/flan-t5-base_data_debatesum_checkpoint-1500_debatesum.json",
# "results/t5/r3/flan-t5-base_data_debatesum_checkpoint-1500_debatesum.json",

# Later cells read .scores_dataframe, .scores_by_beam and .sentences from this.
results = prepare_scores(
    "results/flan-t5-base_data_debatesum_checkpoint-1500_debatesum.json",
    df["target"],
    "rougeL",
)

# NOTE(review): absolute, user-specific path -- this cell only runs on the
# original machine; consider a path relative to the repository root.
# The context manager closes the file handle as soon as the JSON is parsed
# instead of leaving it open until garbage collection.
with open(
    "/home/mila/f/floresl/beam-search/results/bart_baseline/facebook_bart-base_debatesum.json",
    "r",
) as baseline_file:
    baseline_probs = json.load(baseline_file)

  abs(spearmanr(ground_truth_score, confidence_score_dict[str(k)]).statistic)  # type: ignore


In [7]:
# Spearman rank correlation of every remaining score column with "rougeL";
# the "sentences" and "dropout_sentences" columns are removed beforehand.
(
    results.scores_dataframe
    .drop(columns=["sentences", "dropout_sentences"])
    .corr(method="spearman")
    .loc[:, "rougeL"]
)

length_normalized_log_probs   -0.247232
mean_token_entropy             0.247735
dropout_bleu_variance          0.060817
dropout_meteor_score           0.038216
dropout_entropy               -0.037358
dropout_disagreement           0.024891
rougeL                         1.000000
beam_score_ratios_94           0.295760
beam_score_log_probs_94       -0.257581
beam_score_top_k_94           -0.253596
beam_score_impt_wt_94          0.253572
Name: rougeL, dtype: float64

### Tail Index

In [None]:
# Tabulate the "beam_score_log_probs" entry of results.scores_by_beam.
# Presumably one row per example and one column per beam -- TODO confirm.
beam_score_log_probs = pd.DataFrame(results.scores_by_beam["beam_score_log_probs"])

In [9]:
# Tail index of each example's softmax-normalised beam log-probs, then its
# rank correlation with ROUGE-L.
# np.asarray accepts both the DataFrame this name starts out as and the plain
# ndarray a later cell rebinds it to, so the cell can be re-run in any order
# (the original `.iloc` access failed once the name held an ndarray).
tail_indices = [
    tail_index(softmax(log_prob_row, temperature=1))
    for log_prob_row in np.asarray(beam_score_log_probs)
]

spearmanr(tail_indices, results.scores_dataframe["rougeL"])

SignificanceResult(statistic=0.3538730177374212, pvalue=7.185307233847468e-31)

### Baseline Metrics

In [11]:
# Baseline beam log-probs as a 2-D array; going through a DataFrame keeps the
# same list-of-rows -> table conversion used for the model's scores.
baseline_log_probs = pd.DataFrame(baseline_probs["beam_score_log_probs"]).to_numpy()

# np.asarray converts the DataFrame on the first run and is a no-op on an
# ndarray afterwards, so re-executing this cell no longer raises
# AttributeError (an ndarray has no `.values`).
beam_score_log_probs = np.asarray(beam_score_log_probs)

In [13]:
# Jensen-Shannon distance between the model's and the baseline's beam
# distributions for every example, rank-correlated with ROUGE-L.
model_log_probs = np.asarray(beam_score_log_probs)  # works for DataFrame or ndarray
rouge_l = results.scores_dataframe["rougeL"]

# One distance per scored example: spearmanr needs both inputs to have the
# same length, so the count comes from the data rather than a literal 1000.
js_distances = [
    jensenshannon(
        softmax(model_log_probs[i], temperature=1),
        softmax(baseline_log_probs[i], temperature=1),
    )
    for i in range(len(rouge_l))
]
spearmanr(js_distances, rouge_l)

SignificanceResult(statistic=0.24480883017997826, pvalue=4.106737226095836e-15)

In [14]:
# Jensen-Shannon distance between each example's beam distribution and the
# uniform distribution over beams, rank-correlated with ROUGE-L.
model_log_probs = np.asarray(beam_score_log_probs)  # works for DataFrame or ndarray
rouge_l = results.scores_dataframe["rougeL"]

# Size the uniform reference from the data instead of hard-coding 100 beams.
n_beams = model_log_probs.shape[1]
uniform_probs = np.full(n_beams, 1.0 / n_beams)

# One distance per scored example (spearmanr requires equal-length inputs),
# replacing the literal range(1000).
js_distances = [
    jensenshannon(softmax(log_prob_row, temperature=1), uniform_probs)
    for log_prob_row in model_log_probs[: len(rouge_l)]
]
spearmanr(js_distances, rouge_l)

SignificanceResult(statistic=0.35409802690685954, pvalue=6.5570065826067755e-31)

### Oracle Methods

In [15]:
# Alias for the sentences stored on `results`.
# NOTE(review): no cell visible here reads this name (the oracle loop indexes
# results.sentences directly) -- confirm it is used elsewhere before removing.
beam_search_sentences = results.sentences

In [17]:
def get_list_of_scores(target: str, predictions: list[str]):
    """Return the ROUGE-L score of every prediction against ``target``.

    Each prediction is passed to ``calculate_rouge_single(prediction, target)``
    and the ``"rougeL"`` entry of the result is kept.

    Args:
        target: Reference text every prediction is compared with.
        predictions: Candidate texts to score.

    Returns:
        A list with one ``"rougeL"`` value per prediction, in input order.
    """
    rouge_l_scores = []
    for candidate in predictions:
        rouge = calculate_rouge_single(candidate, target)
        rouge_l_scores.append(rouge["rougeL"])
    return rouge_l_scores

In [18]:
# Oracle signals: for every example, score each beam candidate against the
# reference (ROUGE-L) and summarise how the beam log-probs relate to quality.
#   weighted_avgs[i] -- mean beam log-prob, weighted by candidate ROUGE-L
#   corr_w_scores[i] -- Spearman correlation of beam log-probs with ROUGE-L
log_prob_rows = np.asarray(beam_score_log_probs)  # works for DataFrame or ndarray
# The lists are later correlated with results.scores_dataframe["rougeL"], so
# their length must match it; take the count from there, not a literal 1000.
n_examples = len(results.scores_dataframe)

weighted_avgs: list[float] = []
corr_w_scores: list[float] = []
for i in range(n_examples):
    log_probs = log_prob_rows[i]
    # .iloc makes the positional pairing of row i with sentence list i explicit.
    quality_scores = get_list_of_scores(df["target"].iloc[i], results.sentences[i])

    # np.average raises ZeroDivisionError when the weights sum to zero, so an
    # example whose candidates all score 0 falls back to 0.0.
    if sum(quality_scores) > 0:
        weighted_avg = float(np.average(log_probs, weights=quality_scores))
    else:
        weighted_avg = 0.0

    # Rank correlation is undefined when either input is constant: spearmanr
    # returns NaN and emits a warning. Skip the call in that case and record
    # 0.0, the same value the NaN was previously mapped to.
    if len(set(quality_scores)) < 2 or np.unique(log_probs).size < 2:
        corr_w_score = 0.0
    else:
        corr_w_score = float(spearmanr(log_probs, quality_scores).statistic)
        # Any remaining NaN (e.g. NaN inputs) also counts as no correlation.
        if np.isnan(corr_w_score):
            corr_w_score = 0.0

    weighted_avgs.append(weighted_avg)
    corr_w_scores.append(corr_w_score)

  corr_w_score = spearmanr(beam_score_log_probs[i], quality_scores).statistic


In [19]:
# Rank correlation between the ROUGE-L-weighted mean beam log-prob of each
# example and the "rougeL" column of results.scores_dataframe.
spearmanr(weighted_avgs, results.scores_dataframe["rougeL"])

SignificanceResult(statistic=-0.33360440929429996, pvalue=2.0250808238005607e-27)

In [20]:
# Rank correlation between each example's within-beam Spearman statistic
# (beam log-probs vs. candidate ROUGE-L) and the "rougeL" column of
# results.scores_dataframe.
spearmanr(corr_w_scores, results.scores_dataframe["rougeL"])

SignificanceResult(statistic=0.15082459022271166, pvalue=1.6598639615661723e-06)