In [24]:
import numpy as np
import tqdm 

def _bootstrap_confidence_intervals(logits, bootstraps, true_false_bool):
    """
    Computes bootstrap confidence intervals for retrieval metrics by resampling
    the *query* set only, while keeping the reference set intact.

    Args:
        logits (dict): A dictionary containing:
                        - logits["image_to_text"]: shape (N, N)
                        - logits["text_to_image"]: shape (N, N)
        bootstraps (int): Number of bootstrap iterations.

    Returns:
        dict: Dictionary with metric names as keys and dictionaries containing 
                mean, ci_lower, and ci_upper as values.
    """
    alpha = 0.95
    lower_p = (1 - alpha) / 2 * 100
    upper_p = (1 + alpha) / 2 * 100
    N = logits["image_to_text"].shape[0]  # number of samples (images/text)

    ci_results = {}
    
    # We’ll define a helper to compute rank-based metrics for a set of ranks:
    def compute_rank_metrics(ranks_array):
        ranks_array = np.array(ranks_array)
        # ranks_array is 0-based rank
        mean_rank = np.mean(ranks_array) + 1.0   # 1-based
        median_rank = np.floor(np.median(ranks_array)) + 1.0
        r1 = np.mean(ranks_array < 1.0)
        r5 = np.mean(ranks_array < 5.0)
        r10 = np.mean(ranks_array < 10.0)
        return mean_rank, median_rank, r1, r5, r10
    
    def get_true_false_rankings(rankings, true_labels):
        true_rankings = []
        false_rankings = []
        for i, label in enumerate(true_labels):
            if label:
                true_rankings.append(rankings[i])
            else:
                false_rankings.append(rankings[i])
        return true_rankings, false_rankings
    
    median_bootstraps = {}

    # For each retrieval direction, do query-only bootstrapping
    # 'image_to_text' => images are queries, text is reference
    # 'text_to_image' => text is query, images are reference
    for name in ['image_to_text', 'text_to_image']:
        logits_2d = logits[name].numpy()  # shape (N, N), query x reference
        bootstrap_mean_rank = []
        bootstrap_median_rank = []
        bootstrap_R1 = []
        bootstrap_R5 = []
        bootstrap_R10 = []

        bootstrap_mean_rank_common = []
        bootstrap_median_rank_common = []
        bootstrap_R1_common = []
        bootstrap_R5_common = []
        bootstrap_R10_common = []

        bootstrap_mean_rank_non_common = []
        bootstrap_median_rank_non_common = []
        bootstrap_R1_non_common = []
        bootstrap_R5_non_common = []
        bootstrap_R10_non_common = []

        for _ in tqdm.tqdm(range(bootstraps)):
            # Sample query indices (with replacement)
            sampled_indices = np.random.randint(0, N, size=(N,))
            
            # Compute ranks for each query in 'sampled_indices'
            ranks_resampled = []
            # For i-th query, the correct reference is the same index (i.e., i)
            # but since we are sampling with replacement, we need to find 
            # the rank of 'sampled_indices[i]' in row i of the 2D logits.
            # Actually, if name == 'image_to_text', the correct text index is the same 
            # as the image index. But here, we have "row = image_i, col = text_i".
            
            # So for each bootstrapped query i, the "real" row is 'sampled_indices[i]', 
            # and the correct reference is 'sampled_indices[i]' as well.
            
            # 1) Get that row's logits
            # 2) Sort descending
            # 3) Find where 'sampled_indices[i]' sits
            row_logits = logits_2d[sampled_indices]  # shape (N, N)
            # row_logits[i] is the logits for the query "sampled_indices[i]" 
            # against all references. We find the rank of reference = sampled_indices[i] in that row.
            
            # Sort indices in descending order for each row
            ranking = np.argsort(-row_logits, axis=1)  # shape (N, N) (descending)
            
            for i in range(N):
                correct_ref = sampled_indices[i]
                row_i = ranking[i]  # the sorted indices for the i-th query in the resampled set
                # where is correct_ref in row_i?
                rank_pos = np.where(row_i == correct_ref)[0][0]  # 0-based rank
                ranks_resampled.append(rank_pos)
            
            ranks_resampled = np.array(ranks_resampled)

            # Compute the metrics
            m_rank, md_rank, R1_, R5_, R10_ = compute_rank_metrics(ranks_resampled)
            bootstrap_mean_rank.append(m_rank)
            bootstrap_median_rank.append(md_rank)
            bootstrap_R1.append(R1_)
            bootstrap_R5.append(R5_)
            bootstrap_R10.append(R10_)

            # Get the true and false rankings
            true_rankings, false_rankings = get_true_false_rankings(ranks_resampled, true_false_bool)

            # Compute the metrics for common and non-common
            m_rank, md_rank, R1_, R5_, R10_ = compute_rank_metrics(true_rankings)
            bootstrap_mean_rank_common.append(m_rank)
            bootstrap_median_rank_common.append(md_rank)
            bootstrap_R1_common.append(R1_)
            bootstrap_R5_common.append(R5_)
            bootstrap_R10_common.append(R10_)

            m_rank, md_rank, R1_, R5_, R10_ = compute_rank_metrics(false_rankings)
            bootstrap_mean_rank_non_common.append(m_rank)
            bootstrap_median_rank_non_common.append(md_rank)
            bootstrap_R1_non_common.append(R1_)
            bootstrap_R5_non_common.append(R5_)
            bootstrap_R10_non_common.append(R10_)

        # Convert bootstrap samples to np.array
        bootstrap_mean_rank = np.array(bootstrap_mean_rank)
        bootstrap_median_rank = np.array(bootstrap_median_rank)
        bootstrap_R1 = np.array(bootstrap_R1)
        bootstrap_R5 = np.array(bootstrap_R5)
        bootstrap_R10 = np.array(bootstrap_R10)

        bootstrap_mean_rank_common = np.array(bootstrap_mean_rank_common)
        bootstrap_median_rank_common = np.array(bootstrap_median_rank_common)
        bootstrap_R1_common = np.array(bootstrap_R1_common)
        bootstrap_R5_common = np.array(bootstrap_R5_common)
        bootstrap_R10_common = np.array(bootstrap_R10_common)

        bootstrap_mean_rank_non_common = np.array(bootstrap_mean_rank_non_common)
        bootstrap_median_rank_non_common = np.array(bootstrap_median_rank_non_common)
        bootstrap_R1_non_common = np.array(bootstrap_R1_non_common)
        bootstrap_R5_non_common = np.array(bootstrap_R5_non_common)
        bootstrap_R10_non_common = np.array(bootstrap_R10_non_common)

        # Store mean & CI bounds
        ci_results[f"{name}_mean_rank"] = {
            'mean': float(np.mean(bootstrap_mean_rank)),
            'ci_lower': float(np.percentile(bootstrap_mean_rank, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_mean_rank, upper_p))
        }
        ci_results[f"{name}_median_rank"] = {
            'mean': float(np.mean(bootstrap_median_rank)),
            'ci_lower': float(np.percentile(bootstrap_median_rank, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_median_rank, upper_p))
        }
        ci_results[f"{name}_R@1"] = {
            'mean': float(np.mean(bootstrap_R1)),
            'ci_lower': float(np.percentile(bootstrap_R1, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R1, upper_p))
        }
        ci_results[f"{name}_R@5"] = {
            'mean': float(np.mean(bootstrap_R5)),
            'ci_lower': float(np.percentile(bootstrap_R5, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R5, upper_p))
        }
        ci_results[f"{name}_R@10"] = {
            'mean': float(np.mean(bootstrap_R10)),
            'ci_lower': float(np.percentile(bootstrap_R10, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R10, upper_p))
        }

        ci_results[f"{name}_mean_rank_common"] = {
            'mean': float(np.mean(bootstrap_mean_rank_common)),
            'ci_lower': float(np.percentile(bootstrap_mean_rank_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_mean_rank_common, upper_p))
        }
        ci_results[f"{name}_median_rank_common"] = {
            'mean': float(np.mean(bootstrap_median_rank_common)),
            'ci_lower': float(np.percentile(bootstrap_median_rank_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_median_rank_common, upper_p))
        }
        ci_results[f"{name}_R@1_common"] = {
            'mean': float(np.mean(bootstrap_R1_common)),
            'ci_lower': float(np.percentile(bootstrap_R1_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R1_common, upper_p))
        }
        ci_results[f"{name}_R@5_common"] = {
            'mean': float(np.mean(bootstrap_R5_common)),
            'ci_lower': float(np.percentile(bootstrap_R5_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R5_common, upper_p))
        }
        ci_results[f"{name}_R@10_common"] = {
            'mean': float(np.mean(bootstrap_R10_common)),
            'ci_lower': float(np.percentile(bootstrap_R10_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R10_common, upper_p))
        }

        ci_results[f"{name}_mean_rank_non_common"] = {
            'mean': float(np.mean(bootstrap_mean_rank_non_common)),
            'ci_lower': float(np.percentile(bootstrap_mean_rank_non_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_mean_rank_non_common, upper_p))
        }
        ci_results[f"{name}_median_rank_non_common"] = {
            'mean': float(np.mean(bootstrap_median_rank_non_common)),
            'ci_lower': float(np.percentile(bootstrap_median_rank_non_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_median_rank_non_common, upper_p))
        }
        ci_results[f"{name}_R@1_non_common"] = {
            'mean': float(np.mean(bootstrap_R1_non_common)),
            'ci_lower': float(np.percentile(bootstrap_R1_non_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R1_non_common, upper_p))
        }
        ci_results[f"{name}_R@5_non_common"] = {
            'mean': float(np.mean(bootstrap_R5_non_common)),
            'ci_lower': float(np.percentile(bootstrap_R5_non_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R5_non_common, upper_p))
        }
        ci_results[f"{name}_R@10_non_common"] = {
            'mean': float(np.mean(bootstrap_R10_non_common)),
            'ci_lower': float(np.percentile(bootstrap_R10_non_common, lower_p)),
            'ci_upper': float(np.percentile(bootstrap_R10_non_common, upper_p))
        }

        median_bootstraps[f"{name}"] = bootstrap_median_rank
    return ci_results, median_bootstraps

In [25]:
import torch

all_trained_all_retrieved = torch.load("./projects/pathology/data/logits/all_logits_hande.pt")
all_trained_he_retrieved = torch.load("./projects/pathology/data/logits/all_logits_all.pt")
he_trained_he_retrieved = torch.load("./projects/pathology/data/logits/hande_logits_hande.pt")
he_trained_all_retrieved = torch.load("./projects/pathology/data/logits/hande_logits_all.pt")

In [26]:
import pandas as pd
import json

with open('./projects/pathology/data/patient_info/patient_characteristics.xlsx', 'rb') as f:
    patient_info = pd.read_excel(f)

with open('./projects/pathology/data/patient_info/report_id_specimen_map.json', 'rb') as f:
    report_id_specimen_map = json.load(f)

patient_info['specimen'] = patient_info['specimen'].apply(
    lambda x: x[:9] + "_" + x[9:]
)
specimen_report_id_map = {v: k for k, v in report_id_specimen_map.items()}

with open("./projects/pathology/data/experiment/biogpt.json", "r") as f:
    biogpt_order = json.load(f)

image_id_order = [x['image_id'] for x in biogpt_order]

mapped_image_ids = [specimen_report_id_map[image_id] for image_id in image_id_order if image_id in specimen_report_id_map]

mapping_to_f_t = patient_info.set_index('specimen')['label'].to_dict()

mapped_image_ids_f_t = [mapping_to_f_t[image_id] for image_id in mapped_image_ids if image_id in mapping_to_f_t]

In [27]:
he_full, median_rank_1 = _bootstrap_confidence_intervals(all_trained_he_retrieved, 1000, mapped_image_ids_f_t)

100%|██████████| 1000/1000 [03:16<00:00,  5.09it/s]
100%|██████████| 1000/1000 [03:17<00:00,  5.06it/s]


In [28]:
full_full, median_rank_2 = _bootstrap_confidence_intervals(all_trained_all_retrieved, 1000, mapped_image_ids_f_t)

100%|██████████| 1000/1000 [03:15<00:00,  5.11it/s]
100%|██████████| 1000/1000 [03:16<00:00,  5.09it/s]


In [29]:
he_he, median_rank_3 = _bootstrap_confidence_intervals(he_trained_he_retrieved, 1000, mapped_image_ids_f_t)

100%|██████████| 1000/1000 [03:15<00:00,  5.11it/s]
100%|██████████| 1000/1000 [03:16<00:00,  5.10it/s]


In [30]:
full_he, median_rank_4 = _bootstrap_confidence_intervals(he_trained_all_retrieved, 1000, mapped_image_ids_f_t)

100%|██████████| 1000/1000 [03:16<00:00,  5.09it/s]
100%|██████████| 1000/1000 [03:16<00:00,  5.09it/s]


In [31]:
he_full = {'image_to_text_mean_rank': {'mean': 172.3839994923858,
  'ci_lower': 162.84826142131982,
  'ci_upper': 182.69098984771574},
 'image_to_text_median_rank': {'mean': 78.905,
  'ci_lower': 72.0,
  'ci_upper': 88.0},
 'image_to_text_R@1': {'mean': 0.018539086294416245,
  'ci_lower': 0.01318527918781727,
  'ci_upper': 0.024365482233502538},
 'image_to_text_R@5': {'mean': 0.08009390862944163,
  'ci_lower': 0.06852791878172589,
  'ci_upper': 0.09187817258883249},
 'image_to_text_R@10': {'mean': 0.13129543147208123,
  'ci_lower': 0.1182741116751269,
  'ci_upper': 0.14519035532994923},
 'text_to_image_mean_rank': {'mean': 202.15955177664975,
  'ci_lower': 190.928769035533,
  'ci_upper': 214.27479695431472},
 'text_to_image_median_rank': {'mean': 101.431,
  'ci_lower': 92.0,
  'ci_upper': 109.0},
 'text_to_image_R@1': {'mean': 0.01417309644670051,
  'ci_lower': 0.009644670050761422,
  'ci_upper': 0.01979695431472081},
 'text_to_image_R@5': {'mean': 0.06464720812182742,
  'ci_lower': 0.05380710659898477,
  'ci_upper': 0.07461928934010152},
 'text_to_image_R@10': {'mean': 0.11330761421319796,
  'ci_lower': 0.09949238578680203,
  'ci_upper': 0.12741116751269035}}

full_full = {'image_to_text_mean_rank': {'mean': 83.27324365482234,
  'ci_lower': 77.44775380710661,
  'ci_upper': 89.59619289340102},
 'image_to_text_median_rank': {'mean': 31.715,
  'ci_lower': 29.0,
  'ci_upper': 35.0},
 'image_to_text_R@1': {'mean': 0.057805076142131984,
  'ci_lower': 0.04821065989847717,
  'ci_upper': 0.06852791878172589},
 'image_to_text_R@5': {'mean': 0.181792385786802,
  'ci_lower': 0.16446700507614212,
  'ci_upper': 0.19848984771573602},
 'image_to_text_R@10': {'mean': 0.27975634517766496,
  'ci_lower': 0.2593908629441624,
  'ci_upper': 0.300507614213198},
 'text_to_image_mean_rank': {'mean': 86.90838730964468,
  'ci_lower': 80.49159898477157,
  'ci_upper': 93.01997461928934},
 'text_to_image_median_rank': {'mean': 31.467,
  'ci_lower': 29.0,
  'ci_upper': 35.02499999999998},
 'text_to_image_R@1': {'mean': 0.05804213197969543,
  'ci_lower': 0.047715736040609136,
  'ci_upper': 0.06802030456852792},
 'text_to_image_R@5': {'mean': 0.17233401015228428,
  'ci_lower': 0.15532994923857868,
  'ci_upper': 0.18883248730964466},
 'text_to_image_R@10': {'mean': 0.27001065989847717,
  'ci_lower': 0.250253807106599,
  'ci_upper': 0.28883248730964467}}

he_he = {'image_to_text_mean_rank': {'mean': 137.77397461928933,
  'ci_lower': 128.8506472081218,
  'ci_upper': 146.77970812182738},
 'image_to_text_median_rank': {'mean': 60.878,
  'ci_lower': 55.97500000000002,
  'ci_upper': 67.0},
 'image_to_text_R@1': {'mean': 0.0235248730964467,
  'ci_lower': 0.017258883248730966,
  'ci_upper': 0.030456852791878174},
 'image_to_text_R@5': {'mean': 0.10601776649746193,
  'ci_lower': 0.09289340101522843,
  'ci_upper': 0.11928934010152284},
 'image_to_text_R@10': {'mean': 0.17565482233502538,
  'ci_lower': 0.15939086294416244,
  'ci_upper': 0.19289340101522842},
 'text_to_image_mean_rank': {'mean': 140.92329492385787,
  'ci_lower': 131.8305456852792,
  'ci_upper': 150.30319796954313},
 'text_to_image_median_rank': {'mean': 61.007,
  'ci_lower': 56.0,
  'ci_upper': 67.0},
 'text_to_image_R@1': {'mean': 0.026875634517766495,
  'ci_lower': 0.01979695431472081,
  'ci_upper': 0.03401015228426396},
 'text_to_image_R@5': {'mean': 0.1040005076142132,
  'ci_lower': 0.09086294416243655,
  'ci_upper': 0.116751269035533},
 'text_to_image_R@10': {'mean': 0.16814111675126903,
  'ci_lower': 0.15024111675126905,
  'ci_upper': 0.1847715736040609}}

full_he = {'image_to_text_mean_rank': {'mean': 135.69216446700506,
  'ci_lower': 127.52586294416244,
  'ci_upper': 144.51873096446698},
 'image_to_text_median_rank': {'mean': 62.854,
  'ci_lower': 56.0,
  'ci_upper': 68.0},
 'image_to_text_R@1': {'mean': 0.024320812182741115,
  'ci_lower': 0.017258883248730966,
  'ci_upper': 0.03147208121827411},
 'image_to_text_R@5': {'mean': 0.09498071065989848,
  'ci_lower': 0.08222081218274112,
  'ci_upper': 0.10761421319796954},
 'image_to_text_R@10': {'mean': 0.1637167512690355,
  'ci_lower': 0.14771573604060914,
  'ci_upper': 0.1802030456852792},
 'text_to_image_mean_rank': {'mean': 138.47823299492387,
  'ci_lower': 130.45125634517765,
  'ci_upper': 146.7815228426396},
 'text_to_image_median_rank': {'mean': 61.211,
  'ci_lower': 56.97500000000002,
  'ci_upper': 67.0},
 'text_to_image_R@1': {'mean': 0.023779695431472077,
  'ci_lower': 0.017258883248730966,
  'ci_upper': 0.030964467005076143},
 'text_to_image_R@5': {'mean': 0.08999390862944162,
  'ci_lower': 0.07766497461928934,
  'ci_upper': 0.10304568527918782},
 'text_to_image_R@10': {'mean': 0.15444213197969542,
  'ci_lower': 0.13807106598984772,
  'ci_upper': 0.17055837563451776}}

In [32]:
import pandas as pd

# Define the datasets
datasets = {
    'he_he': he_he,
    'full_he': full_he,
    'he_full': he_full,
    'full_full': full_full,
}

# Initialize lists to hold rows for each table
text_to_image_rows = []
image_to_text_rows = []

# Process each dataset
for name, data in datasets.items():
    # Extract text_to_image metrics
    text_row = {'dataset': name}
    for key in data:
        if key.startswith('text_to_image_'):
            metric = key[len('text_to_image_'):]
            text_row[f'{metric}_mean'] = data[key]['mean']
            text_row[f'{metric}_ci_lower'] = data[key]['ci_lower']
            text_row[f'{metric}_ci_upper'] = data[key]['ci_upper']
    text_to_image_rows.append(text_row)
    
    # Extract image_to_text metrics
    image_row = {'dataset': name}
    for key in data:
        if key.startswith('image_to_text_'):
            metric = key[len('image_to_text_'):]
            image_row[f'{metric}_mean'] = data[key]['mean']
            image_row[f'{metric}_ci_lower'] = data[key]['ci_lower']
            image_row[f'{metric}_ci_upper'] = data[key]['ci_upper']
    image_to_text_rows.append(image_row)

# Create DataFrames

column_order = [
    'R@1_mean', 'R@1_ci_lower', 'R@1_ci_upper',
    'R@5_mean', 'R@5_ci_lower', 'R@5_ci_upper',
    'R@10_mean', 'R@10_ci_lower', 'R@10_ci_upper',
    'mean_rank_mean', 'mean_rank_ci_lower', 'mean_rank_ci_upper',
    'median_rank_mean', 'median_rank_ci_lower', 'median_rank_ci_upper'
]


text_to_image_df = pd.DataFrame(text_to_image_rows).set_index('dataset')[column_order]
image_to_text_df = pd.DataFrame(image_to_text_rows).set_index('dataset')[column_order]

In [None]:
text_to_image_df

Unnamed: 0_level_0,R@1_mean,R@1_ci_lower,R@1_ci_upper,R@5_mean,R@5_ci_lower,R@5_ci_upper,R@10_mean,R@10_ci_lower,R@10_ci_upper,mean_rank_mean,mean_rank_ci_lower,mean_rank_ci_upper,median_rank_mean,median_rank_ci_lower,median_rank_ci_upper
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
he_he,0.026876,0.019797,0.03401,0.104001,0.090863,0.116751,0.168141,0.150241,0.184772,140.923295,131.830546,150.303198,61.007,56.0,67.0
full_he,0.02378,0.017259,0.030964,0.089994,0.077665,0.103046,0.154442,0.138071,0.170558,138.478233,130.451256,146.781523,61.211,56.975,67.0
he_full,0.014173,0.009645,0.019797,0.064647,0.053807,0.074619,0.113308,0.099492,0.127411,202.159552,190.928769,214.274797,101.431,92.0,109.0
full_full,0.058042,0.047716,0.06802,0.172334,0.15533,0.188832,0.270011,0.250254,0.288832,86.908387,80.491599,93.019975,31.467,29.0,35.025


In [None]:
def truncate(number, decimals=0):
    """Truncate number without rounding"""
    if not isinstance(number, (int, float)):
        return number
    factor = 10 ** decimals
    return int(number * factor) / factor

# Format numbers
for df in [text_to_image_df, image_to_text_df]:
    # Format Recall@k metrics (3 decimals)
    recall_cols = [col for col in df.columns if col.startswith('R@')]
    df[recall_cols] = df[recall_cols].applymap(lambda x: f"{truncate(x, 3):.3f}")
    
    # Format Rank metrics (1 decimal)
    rank_cols = [col for col in df.columns if 'rank' in col]
    df[rank_cols] = df[rank_cols].applymap(lambda x: f"{truncate(x, 1):.1f}")


In [None]:
image_to_text_df

Unnamed: 0_level_0,R@1_mean,R@1_ci_lower,R@1_ci_upper,R@5_mean,R@5_ci_lower,R@5_ci_upper,R@10_mean,R@10_ci_lower,R@10_ci_upper,mean_rank_mean,mean_rank_ci_lower,mean_rank_ci_upper,median_rank_mean,median_rank_ci_lower,median_rank_ci_upper
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
he_he,0.023,0.017,0.03,0.106,0.092,0.119,0.175,0.159,0.192,137.7,128.8,146.7,60.8,55.9,67.0
full_he,0.024,0.017,0.031,0.094,0.082,0.107,0.163,0.147,0.18,135.6,127.5,144.5,62.8,56.0,68.0
he_full,0.018,0.013,0.024,0.08,0.068,0.091,0.131,0.118,0.145,172.3,162.8,182.6,78.9,72.0,88.0
full_full,0.057,0.048,0.068,0.181,0.164,0.198,0.279,0.259,0.3,83.2,77.4,89.5,31.7,29.0,35.0


In [None]:
text_to_image_df

Unnamed: 0_level_0,R@1_mean,R@1_ci_lower,R@1_ci_upper,R@5_mean,R@5_ci_lower,R@5_ci_upper,R@10_mean,R@10_ci_lower,R@10_ci_upper,mean_rank_mean,mean_rank_ci_lower,mean_rank_ci_upper,median_rank_mean,median_rank_ci_lower,median_rank_ci_upper
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
he_he,0.026,0.019,0.034,0.104,0.09,0.116,0.168,0.15,0.184,140.9,131.8,150.3,61.0,56.0,67.0
full_he,0.023,0.017,0.03,0.089,0.077,0.103,0.154,0.138,0.17,138.4,130.4,146.7,61.2,56.9,67.0
he_full,0.014,0.009,0.019,0.064,0.053,0.074,0.113,0.099,0.127,202.1,190.9,214.2,101.4,92.0,109.0
full_full,0.058,0.047,0.068,0.172,0.155,0.188,0.27,0.25,0.288,86.9,80.4,93.0,31.4,29.0,35.0
