In [None]:
from pycocoevalcap.bleu.bleu_scorer import BleuScorer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

def compute_scores(gts, res): 
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions,
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances) for each measure
    """

    # Set up scorers
    scorers = [
        (Bleu(4), ["BLEU_1", "BLEU_2", "BLEU_3", "BLEU_4"]),
        (Meteor(), "METEOR"), # sudo apt-get install default-jre <- to install java and not return error
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
        # (Spice(), "SPICE") 
    ]

    eval_res = {}
    eval_res_total = {}
    # Compute score for each metric
    for scorer, method in scorers:
        try:
            score, scores = scorer.compute_score(gts, res)
        except TypeError:
            score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                eval_res[m] = sc
            for sc, m in zip(scores, method):
                eval_res_total[m] = sc
        else:
            eval_res[method] = score
            eval_res_total[method] = scores


    return eval_res, eval_res_total

# The following code is taken from the pycocoevalcap library
# It is needed to put bleu_scorer.compute_score to verbose = 0

def compute_scores_bleu(gts, res): 
    """
    Performs the MS COCO evaluation using the Python 3 implementation (https://github.com/salaniz/pycocoevalcap)

    :param gts: Dictionary with the image ids and their gold captions,
    :param res: Dictionary with the image ids and their generated captions
    :print: Evaluation score (the mean of the scores of all the instances) for each measure
    """

    # Set up scorers
    scorers = [
        (Bleu(4), ["BLEU_1", "BLEU_2", "BLEU_3", "BLEU_4"]),
        (Meteor(), "METEOR")
    ]

    eval_res = {}
    eval_res_total = {}
    # Compute score for each metric
    for scorer, method in scorers:
        try:
            score, scores = scorer.compute_score(gts, res)
        except TypeError:
            score, scores = scorer.compute_score(gts, res)
        if type(method) == list:
            for sc, m in zip(score, method):
                eval_res[m] = sc
            for sc, m in zip(scores, method):
                eval_res_total[m] = sc
        else:
            eval_res[method] = score
            eval_res_total[method] = scores

    return eval_res, eval_res_total

# calculate bleu's total, common and non_common corpus average

def calculate_bleu_t_c_nc(df, second_column):
    gts = {i: [val] for i, val in enumerate(df["gt_caption"])}
    res = {i: [val] for i, val in enumerate(df[second_column])}
    total_bleu_average, _ = compute_scores_bleu(gts, res)

    # Common bleu corpus average
    common_df = df[df['label']==False]
    gts_common = {i: [val] for i, val in enumerate(common_df["gt_caption"])}
    res_common = {i: [val] for i, val in enumerate(common_df[second_column])}
    common_bleu_average, _ = compute_scores_bleu(gts_common, res_common)

    # Non-common bleu corpus average
    non_common_df = df[df['label']==True]
    gts_non_common = {i: [val] for i, val in enumerate(non_common_df["gt_caption"])}
    res_non_common = {i: [val] for i, val in enumerate(non_common_df[second_column])}
    non_common_bleu_average, _ = compute_scores_bleu(gts_non_common, res_non_common)

    complete = {
        'total': {
            'BLEU_1': total_bleu_average['BLEU_1'], 
            'BLEU_4': total_bleu_average['BLEU_4'],
            'METEOR': total_bleu_average['METEOR']
            },
        'common': {
            'BLEU_1': common_bleu_average['BLEU_1'], 
            'BLEU_4': common_bleu_average['BLEU_4'],
            'METEOR': common_bleu_average['METEOR']
            },
        'non_common': {
            'BLEU_1': non_common_bleu_average['BLEU_1'], 
            'BLEU_4': non_common_bleu_average['BLEU_4'],
            'METEOR': non_common_bleu_average['METEOR']
            }
    }
    return complete

class Bleu:
    def __init__(self, n=4):
        # default compute Blue score up to 4
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}

    def compute_score(self, gts, res):

        assert(gts.keys() == res.keys())
        imgIds = gts.keys()

        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]

            # Sanity check.
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
        
        # return (bleu, bleu_info)
        return score, scores

    def method(self):
        return "Bleu"

In [64]:
import json

with open("./data/experiment/biogpt.json") as f:
    biogpt_he = json.load(f)

with open("./data/experiment/biogpt-all.json") as f:
    biogpt_all = json.load(f)

with open("./data/experiment/generated_captions_he.txt") as f:
    baseline_he = f.readlines()

with open("./data/experiment/generated_captions_full.txt") as f:
    baseline_all = f.readlines()


In [65]:
import pandas as pd

biogpt_he_df = pd.DataFrame(biogpt_he).set_index('image_id').rename(columns={'caption':'biogpt_he_caption'})
biogpt_all_df = pd.DataFrame(biogpt_all).set_index('image_id').rename(columns={'caption':'biogpt_all_caption'})

biogpt_he_df['baseline_he'] = baseline_he
biogpt_all_df['baseline_all'] = baseline_all

In [66]:
import tqdm

def calculate_the_captioning_scores_in_batches(
    df: pd.DataFrame, 
    second_column: str,
    batch_size: int = 32
):
    """
    Computes captioning scores in batches and assigns the results to
    a new column in the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing the columns 'gt_caption' and `second_column`.
    second_column : str
        The name of the column that holds the generated captions for scoring.
    compute_scores_function : callable
        A function that takes in (gts, res) and returns (eval_res, eval_res_total).
        It's assumed to have the same signature as your original `compute_scores`.
    batch_size : int, optional
        The size of each batch to process, by default 32.

    Returns
    -------
    pd.DataFrame
        The same DataFrame with an additional column 
        f"{second_column}_eval_results" containing per-image scoring results.
    """

    # Prepare a list to hold the per-row dictionaries for the entire DataFrame
    all_instance_results = [None] * len(df)

    # Iterate over the DataFrame in chunks of size `batch_size`
    for start_idx in tqdm.tqdm(range(0, len(df), batch_size)):
        end_idx = min(start_idx + batch_size, len(df))
        df_subset = df.iloc[start_idx:end_idx]

        # Build the per-batch dictionaries of ground truth and generated captions
        # Notice the 0..N indexing within this subset for `compute_scores`
        local_gts = {i: [val] for i, val in enumerate(df_subset["gt_caption"])}
        local_res = {i: [val] for i, val in enumerate(df_subset[second_column])}

        # Compute the scores on this subset
        eval_res, eval_res_total = compute_scores(local_gts, local_res)

        # Assign the per-image scores into the correct slice of our final results
        for local_idx, global_idx in enumerate(range(start_idx, end_idx)):
            row_scores = {}
            for metric, scores_list in eval_res_total.items():
                row_scores[metric] = scores_list[local_idx]
            all_instance_results[global_idx] = row_scores

    # Assign the final per-image results to the new column
    df[f"{second_column}_eval_results"] = all_instance_results

    # Return the updated DataFrame. If you need any aggregated metric across
    # the entire dataset, you can compute or return it separately.
    return eval_res, df


In [67]:
biogpt_all_df["len_biogpt_all_caption"] = biogpt_all_df["biogpt_all_caption"].apply(lambda x: len(str(x).split()))
biogpt_all_df["len_baseline_all"] = biogpt_all_df["baseline_all"].apply(lambda x: len(str(x).split()))

print("Average length of biogpt_all_caption:", biogpt_all_df["len_biogpt_all_caption"].mean())
print("Average length of baseline_he:", biogpt_all_df["len_baseline_all"].mean())

biogpt_he_df["len_biogpt_he_caption"] = biogpt_he_df["biogpt_he_caption"].apply(lambda x: len(str(x).split()))
biogpt_he_df["len_baseline_he"] = biogpt_he_df["baseline_he"].apply(lambda x: len(str(x).split()))

print("Average length of biogpt_he_caption:", biogpt_he_df["len_biogpt_he_caption"].mean())
print("Average length of baseline_he:", biogpt_he_df["len_baseline_he"].mean())


Average length of biogpt_all_caption: 82.78730964467005
Average length of baseline_he: 250.73604060913706
Average length of biogpt_he_caption: 60.54619289340101
Average length of baseline_he: 158.56751269035533


In [68]:
biogpt_he_df['baseline_he'] = biogpt_he_df['baseline_he'].str.replace('\n', '')
biogpt_all_df['baseline_all'] = biogpt_all_df['baseline_all'].str.replace('\n', '')

In [69]:
captioning_metrics_biogpt_he, df_he = calculate_the_captioning_scores_in_batches(biogpt_he_df, 'biogpt_he_caption', 1970)
captioning_metrics_baseline_he, df_he = calculate_the_captioning_scores_in_batches(df_he, 'baseline_he', 1970)

100%|██████████| 1/1 [00:20<00:00, 20.37s/it]
100%|██████████| 1/1 [00:38<00:00, 38.89s/it]


In [70]:
captioning_metrics_biogpt_all, df_all = calculate_the_captioning_scores_in_batches(biogpt_all_df, 'biogpt_all_caption', 1970)
captioning_metrics_baseline_all, df_all = calculate_the_captioning_scores_in_batches(df_all, 'baseline_all', 1970)

100%|██████████| 1/1 [00:41<00:00, 41.21s/it]
100%|██████████| 1/1 [01:16<00:00, 76.21s/it]


In [71]:
captioning_metrics_biogpt_he

{'BLEU_1': 0.33895694732434295,
 'BLEU_2': 0.22218331259067925,
 'BLEU_3': 0.15995441847538674,
 'BLEU_4': 0.12387873113049473,
 'METEOR': 0.18264103292761122,
 'ROUGE_L': 0.28806295350039757,
 'CIDEr': 0.35653341827914276}

In [72]:
captioning_metrics_biogpt_all

{'BLEU_1': 0.3270022918591056,
 'BLEU_2': 0.222950965735909,
 'BLEU_3': 0.16408986234852505,
 'BLEU_4': 0.12737205327226825,
 'METEOR': 0.17833666656398903,
 'ROUGE_L': 0.3256555116847032,
 'CIDEr': 0.2913729757238771}

In [73]:
with open('./data/patient_info/patient_characteristics.xlsx', 'rb') as f:
    patient_info = pd.read_excel(f)

with open('./data/patient_info/report_id_specimen_map.json', 'rb') as f:
    report_id_specimen_map = json.load(f)

patient_info['specimen'] = patient_info['specimen'].apply(
    lambda x: x[:9] + "_" + x[9:]
)

specimen_report_id_map = {v: k for k, v in report_id_specimen_map.items()}
mapping = patient_info[['specimen', 'label']].set_index('specimen').to_dict()['label']

df_he['specimen'] = df_he.index.map(specimen_report_id_map)
df_he['label'] = df_he['specimen'].map(mapping)

df_all['specimen'] = df_all.index.map(specimen_report_id_map)
df_all['label'] = df_all['specimen'].map(mapping)

In [74]:
captioning_metrics_he = pd.DataFrame(df_he['biogpt_he_caption_eval_results'].to_dict()).T
captioning_metrics_baseline_he = pd.DataFrame(df_he['baseline_he_eval_results'].to_dict()).T

captioning_metrics_all = pd.DataFrame(df_all['biogpt_all_caption_eval_results'].to_dict()).T
captioning_metrics_baseline_all = pd.DataFrame(df_all['baseline_all_eval_results'].to_dict()).T

In [None]:
idx = [
    None
]

In [76]:
captioning_metrics_he['specimen'] = captioning_metrics_he.index.map(specimen_report_id_map)
captioning_metrics_he['label'] = captioning_metrics_he['specimen'].map(mapping)

captioning_metrics_baseline_he['specimen'] = captioning_metrics_baseline_he.index.map(specimen_report_id_map)
captioning_metrics_baseline_he['label'] = captioning_metrics_baseline_he['specimen'].map(mapping)

captioning_metrics_all['specimen'] = captioning_metrics_all.index.map(specimen_report_id_map)
captioning_metrics_all['label'] = captioning_metrics_all['specimen'].map(mapping)

captioning_metrics_baseline_all['specimen'] = captioning_metrics_baseline_all.index.map(specimen_report_id_map)
captioning_metrics_baseline_all['label'] = captioning_metrics_baseline_all['specimen'].map(mapping)

In [None]:
experiment_values = captioning_metrics_he[captioning_metrics_he['specimen'].isin(idx)]
experiment_values[experiment_values['label']==True]

In [78]:
captioning_metrics_he['label'].value_counts()

label
False    1597
True      373
Name: count, dtype: int64

In [79]:
import numpy as np

def bootstrap_confidence_intervals(df, second_column, metrics, bootstraps, alpha=0.95):
    """
    """
    lower_p = (1 - alpha) / 2 * 100
    upper_p = (1 + alpha) / 2 * 100

    common_subset = metrics[metrics['label']==False]
    non_common_subset = metrics[metrics['label']==True]

    results_total = {}
    results_common = {}
    results_non_common = {}

    # Different solution for BLEU, corpus average instead of normal average
    bootstrap_values_bleu_1 = {'total': [], 'common': [], 'non_common': []}
    bootstrap_values_bleu_4 = {'total': [], 'common': [], 'non_common': []}
    bootstrap_values_meteor = {'total': [], 'common': [], 'non_common': []}
    for _ in tqdm.tqdm(range(bootstraps)):
        # Total bleu corpus average
        idx = np.random.choice(df.index, len(df), replace=True)
        filtered_df = df.loc[idx]
        gts = {i: [val] for i, val in enumerate(filtered_df["gt_caption"])}
        res = {i: [val] for i, val in enumerate(filtered_df[second_column])}
        total_bleu_average, _ = compute_scores_bleu(gts, res)

        # Common bleu corpus average
        common_df = df[df['label']==False]
        idx_common = np.random.choice(common_df.index, len(common_df), replace=True)
        filtered_common_df = df.loc[idx_common]
        gts_common = {i: [val] for i, val in enumerate(filtered_common_df["gt_caption"])}
        res_common = {i: [val] for i, val in enumerate(filtered_common_df[second_column])}
        common_bleu_average, _ = compute_scores_bleu(gts_common, res_common)

        # Non-common bleu corpus average
        non_common_df = df[df['label']==True]
        idx_non_common = np.random.choice(non_common_df.index, len(non_common_df), replace=True)
        filtered_non_common_df = df.loc[idx_non_common]
        gts_non_common = {i: [val] for i, val in enumerate(filtered_non_common_df["gt_caption"])}
        res_non_common = {i: [val] for i, val in enumerate(filtered_non_common_df[second_column])}
        non_common_bleu_average, _ = compute_scores_bleu(gts_non_common, res_non_common)
        
        bootstrap_values_bleu_1['total'].append(total_bleu_average['BLEU_1'])
        bootstrap_values_bleu_1['common'].append(common_bleu_average['BLEU_1'])
        bootstrap_values_bleu_1['non_common'].append(non_common_bleu_average['BLEU_1'])

        bootstrap_values_bleu_4['total'].append(total_bleu_average['BLEU_4'])
        bootstrap_values_bleu_4['common'].append(common_bleu_average['BLEU_4'])
        bootstrap_values_bleu_4['non_common'].append(non_common_bleu_average['BLEU_4'])

        bootstrap_values_meteor['total'].append(total_bleu_average['METEOR'])
        bootstrap_values_meteor['common'].append(common_bleu_average['METEOR'])
        bootstrap_values_meteor['non_common'].append(non_common_bleu_average['METEOR'])
        
    results_total['BLEU_1'] = (
        np.percentile(bootstrap_values_bleu_1['total'], lower_p),
        np.percentile(bootstrap_values_bleu_1['total'], upper_p)
    )
    results_common['BLEU_1'] = (
        np.percentile(bootstrap_values_bleu_1['common'], lower_p),
        np.percentile(bootstrap_values_bleu_1['common'], upper_p)
    )
    results_non_common['BLEU_1'] = (
        np.percentile(bootstrap_values_bleu_1['non_common'], lower_p),
        np.percentile(bootstrap_values_bleu_1['non_common'], upper_p)
    )

    results_total['BLEU_4'] = (
        np.percentile(bootstrap_values_bleu_4['total'], lower_p),
        np.percentile(bootstrap_values_bleu_4['total'], upper_p)
    )
    results_common['BLEU_4'] = (
        np.percentile(bootstrap_values_bleu_4['common'], lower_p),
        np.percentile(bootstrap_values_bleu_4['common'], upper_p)
    )
    results_non_common['BLEU_4'] = (
        np.percentile(bootstrap_values_bleu_4['non_common'], lower_p),
        np.percentile(bootstrap_values_bleu_4['non_common'], upper_p)
    )

    results_total['METEOR'] = (
        np.percentile(bootstrap_values_meteor['total'], lower_p),
        np.percentile(bootstrap_values_meteor['total'], upper_p)
    )
    results_common['METEOR'] = (
        np.percentile(bootstrap_values_meteor['common'], lower_p),
        np.percentile(bootstrap_values_meteor['common'], upper_p)
    )
    results_non_common['METEOR'] = (
        np.percentile(bootstrap_values_meteor['non_common'], lower_p),
        np.percentile(bootstrap_values_meteor['non_common'], upper_p)
    )

    metric_list = ["ROUGE_L","CIDEr"]
    for metric in metric_list:
        metric_values = metrics[metric].values
        common_values = common_subset[metric].values
        non_common_values = non_common_subset[metric].values
        bootstrap_values = {'total': [], 'common': [], 'non_common': []}
        for _ in range(bootstraps):
            bootstrap_values['total'].append(np.mean(
                np.random.choice(metric_values, 
                                 len(metric_values), 
                                 replace=True))
                                 )
            bootstrap_values['common'].append(np.mean(
                np.random.choice(common_values, 
                                 len(common_subset), 
                                 replace=True))
                                 )
            bootstrap_values['non_common'].append(np.mean(
                np.random.choice(non_common_values, 
                                 len(non_common_subset), 
                                 replace=True))
                                 )
        results_total[metric] = (
            np.percentile(bootstrap_values['total'], lower_p),
            np.percentile(bootstrap_values['total'], upper_p)
        )
        results_common[metric] = (
            np.percentile(bootstrap_values['common'], lower_p),
            np.percentile(bootstrap_values['common'], upper_p)
        )
        results_non_common[metric] = (
            np.percentile(bootstrap_values['non_common'], lower_p),
            np.percentile(bootstrap_values['non_common'], upper_p)
        )


    results_total = pd.DataFrame(results_total, index=['lower', 'upper'])
    results_common = pd.DataFrame(results_common, index=['lower', 'upper'])
    results_non_common = pd.DataFrame(results_non_common, index=['lower', 'upper'])

    return results_total, results_common, results_non_common

In [80]:
n_bootstraps = 1000

In [81]:
total_he, common_he, non_common_he = bootstrap_confidence_intervals(df_he, 'biogpt_he_caption', captioning_metrics_he, n_bootstraps)

100%|██████████| 1000/1000 [9:45:57<00:00, 35.16s/it]  


In [82]:
he_scores = calculate_bleu_t_c_nc(df_he, 'biogpt_he_caption')

In [83]:
cap_metrics = ['ROUGE_L', 'CIDEr']

he_result = {**captioning_metrics_he[cap_metrics].mean().to_dict(), **he_scores['total']}
common_he_result = {**captioning_metrics_he.groupby('label')[cap_metrics].mean().T.to_dict()[False], **he_scores['common']}
non_common_he_result = {**captioning_metrics_he.groupby('label')[cap_metrics].mean().T.to_dict()[True], **he_scores['non_common']}

In [84]:
total_he_baseline, common_he_baseline, non_common_he_baseline = bootstrap_confidence_intervals(df_he, 'baseline_he', captioning_metrics_baseline_he, 1)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:47<00:00, 47.19s/it]


In [85]:
he_baseline_scores = calculate_bleu_t_c_nc(df_he, 'baseline_he')

In [86]:
he_baseline_result = {**captioning_metrics_baseline_he[cap_metrics].mean().to_dict(), **he_baseline_scores['total']}
common_he_baseline_result = {**captioning_metrics_baseline_he.groupby('label')[cap_metrics].mean().T.to_dict()[False], **he_baseline_scores['common']}
non_common_he_baseline_result = {**captioning_metrics_baseline_he.groupby('label')[cap_metrics].mean().T.to_dict()[True], **he_baseline_scores['non_common']}

In [87]:
total_all, common_all, non_common_all = bootstrap_confidence_intervals(df_all, "biogpt_all_caption", captioning_metrics_all, n_bootstraps)

100%|██████████| 1000/1000 [13:05:18<00:00, 47.12s/it] 


In [88]:
all_scores = calculate_bleu_t_c_nc(df_all, 'biogpt_all_caption')

In [89]:
all_result = {**captioning_metrics_all[cap_metrics].mean().to_dict(), **all_scores['total']}
common_all_result = {**captioning_metrics_all.groupby('label')[cap_metrics].mean().T.to_dict()[False], **all_scores['common']}
non_common_all_result = {**captioning_metrics_all.groupby('label')[cap_metrics].mean().T.to_dict()[True], **all_scores['non_common']}

In [90]:
total_all_baseline, common_all_baseline, non_common_all_baseline = bootstrap_confidence_intervals(df_all, 'baseline_all', captioning_metrics_baseline_all, 1)

100%|██████████| 1/1 [01:13<00:00, 73.40s/it]


In [91]:
all_baseline_scores = calculate_bleu_t_c_nc(df_all, 'baseline_all')

In [92]:
all_baseline_result = {**captioning_metrics_baseline_all[cap_metrics].mean().to_dict(), **all_baseline_scores['total']}
common_all_baseline_result = {**captioning_metrics_baseline_all.groupby('label')[cap_metrics].mean().T.to_dict()[False], **all_baseline_scores['common']}
non_common_all_baseline_result = {**captioning_metrics_baseline_all.groupby('label')[cap_metrics].mean().T.to_dict()[True], **all_baseline_scores['non_common']}

In [93]:
import pandas as pd

# Define the data list with model names, subsets, result dictionaries, and confidence interval DataFrames
data = [
    ('biogpt_he', 'Total', he_result, total_he),
    ('biogpt_he', 'Common', common_he_result, common_he),
    ('biogpt_he', 'Non-common', non_common_he_result, non_common_he),
    ('baseline_he', 'Total', he_baseline_result, total_he_baseline),
    ('baseline_he', 'Common', common_he_baseline_result, common_he_baseline),
    ('baseline_he', 'Non-common', non_common_he_baseline_result, non_common_he_baseline),
    ('biogpt_all', 'Total', all_result, total_all),
    ('biogpt_all', 'Common', common_all_result, common_all),
    ('biogpt_all', 'Non-common', non_common_all_result, non_common_all),
    ('baseline_all', 'Total', all_baseline_result, total_all_baseline),
    ('baseline_all', 'Common', common_all_baseline_result, common_all_baseline),
    ('baseline_all', 'Non-common', non_common_all_baseline_result, non_common_all_baseline),
]

# Define the metrics to include in the table
metrics = ['BLEU_1', 'BLEU_4', 'METEOR', 'ROUGE_L', 'CIDEr']

# Create a list to hold the table rows
rows = []

# Populate the rows with formatted metric values and confidence intervals
for model, subset, result_dict, ci_df in data:
    row = {'Model': model, 'Subset': subset}
    for metric in metrics:
        value = result_dict[metric]
        lower = ci_df.loc['lower', metric]
        upper = ci_df.loc['upper', metric]
        # Format the string as "value (lower - upper)" with 3 decimal places
        formatted = f"{value:.3f} ({lower:.3f} - {upper:.3f})"
        row[metric] = formatted
    rows.append(row)

# Create a DataFrame from the rows
df_table = pd.DataFrame(rows)

# Set the multi-index using 'Model' and 'Subset'
df_table.set_index(['Model', 'Subset'], inplace=True)

# Display the table
df_table

Unnamed: 0_level_0,Unnamed: 1_level_0,BLEU_1,BLEU_4,METEOR,ROUGE_L,CIDEr
Model,Subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
biogpt_he,Total,0.339 (0.331 - 0.347),0.124 (0.117 - 0.131),0.183 (0.179 - 0.186),0.288 (0.280 - 0.296),0.357 (0.301 - 0.413)
biogpt_he,Common,0.357 (0.349 - 0.367),0.138 (0.129 - 0.147),0.194 (0.190 - 0.198),0.303 (0.294 - 0.312),0.424 (0.354 - 0.493)
biogpt_he,Non-common,0.289 (0.275 - 0.304),0.086 (0.076 - 0.097),0.154 (0.149 - 0.160),0.224 (0.214 - 0.235),0.069 (0.042 - 0.109)
baseline_he,Total,0.203 (0.203 - 0.203),0.031 (0.031 - 0.031),0.186 (0.185 - 0.185),0.163 (0.161 - 0.161),0.012 (0.012 - 0.012)
baseline_he,Common,0.192 (0.191 - 0.191),0.031 (0.030 - 0.030),0.191 (0.189 - 0.189),0.166 (0.168 - 0.168),0.013 (0.013 - 0.013)
baseline_he,Non-common,0.250 (0.252 - 0.252),0.030 (0.031 - 0.031),0.170 (0.171 - 0.171),0.152 (0.153 - 0.153),0.008 (0.008 - 0.008)
biogpt_all,Total,0.327 (0.317 - 0.337),0.127 (0.121 - 0.134),0.178 (0.174 - 0.183),0.326 (0.319 - 0.333),0.291 (0.249 - 0.339)
biogpt_all,Common,0.371 (0.362 - 0.381),0.150 (0.142 - 0.157),0.199 (0.194 - 0.203),0.349 (0.341 - 0.356),0.343 (0.289 - 0.404)
biogpt_all,Non-common,0.242 (0.219 - 0.263),0.085 (0.073 - 0.095),0.140 (0.132 - 0.147),0.226 (0.214 - 0.240),0.068 (0.045 - 0.095)
baseline_all,Total,0.180 (0.175 - 0.175),0.029 (0.028 - 0.028),0.167 (0.164 - 0.164),0.159 (0.160 - 0.160),0.010 (0.009 - 0.009)
