In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from IPython.display import display
import numpy as np

# %matplotlib inline

from pathlib import Path


In [2]:
def collect_one(run, csv_folder, test_config='general'):
    """Collect the final metrics row of every matching inference CSV of one run.

    CSV files are searched recursively under ``run / 'inference' / csv_folder``.
    A file is used only when ``test_config`` is a substring of the name of the
    directory that directly contains it.

    Args:
        run: ``pathlib.Path`` of the run directory. Every ``key=value``
            component of this path becomes a constant column ``key``.
        csv_folder: name of the sub-folder of ``run / 'inference'`` to scan.
        test_config: substring the CSV's parent directory name must contain.

    Returns:
        A DataFrame with one row per matching CSV (its last line), without the
        ``epoch``/``step`` columns, plus ``tok_position_inference``,
        ``inference_config`` and one column per path parameter.
        An empty DataFrame when no CSV matches.
    """
    # Hyper-parameters are encoded in the run path as `key=value` components.
    parameters = {p.split('=')[0]: p.split('=')[1] for p in run.as_posix().split('/') if "=" in p}

    frames = []
    for csv_path in (run / 'inference' / csv_folder).rglob('*.csv'):
        test_config_name = csv_path.parent.stem
        if test_config not in test_config_name:
            continue

        # Keep only the last line: older logs were appended instead of being
        # overwritten, so the last line is the one we want.
        # `.copy()` makes the column assignments below act on an independent
        # frame rather than on a slice of the file contents.
        df = pd.read_csv(csv_path).tail(1).copy()

        if 'tok_beginning' in test_config_name:
            tok_position = 'beginning'
        elif 'tok_in_place' in test_config_name:
            tok_position = 'in_place'
        else:
            tok_position = None
        df['tok_position_inference'] = tok_position
        df['inference_config'] = test_config_name.split('_')[-1] if 'conf' in test_config_name else None
        if run.name == 'baseline':
            df['model'] = 'clip_original'
        frames.append(df)

    # pd.concat raises on an empty list, so the empty case is handled first.
    if not frames:
        print(f'Pred folder is empty: {csv_folder}')
        return pd.DataFrame()

    data = pd.concat(frames)
    # errors='ignore': a CSV without these bookkeeping columns is still usable.
    data = data.drop(columns=['epoch', 'step'], errors='ignore')

    for k, v in parameters.items():
        data[k] = v

    return data

def collect_all(root, csv_folder, test_config='general'):
    """Gather the metrics of every run found below ``root``.

    Every path named ``csv_folder`` anywhere under ``root`` is treated as the
    prediction folder of a run; the run directory is taken to be two levels
    above it (``collect_one`` reads ``run / 'inference' / csv_folder``).

    Args:
        root: directory (str or Path) to search recursively.
        csv_folder: name of the prediction folder to look for.
        test_config: forwarded to ``collect_one``.

    Returns:
        One DataFrame with the rows of all runs and a fresh integer index, or
        an empty DataFrame when no ``csv_folder`` exists under ``root``.
    """
    root = Path(root)
    per_run = [
        collect_one(match.parents[1], match.name, test_config=test_config)
        for match in root.rglob(csv_folder)
    ]
    # pd.concat raises "No objects to concatenate" on an empty list.
    if not per_run:
        print(f'No "{csv_folder}" folder found under: {root}')
        return pd.DataFrame()
    return pd.concat(per_run, ignore_index=True)

# Column name -> callable that turns a raw metric value into its table cell.
# Recall-style metrics and the mean rank use one decimal, the median rank is
# truncated to an integer, and the caption metrics use three decimals.
default_fields_dict = {
    'r1': "{:.1f}".format,
    'r5': "{:.1f}".format,
    'r10': "{:.1f}".format,
    'meanr': "{:.1f}".format,
    'medr': int,
    'spice': "{:.3f}".format,
    'spacy': "{:.3f}".format,
}
def render_to_latex(metrics, rename_func=default_fields_dict, **latex_kwargs):
    """Render a metrics table to LaTeX after formatting selected columns.

    NOTE: a second ``render_to_latex`` is defined further down in this
    notebook; once that cell has run, it replaces this definition.

    Args:
        metrics: DataFrame to render; it is copied, never modified.
        rename_func: mapping ``column name -> callable`` applied element-wise
            to that column. Entries for columns the table does not contain
            are skipped.
        **latex_kwargs: forwarded to ``Styler.to_latex``.

    Returns:
        The LaTeX source of the table as a string.
    """
    m = metrics.copy()
    for col, format_fn in rename_func.items():
        # The default mapping lists metrics a given table may not have;
        # indexing a missing column would raise KeyError.
        if col not in m.columns:
            continue
        m[col] = m[col].apply(format_fn)
    return m.style.to_latex(**latex_kwargs)

In [3]:
# Compute metrics for each detected run

def summarize_metrics(
        metrics,
        dataset=None,
        model=None,
        translator=None,
        tok_position=None,
        training_setup=None,
        inference_config=None,
        loss=None,
        learning_rate=None,
        finetuning=None,
        drop_i2t=True,
        decimal_places=3):
    """Filter a collected metrics table and index it by its run parameters.

    Args:
        metrics: DataFrame holding the parameter columns ``data``, ``model``,
            ``translator``, ``tok_position``, ``training-setup``, ``loss``,
            ``lr``, ``finetuning`` and ``inference_config`` plus the metric
            columns. It is not modified.
        dataset: single value; keep only rows whose ``data`` equals it. The
            ``data`` column is then dropped.
        model, translator, tok_position, training_setup, inference_config,
        loss, learning_rate, finetuning: list-like of accepted values for the
            matching column (``learning_rate`` filters ``lr``,
            ``training_setup`` filters ``training-setup``). ``None`` disables
            the filter. When exactly one value is given, the now-constant
            column is dropped instead of becoming an index level.
        drop_i2t: drop every column whose name contains ``'i2t'``.
        decimal_places: number of decimals the values are rounded to.

    Returns:
        A new DataFrame indexed by the remaining parameter columns.
    """
    id_vars = ['data', 'model', 'translator', 'tok_position', 'training-setup', 'loss', 'lr', 'finetuning', 'inference_config']

    if dataset is not None:
        metrics = metrics[metrics['data'] == dataset].drop(columns="data")
        # 'data' no longer exists, so it must not be used as an index level.
        id_vars.remove('data')

    # TODO: as of now, there is only one split seed.
    # In the future, we would have to average among different splits
    # metrics.drop(columns="split_seed", inplace=True)

    # (column, accepted values) pairs; each filter uses its OWN argument.
    filters = [
        ('translator', translator),
        ('model', model),
        ('lr', learning_rate),
        ('finetuning', finetuning),
        ('tok_position', tok_position),
        ('training-setup', training_setup),
        ('loss', loss),
        ('inference_config', inference_config),
    ]
    for column, allowed in filters:
        if allowed is None:
            continue
        metrics = metrics[metrics[column].isin(allowed)]
        if len(allowed) == 1:
            # A single accepted value makes the column constant: drop it.
            metrics = metrics.drop(columns=column)
            id_vars.remove(column)

    if drop_i2t:
        # remove the columns containing i2t in their name
        metrics = metrics.loc[:, ~metrics.columns.str.contains('i2t')]

    # round to given decimal places
    metrics = metrics.round(decimal_places)

    return metrics.set_index(id_vars)

In [4]:
# rename content of the table
def rename_fn(v):
    """Return the display name of a loss identifier; other values pass through unchanged."""
    display_names = {
        'ContrastiveFixed': 'Triplet',
        'InfoNCELoss': 'InfoNCE',
    }
    return display_names.get(v, v)

def render_to_latex(metrics, rename_func=default_fields_dict, **latex_kwargs):
    """Render a metrics table to LaTeX with the largest value of each column in bold.

    Args:
        metrics: DataFrame to render; a copy is styled.
        rename_func: accepted but not used by this implementation.
        **latex_kwargs: forwarded to ``Styler.to_latex``.

    Returns:
        The LaTeX source of the table, with values shown at precision 2.
    """
    def highlight_best(data):
        # Build a same-shaped frame of CSS strings: empty everywhere except
        # the cell holding the maximum of each column.
        bold = 'font-weight: bold'
        styles = pd.DataFrame('', index=data.index, columns=data.columns)
        for column in data.columns:
            styles.loc[data[column].idxmax(), column] = bold
        return styles

    # axis=None hands the whole frame to highlight_best in a single call.
    styler = metrics.copy().style.apply(highlight_best, axis=None)
    return styler.format(precision=2).to_latex(**latex_kwargs)

# Results - General retrieval (best contrastive sum checkpoint)

In [21]:
# General retrieval table built from the 'best-contrastive-sum' prediction folders.
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-contrastive-sum')
metrics_baselines = collect_all(ROOT, 'original_checkpoint')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

# np.nan is listed among the accepted values, presumably so that rows lacking
# the parameter (e.g. the baseline) are kept -- TODO confirm
metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove the contrastive_sum column
metrics.drop(columns="contrastive_sum", inplace=True)

# drop the data, translator, training-setup, loss and lr levels from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# keep only these columns, in this order: t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["t2i-r@1", "t2i-r@5", "t2i-r@10", "t2i-r@50", "contrastive_t2i_sum"]]

# convert to percentages
metrics = metrics * 100

# latex = render_to_latex(
#     metrics, 
#     caption="General Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllccccc",
#     convert_css=True
# )

# print(latex)

metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t2i-r@1,t2i-r@5,t2i-r@10,t2i-r@50,contrastive_t2i_sum
model,tok_position,finetuning,inference_config,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
idclip,tok_in_place_multi_prompts,disabled,conf5,37.7,66.4,77.1,94.6,275.8
idclip,tok_in_place_multi_prompts,disabled,conf1,37.9,69.1,80.4,96.3,283.7
idclip,tok_in_place_multi_prompts,disabled,conf2,37.7,66.5,76.9,95.0,276.2
idclip,tok_in_place_multi_prompts,disabled,conf4,37.5,66.3,76.9,94.6,275.4
idclip,tok_in_place_multi_prompts,disabled,conf3,37.6,67.0,77.4,95.0,277.1
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf5,38.8,65.1,75.2,93.5,272.5
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf1,40.8,69.1,79.2,95.7,284.8
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf2,38.7,65.4,75.7,93.8,273.6
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf4,38.7,65.7,75.8,93.7,273.9
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf3,35.9,63.5,73.7,92.8,265.8


# Results - Entities retrieval (best contrastive sum checkpoint)

In [22]:
# Entities retrieval table built from the 'best-contrastive-sum' prediction folders.
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-contrastive-sum', test_config='entities')
metrics_baselines = collect_all(ROOT, 'original_checkpoint', test_config='entities')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

# np.nan is listed among the accepted values, presumably so that rows lacking
# the parameter (e.g. the baseline) are kept -- TODO confirm
metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove all columns containing "entity-r", then the entities_sum column
metrics = metrics.loc[:, ~metrics.columns.str.contains('entity-r')]
metrics.drop(columns="entities_sum", inplace=True)

# drop the data, translator, training-setup, loss and lr levels from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# keep only these columns, in this order: entity-kmin-r@1 entity-kmin-r@5 entity-kmin-r@10 entity-kmin-r@50 entities_kmin_sum mAP
metrics = metrics[["entity-kmin-r@1", "entity-kmin-r@5", "entity-kmin-r@10", "entity-kmin-r@50", "entities_kmin_sum", "mAP"]]

# convert to percentages
metrics = metrics * 100

metrics

# latex = render_to_latex(
#     metrics, 
#     caption="Entities Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllcccccc",
#     convert_css=True
# )

# print(latex)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,entity-kmin-r@1,entity-kmin-r@5,entity-kmin-r@10,entity-kmin-r@50,entities_kmin_sum,mAP
model,tok_position,finetuning,inference_config,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
idclip,tok_in_place_multi_prompts,disabled,conf5,22.6,11.3,13.3,29.0,76.3,3.6
idclip,tok_in_place_multi_prompts,disabled,conf2,20.9,11.3,13.3,28.5,74.0,3.6
idclip,tok_in_place_multi_prompts,disabled,conf3,19.3,10.4,13.0,28.4,71.0,3.3
idclip,tok_in_place_multi_prompts,disabled,conf1,15.9,9.3,12.8,28.4,66.4,2.9
idclip,tok_in_place_multi_prompts,disabled,conf4,21.8,11.3,12.9,29.6,75.7,3.6
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf5,22.9,13.6,16.2,31.2,83.9,4.3
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf2,22.1,12.6,15.8,30.2,80.7,4.1
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf3,21.2,11.9,13.7,29.2,76.0,3.6
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf1,19.3,12.3,14.9,32.0,78.4,3.8
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf4,21.5,13.2,16.1,31.2,82.0,4.1


# Results - General retrieval (best entities sum checkpoint)

In [17]:
# General retrieval table built from the 'best-entities-sum' prediction folders.
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-entities-sum')
metrics_baselines = collect_all(ROOT, 'original_checkpoint')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

# np.nan is listed among the accepted values, presumably so that rows lacking
# the parameter (e.g. the baseline) are kept -- TODO confirm
metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove the contrastive_sum column
metrics.drop(columns="contrastive_sum", inplace=True)

# drop the data, translator, training-setup, loss and lr levels from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# keep only these columns, in this order: t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["t2i-r@1", "t2i-r@5", "t2i-r@10", "t2i-r@50", "contrastive_t2i_sum"]]

# convert to percentages
metrics = metrics * 100

# latex = render_to_latex(
#     metrics, 
#     caption="General Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllccccc",
#     convert_css=True
# )

# print(latex)

metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t2i-r@1,t2i-r@5,t2i-r@10,t2i-r@50,contrastive_t2i_sum
model,tok_position,finetuning,inference_config,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
idclip,tok_in_place_multi_prompts,disabled,conf5,36.1,63.5,74.0,93.5,267.1
idclip,tok_in_place_multi_prompts,disabled,conf1,36.6,66.9,77.6,95.3,276.4
idclip,tok_in_place_multi_prompts,disabled,conf2,36.3,63.7,74.4,93.9,268.4
idclip,tok_in_place_multi_prompts,disabled,conf4,36.2,63.2,73.9,93.6,266.8
idclip,tok_in_place_multi_prompts,disabled,conf3,36.1,64.0,75.1,93.9,269.1
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf1,40.7,68.6,79.0,95.7,284.0
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf2,38.3,65.5,75.4,93.8,272.9
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf3,36.1,63.5,73.8,93.0,266.4
idclip,tok_beginning_multi_prompts,disabled,conf5,35.9,63.3,74.6,93.4,267.2
idclip,tok_beginning_multi_prompts,disabled,conf1,36.9,67.2,78.0,95.4,277.5


# Results - Entities retrieval (best entities sum checkpoint)

In [20]:
# Entities retrieval table built from the 'best-entities-sum' prediction folders.
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-entities-sum', test_config='entities')
metrics_baselines = collect_all(ROOT, 'original_checkpoint', test_config='entities')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

# np.nan is listed among the accepted values, presumably so that rows lacking
# the parameter (e.g. the baseline) are kept -- TODO confirm
metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove all columns containing "entity-r", then the entities_sum column
metrics = metrics.loc[:, ~metrics.columns.str.contains('entity-r')]
metrics.drop(columns="entities_sum", inplace=True)

# drop the data, translator, training-setup, loss and lr levels from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# keep only these columns, in this order: entity-kmin-r@1 entity-kmin-r@5 entity-kmin-r@10 entity-kmin-r@50 entities_kmin_sum mAP
metrics = metrics[["entity-kmin-r@1", "entity-kmin-r@5", "entity-kmin-r@10", "entity-kmin-r@50", "entities_kmin_sum", "mAP"]]

# convert to percentages
metrics = metrics * 100

metrics

# latex = render_to_latex(
#     metrics, 
#     caption="Entities Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllcccccc",
#     convert_css=True
# )

# print(latex)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,entity-kmin-r@1,entity-kmin-r@5,entity-kmin-r@10,entity-kmin-r@50,entities_kmin_sum,mAP
model,tok_position,finetuning,inference_config,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
idclip,tok_in_place_multi_prompts,disabled,conf5,19.8,10.8,13.5,27.1,71.3,3.5
idclip,tok_in_place_multi_prompts,disabled,conf2,20.1,11.1,12.6,27.8,71.7,3.4
idclip,tok_in_place_multi_prompts,disabled,conf3,19.8,10.7,11.9,26.7,69.1,3.3
idclip,tok_in_place_multi_prompts,disabled,conf1,16.2,9.4,11.9,27.1,64.6,3.0
idclip,tok_in_place_multi_prompts,disabled,conf4,20.7,10.7,13.3,27.9,72.6,3.6
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf5,22.9,13.8,16.2,30.7,83.6,4.2
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf2,22.6,12.5,15.3,30.2,80.6,4.0
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf3,21.5,11.4,13.6,28.5,75.0,3.6
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf1,21.5,12.4,14.9,31.3,80.1,3.9
idclip,tok_in_place_multi_prompts,shallow-vpt-5,conf4,24.3,13.0,16.4,31.3,84.9,4.2
