In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from IPython.display import display
import numpy as np

# %matplotlib inline

from pathlib import Path


In [17]:
def collect_one(run, csv_folder, test_config='general'):
    #if run.name == 'baseline':
    parameters = {p.split('=')[0]: p.split('=')[1] for p in run.as_posix().split('/') if "=" in p}
    # else:
    #     with open(run / 'config.json', 'r') as f:
    #         cfg = json.load(f)
            
    #     run_dir = cfg["run_dir"]
    #     parameters = {p.split('=')[0]: p.split('=')[1] for p in run_dir.split('/') if "=" in p}

    data = []
    for yamlf in (run / 'inference' / csv_folder).rglob('*.csv'):
        test_config_name = yamlf.parent.stem
        if test_config not in yamlf.parent.stem:
            continue
        df = pd.read_csv(yamlf)
        # keep only the last line. This is because until this commit, we were appending the logs and not overriding them, so the last line is the one we want
        df = df.tail(1)
        df['tok_position_inference'] = 'beginning' if 'tok_beginning' in test_config_name else 'in_place' if 'tok_in_place' in test_config_name else None
        df['inference_config'] = test_config_name.split('retrieval_')[-1] if 'retrieval_' in test_config_name else None
        if run.name == 'baseline':
            df['model'] = 'clip_original'
        data.append(df)
    
    data = pd.concat(data)
    # data.columns.names = ['type', 'metric']
    # data.sort_values(by=['type', 'metric'], axis=1, inplace=True)
    data.drop(columns=['epoch', 'step'], inplace=True)
    
    if data.empty:
        print(f'Pred folder is empty: {csv_folder}')
    
    for k, v in parameters.items():
        data[k] = v
    
    return data

def collect_all(root, csv_folder, test_config='general'):
    root = Path(root)
    metrics = [collect_one(csvf.parents[1], csvf.name, test_config=test_config) for csvf in list(root.rglob(csv_folder))]
    metrics = pd.concat(metrics, ignore_index=True)
    return metrics

default_fields_dict = {
    'r1': lambda x: u"{:.1f}".format(x),
    'r5': lambda x: u"{:.1f}".format(x),
    'r10': lambda x: u"{:.1f}".format(x),
    'meanr': lambda x: u"{:.1f}".format(x),
    'medr': lambda x: int(x),
    'spice': lambda x: u"{:.3f}".format(x),
    'spacy': lambda x: u"{:.3f}".format(x),
}
def render_to_latex(metrics, rename_func=default_fields_dict, **latex_kwargs):
    m = metrics.copy()
    # renaming
    for col, lambda_fn in rename_func.items():
        m[col] = m[col].apply(lambda_fn)
    # m = m.applymap(lambda x: u"{:.2f}".format(x))
    ltex = m.style.to_latex(
        **latex_kwargs
    )
    return ltex

In [18]:
# Compute metrics for each detected run

def summarize_metrics(
        metrics, 
        dataset=None, 
        model=None,
        translator=None,
        tok_position=None, 
        training_setup=None, 
        inference_config=None,
        loss=None, 
        learning_rate=None, 
        finetuning=None, 
        drop_i2t=True,
        decimal_places=3):
    
    if dataset is not None:
        metrics = metrics[(metrics['data'] == dataset)]
        metrics.drop(columns="data", inplace=True)

    # TODO: as of now, there is only one split seed.
    # In the future, we would have to average among different splits
    # metrics.drop(columns="split_seed", inplace=True)

    id_vars = ['data', 'model', 'translator', 'tok_position', 'training-setup', 'loss', 'lr', 'finetuning', 'inference_config']

    if translator is not None:
        metrics = metrics[metrics['translator'].isin(translator)]
        if len(translator) == 1:
            metrics.drop(columns="translator", inplace=True)
            id_vars.remove('translator')
    if model is not None:
        metrics = metrics[metrics['model'].isin(model)]
        if len(model) == 1:
            metrics.drop(columns="model", inplace=True)
            id_vars.remove('model')
    if learning_rate is not None:
        metrics = metrics[metrics['lr'].isin(learning_rate)]
        if len(learning_rate) == 1:
            metrics.drop(columns="lr", inplace=True)
            id_vars.remove('lr')
    if finetuning is not None:
        metrics = metrics[metrics['finetuning'].isin(finetuning)]
        if len(finetuning) == 1:
            metrics.drop(columns="finetuning", inplace=True)
            id_vars.remove('finetuning')
    if tok_position is not None:
        metrics = metrics[metrics['tok_position'].isin(tok_position)]
        if len(tok_position) == 1:
            metrics.drop(columns="tok_position", inplace=True)
            id_vars.remove('tok_position')
    if training_setup is not None:
        metrics = metrics[metrics['training-setup'].isin(training_setup)]
        if len(training_setup) == 1:
            metrics.drop(columns="training-setup", inplace=True)
            id_vars.remove('training-setup')
    if loss is not None:
        metrics = metrics[metrics['loss'].isin(loss)]
        if len(loss) == 1:
            metrics.drop(columns="loss", inplace=True)
            id_vars.remove('loss')
    if inference_config is not None:
        metrics = metrics[metrics['inference_config'].isin(inference_config)]
        if len(inference_config) == 1:
            metrics.drop(columns="inference_config", inplace=True)
            id_vars.remove('inference_config')

    if drop_i2t:
        # remove columns containing i2t in the name of the second level of the multiindex
        metrics = metrics.loc[:, ~metrics.columns.str.contains('i2t')]

    # round to given decimal places
    metrics = metrics.round(decimal_places)

    metrics.set_index(id_vars, inplace=True)
    metrics.sort_index(inplace=True)

    return metrics

In [19]:
# rename content of the table
def rename_fn(v):
    mapping = {'ContrastiveFixed': 'Triplet',
               'InfoNCELoss': 'InfoNCE'}
    if v in mapping:
        return mapping[v]
    return v

def render_to_latex(metrics, rename_func=default_fields_dict, **latex_kwargs):
    m = metrics.copy()
     # make bold the best values

    # Custom function to highlight the maximum value in each group
    def highlight_best(data):
        attr = 'font-weight: bold'
        result = pd.DataFrame('', index=data.index, columns=data.columns)
        for col in data.columns:
            best_idx = data[col].idxmax()
            # for idx in best_idx.values:
            result.loc[best_idx, col] = attr
        return result

    styled_df = m.style.apply(highlight_best, axis=None)
    ltex = styled_df.format(precision=2).to_latex(
        **latex_kwargs
    )
    return ltex

# Results - General retrieval (best contrastive sum checkpoint)

In [26]:
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-contrastive-sum')
metrics_baselines = collect_all(ROOT, 'original_checkpoint')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5"], #, np.nan],
    model=["idclip"],
    inference_config=["conf1", "conf3", "conf1_static5", "conf3_static5", "baseline", "baseline_with_original_names"],
    tok_position=["tok_beginning_multi_prompts"],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove contrastive_sum columns
metrics.drop(columns="contrastive_sum", inplace=True)

# remove data, translator, tok_position, training-setup, loss, lr, tok_position_inference from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# reorder columns to t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["t2i-r@1", "t2i-r@5", "t2i-r@10", "t2i-r@50", "contrastive_t2i_sum"]]

# transform in percentage
metrics = metrics * 100

# # Select the top 2 rows with the highest "contrastive_t2i_sum" for each group of multiindex elements except "inference_config"
# # a = metrics.groupby(['model', 'tok_position', 'finetuning'])['contrastive_t2i_sum'].apply(lambda x: x.nlargest(1).index)
# # Reset the index to work with groupby and nlargest
# metrics_reset = metrics.reset_index()
# a = metrics_reset.groupby(['model', 'tok_position', 'finetuning'])['contrastive_t2i_sum'].nlargest(2)
# # Select the top 2 rows with the highest "contrastive_t2i_sum" for each group of multiindex elements except "inference_config"
# best_metrics = metrics_reset.loc[metrics_reset.groupby(['model', 'tok_position', 'finetuning'])['contrastive_t2i_sum'].nlargest(5).index.get_level_values(-1).tolist()]
# # Set the index back to the original multiindex
# best_metrics.set_index(['model', 'tok_position', 'finetuning', 'inference_config'], inplace=True)

# best_metrics

latex = render_to_latex(
    metrics, 
    caption="General Retrieval",
    clines="skip-last;data",
    hrules=True,
    column_format="llllccccc",
    convert_css=True
)

metrics.to_csv('metrics_training_with_tok_at_beginning.csv')

print(latex)

# metrics

\begin{table}
\caption{General Retrieval}
\begin{tabular}{llllccccc}
\toprule
 &  & t2i-r@1 & t2i-r@5 & t2i-r@10 & t2i-r@50 & contrastive_t2i_sum \\
finetuning & inference_config &  &  &  &  &  \\
\midrule
\multirow[c]{6}{*}{disabled} & baseline & 12.10 & 56.20 & 69.10 & 92.20 & 229.60 \\
 & baseline_with_original_names & 26.20 & 58.20 & 70.10 & 91.80 & 246.30 \\
 & conf1 & 36.30 & 66.60 & 77.80 & 95.20 & 275.90 \\
 & conf1_static5 & 39.30 & 69.70 & \bfseries 80.80 & 96.50 & 286.20 \\
 & conf3 & 36.40 & 65.00 & 75.80 & 94.40 & 271.50 \\
 & conf3_static5 & 38.00 & 67.10 & 78.30 & 95.50 & 278.90 \\
\cline{1-7}
\multirow[c]{6}{*}{shallow-vpt-5} & baseline & 12.40 & 57.90 & 70.60 & 93.20 & 234.10 \\
 & baseline_with_original_names & 27.90 & 61.70 & 72.90 & 93.00 & 255.50 \\
 & conf1 & 39.20 & 68.20 & 78.50 & 95.70 & 281.60 \\
 & conf1_static5 & \bfseries 43.20 & \bfseries 71.20 & 80.80 & \bfseries 96.70 & \bfseries 291.90 \\
 & conf3 & 37.50 & 66.10 & 76.30 & 94.30 & 274.10 \\
 & conf3_sta

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics.drop(columns="model", inplace=True)


# Results - Entities retrieval (best contrastive sum checkpoint)

In [25]:
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-contrastive-sum', test_config='entities')
metrics_baselines = collect_all(ROOT, 'original_checkpoint', test_config='entities')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5"], #, np.nan],
    model=["idclip"],
    # inference_config=["conf4", "baseline_with_original_names"],
    tok_position=["tok_beginning_multi_prompts"],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove all columns containing "entity-r"
metrics = metrics.loc[:, ~metrics.columns.str.contains('entity-r')]
metrics.drop(columns="entities_sum", inplace=True)

# remove data, translator, tok_position, training-setup, loss, lr, tok_position_inference from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# reorder columns to t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["entity-kmin-r@1", "entity-kmin-r@5", "entity-kmin-r@10", "entity-kmin-r@50", "entities_kmin_sum", "mAP"]]

# transform in percentage
metrics = metrics * 100

latex = render_to_latex(
    metrics, 
    caption="Entities Retrieval",
    clines="skip-last;data",
    hrules=True,
    column_format="llllcccccc",
    convert_css=True
)

metrics
# print(latex)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metrics.drop(columns="model", inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,entity-kmin-r@1,entity-kmin-r@5,entity-kmin-r@10,entity-kmin-r@50,entities_kmin_sum,mAP
finetuning,inference_config,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
disabled,baseline_with_original_names,12.8,6.0,8.4,17.5,44.8,2.1
disabled,conf1,15.4,11.1,13.5,28.0,68.0,3.1
disabled,conf1_2toks,16.5,10.8,13.1,28.1,68.5,3.0
disabled,conf1_3toks,18.2,10.8,12.8,27.4,69.2,3.1
disabled,conf2,20.7,12.6,14.4,28.0,75.7,3.7
disabled,conf3,20.4,11.8,13.9,28.0,74.1,3.6
disabled,conf4,21.8,11.8,14.2,28.2,75.9,3.6
disabled,conf5,21.8,11.8,14.0,27.5,75.1,3.6
disabled,conf6,16.2,10.1,12.2,27.9,66.4,2.9
shallow-vpt-5,baseline_with_original_names,15.4,7.3,9.2,19.1,51.0,2.5


# Results - General retrieval (best entities sum checkpoint)

In [None]:
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-entities-sum')
metrics_baselines = collect_all(ROOT, 'original_checkpoint')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove contrastive_sum columns
metrics.drop(columns="contrastive_sum", inplace=True)

# remove data, translator, tok_position, training-setup, loss, lr, tok_position_inference from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# reorder columns to t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["t2i-r@1", "t2i-r@5", "t2i-r@10", "t2i-r@50", "contrastive_t2i_sum"]]

# transform in percentage
metrics = metrics * 100

# latex = render_to_latex(
#     metrics, 
#     caption="General Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllccccc",
#     convert_css=True
# )

# print(latex)

metrics

# Results - Entities retrieval (best entities sum checkpoint)

In [None]:
# collect all data
ROOT = "runs"

metrics = collect_all(ROOT, 'best-entities-sum', test_config='entities')
metrics_baselines = collect_all(ROOT, 'original_checkpoint', test_config='entities')    # baseline model
metrics_concat = pd.concat([metrics, metrics_baselines], axis=0, join='outer')

metrics = summarize_metrics(
    metrics_concat,
    training_setup=["with_entities", np.nan],
    finetuning=["disabled", "shallow-vpt-5", np.nan],
    # tok_position=["tok_in_place_multi_prompts", np.nan],
    # tok_position_inference=["in_place", None]
    )

# remove all columns containing "entity-r"
metrics = metrics.loc[:, ~metrics.columns.str.contains('entity-r')]
metrics.drop(columns="entities_sum", inplace=True)

# remove data, translator, tok_position, training-setup, loss, lr, tok_position_inference from the multi index
metrics.index = metrics.index.droplevel(['data', 'translator', 'training-setup', 'loss', 'lr'])

# reorder columns to t2i-r@1 t2i-r@5 t2i-r@10 t2i-r@50 contrastive_t2i_sum
metrics = metrics[["entity-kmin-r@1", "entity-kmin-r@5", "entity-kmin-r@10", "entity-kmin-r@50", "entities_kmin_sum", "mAP"]]

# transform in percentage
metrics = metrics * 100

metrics

# latex = render_to_latex(
#     metrics, 
#     caption="Entities Retrieval",
#     clines="skip-last;data",
#     hrules=True,
#     column_format="llllcccccc",
#     convert_css=True
# )

# print(latex)