In [1]:
import numpy as np
import pandas as pd

# Hyperparameter tuning results

In [2]:
def remove_underscore_after(val): # Remove underscore, and keep the part after the underscore
    return val.split('_')[-1]
def remove_underscore_before(val): # Remove underscore, and keep the part before the underscore
    return val.split('_')[0]
def map2d(func, grid): # Mapping for 2d arrays, from: https://stackoverflow.com/questions/70742445/elegant-map-over-2d-list
    return [[func(value) for value in row] for row in grid]
def full_display(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df)
def prep_and_store_results(txt_path: str): # Prepare the (text-file) results to be stored in a csv
    with open(txt_path, 'r') as f:
        results = f.readlines()

    # Remove any non-result lines from the eval file, and split the lines on the tab character
    # (results have format: model_name\tdataset_name\tmetric_name\tmetric_value)
    results = [r.replace('\n','').split('\t') for r in results if '\t' in r]

    # Make a dataframe from the results
    df = pd.DataFrame(results, columns = ['model', 'dataset', 'metric', 'value'])
    df['value'] = pd.to_numeric(df['value'])
    # Remove the timestamp from the model names
    df['model'] = df['model'].map(lambda x: '-'.join(x.split('-')[2:]))

    # Make a list of model names, split by parameters - model names look like var1_xxx-var2_yyy-var3_zzz-... so split on '-'
    models = df['model'].str.split('-').tolist()
    
    # Remove all underscores from our 2d list, keep one list of the param names and one with param vals
    model_names_list = map2d(remove_underscore_before, models) # Keep the part before the underscore, aka the variable name
    model_names = np.array(model_names_list)
    model_val_list = map2d(remove_underscore_after, models) # Keep the part after the underscore, aka the variable's value
    model_vals = np.array(model_val_list)
    
    print('Number of evaluations:', model_vals.shape[0])

    # Splitting model name into columns, using the list of variables and their values
    for i in range(model_names.shape[-1]):
        name = model_names[0][i]
        val = model_vals[:,i]
        df[name] = val
        try: # Try to make columns numeric if possible
            df[name] = pd.to_numeric(df[name]) 
        except:
            pass
    
    # Remove columns that aren't used
    df = df.drop(['vit', 'model', 'data', 'ALL',  'kw'], axis = 1) #'method', 'AL.iter', 'ratio', 'PL',
    if 'fold' in df.columns.tolist():
        df = df.drop(['fold'], axis = 1)
    
    # Replace 'None' with NaN, to allow conversion to numerical
    df['AL.iter'] = df['AL.iter'].replace('None', np.nan)
    df['AL.iter'] = pd.to_numeric(df['AL.iter'])
    df['AL.epochs'] = df['AL.epochs'].replace('None', np.nan)
    df['AL.epochs'] = pd.to_numeric(df['AL.epochs'])

    cols = sorted(df.columns.tolist()) # Get a list of the columns of the dataframe
    print('Column names:', cols)

    display(df)

    # Group by the model parameters and randomly X model runs to use in the analysis (5 for test, 9 for val) 
    df_grouped = df.groupby(list(set(cols)-set(['value'])), dropna = False).sample(frac = 1).head(5 if 'test' in txt_path else 9)
    # Compute mean, std performance and number of runs for each model 
    df_grouped = df_grouped.agg({'value':['mean', 'std', 'count']})

    df_grouped.to_csv(txt_path.replace('.txt', '.csv'))
    display(df_grouped)
    return df_grouped

def get_results_per_model(df, hyperparam_tuning = True):
    if hyperparam_tuning: # Only report on the results for a specific label ratio if we're hyperparam tuning
        df = df[(df['ratio'] == 0.1)]
    df_no_finetune = df[(df['epochs']==0)]
    df_baseline = df[(df['AL.iter'].isna()) & (df['method'] == 'base') & (df['epochs'] > 0)]
    df_S_CLIP = df[(df['AL.iter'].isna()) & (df['method'] == 'ours') & (df['PL'].str.contains('ot.'))]
    df_soft_PL = df[(df['AL.iter'].isna()) & (df['method'] == 'ours') & (df['PL'].str.contains('soft.'))]
    df_hard_PL = df[(df['AL.iter'].isna()) & (df['method'] == 'ours') & (df['PL'].str.contains('hard.'))]
    df_basic_AL = df[(df['AL.iter']>0) & (df['epochs']==15)]
    
    return { # return a dictionary of results per model
        'baseline-finetuned': df_baseline, 'baseline-not-finetuned' : df_no_finetune, 's-clip': df_S_CLIP, 'soft-pl': df_soft_PL, 
        'hard-pl': df_hard_PL, 'basic-al': df_basic_AL, 'probvlm': None 
    }
    

In [3]:
validation_path = './eval.txt'
df_grouped = prep_and_store_results(validation_path)

  model_names = np.array(model_names_list)
  model_vals = np.array(model_val_list)


Number of evaluations: 50081


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
df_results = df_grouped.reset_index()
df_results['AL.epochs'] = df_results['AL.epochs'].fillna(df_results['epochs'])
df_results = df_results.sort_values(['metric', 'dataset', ('value', 'mean')])
display(df_results)

In [None]:
# Get the results, split per model (e.g. get results of baseline, s-clip, etc.)
results_dict = get_results_per_model(df_results)
df_baseline = results_dict['baseline-finetuned']
df_basic_AL = results_dict['basic-al']
df_S_CLIP = results_dict['s-clip']
df_soft_PL = results_dict['soft-pl']
df_hard_PL = results_dict['hard-pl']

In [None]:
# Results with basic active learning
full_display(df_basic_AL.groupby(['metric', 'dataset']).tail(3)) #

In [None]:
full_display(df_baseline.groupby(['metric', 'dataset']).tail(3))

In [None]:
full_display(df_S_CLIP.groupby(['metric', 'dataset']).tail(3))

In [None]:
full_display(df_soft_PL.groupby(['metric', 'dataset']).tail(3))

In [None]:
full_display(df_hard_PL.groupby(['metric', 'dataset']).tail(3))

# Test split evaluation results

In [None]:
test_path = './test_eval.txt'
df_grouped = prep_and_store_results(test_path)

In [None]:
df_results = df_grouped.reset_index()
df_results['AL.epochs'] = df_results['AL.epochs'].fillna(df_results['epochs'])
df_results = df_results.sort_values(['metric', 'dataset', ('value', 'mean')])
display(df_results)

In [None]:
# Get the results, split per model (e.g. get results of baseline, s-clip, etc.)
results_dict = get_results_per_model(df_results, False)
df_no_finetune = results_dict['baseline-not-finetuned']
df_baseline = results_dict['baseline-finetuned']
df_basic_AL = results_dict['basic-al']
df_S_CLIP = results_dict['s-clip']
df_soft_PL = results_dict['soft-pl']
df_hard_PL = results_dict['hard-pl']

In [None]:
display(df_no_finetune)

In [None]:
df_baseline = df_baseline[(df_baseline['epochs']==25)&(df_baseline['bs']==64)&(df_baseline['lr']==5e-5)]
full_display(df_baseline) # .groupby(['metric', 'dataset'])

In [None]:
def performance_per_label_ratio(df, metric, dataset):
    df_filtered = df[(df['metric'] == metric) & (df['dataset'] == dataset)]
    # Ensure the order is from the smallest label ratio to the largest
    df_filtered = df_filtered.sort_values(by='ratio')
    display(df_filtered)
    # Get the performance mean for the metric
    performance = df_filtered[('value', 'mean')].to_numpy()
    label_ratios = df_filtered['ratio'].to_numpy()
    return performance, label_ratios

In [None]:
import matplotlib.pyplot as plt
def plot_model_comparison(results_dict, metric, dataset):
    epochs = {'baseline-finetuned': 25, 'basic-al': 15, 's-clip': 25, 'soft-pl': 30, 
              'hard-pl': 25,  'baseline-not-finetuned': 0}
    # Get the performance of each model, for the given metric and dataset
    for model in results_dict:
        model_results = results_dict[model]
        # If we have any results for the given model, add it to the plot
        if model_results is not None and model_results.shape[0] > 0: 
            # Filter for correct number of epochs
            model_results = model_results[model_results['epochs'] == epochs[model]]
            performance, label_ratios = performance_per_label_ratio(model_results, metric, dataset)
            if model == 'baseline-not-finetuned': # repeat static baseline performance 5 times (once for each label ratio)
                performance = 5 * [performance] 
            plt.plot(performance, label = model)
    # Add information about the dataset, metric and label ratios to the plot
    plt.title(f'{metric} (dataset: {dataset})')
    plt.xticks(np.arange(label_ratios.shape[0]), label_ratios)
    ylabel = 'recall' if 'R@' in metric else 'accuracy'
    plt.ylabel(ylabel)
    plt.xlabel('Label ratio')
    plt.legend()
    plt.show()


In [None]:
metric = 'image_to_text_R@1'
datasets = ['RSICD', 'UCM', 'Sydney']
plot_model_comparison(results_dict, metric, 'RSICD')

In [None]:
plot_model_comparison(results_dict, metric, 'UCM')

In [None]:
plot_model_comparison(results_dict, metric, 'Sydney')

In [None]:
metric = 'zeroshot-val-top1'
datasets = ["RSICD-CLS", "UCM-CLS", "WHU-RS19", "RSSCN7", "AID", "RESISC45"]
for dataset in datasets:
    print(dataset, metric)
    plot_model_comparison(results_dict, metric, dataset)