# Export training and eval data

This creates the `wandb-runsXsweepsXeval.csv` file, which contains all the sweep and run metadata merged with the post-training evaluations, i.e., the results of running each run's best model on the validation and test sets.
During the original experiments, there were naming collisions because the run-name generation did not have enough entropy.
Much of this file deals with the problem of ensuring no name collisions cause results from one model to be attributed to another.
In a small number of cases, no positive identification could be made, and so those training runs are omitted both from `wandb-runsXsweepsXeval.csv` and the reported results.

In [122]:
import pandas as pd

In [None]:
# Load the W&B run and sweep exports and join every run to its sweep
# (runs.sweep_id == sweeps.id). pd.merge defaults to an inner join, so runs
# without a matching sweep row are dropped. Columns present in both tables
# get the suffixes '_run' / '_sweep'.
runs = pd.read_csv('analysis/wandb-runs.csv')
print(runs.shape)
sweeps = pd.read_csv('analysis/wandb-sweeps.csv')
print(sweeps.shape)
runsXsweeps = pd.merge(runs, sweeps, left_on='sweep_id', right_on='id', suffixes=('_run', '_sweep'))
runsXsweeps

In [124]:
# exclude the sweeps that were optimized on the test set
test_set_sweeps = ['w21pwzds', 'ta5rm3fd', 'a42hnw5d', '2p72lgel']
# Take an explicit copy: later cells add columns to runsXsweeps, and writing
# into a boolean-filtered slice triggers pandas' chained-assignment
# (SettingWithCopy) warning and is not guaranteed to behave consistently.
runsXsweeps = runsXsweeps[~runsXsweeps['sweep_id'].isin(test_set_sweeps)].copy()
runsXsweeps.shape

(11070, 225)

In [None]:
import ast
# 'config_command' holds the launch command as a Python-literal list; element 5
# is a '/'-separated path whose second component is 'short' for short runs.
# NOTE(review): the index 5 is positional -- confirm it holds for every sweep.
is_short = (runsXsweeps['config_command'].apply(ast.literal_eval).apply(lambda x: x[5]).str.split('/').str[1] == 'short')

# Training length label derived from the flag above.
runsXsweeps['len'] = is_short.apply(lambda x: 'short' if x else 'long')
# Model name: text before the first ':' in config_name, then before the first '-'.
runsXsweeps['model'] = runsXsweeps['config_name'].str.split(':').str[0].str.split('-').str[0]
# Optimization target: 'accproto' if config_name contains that substring, else 'accuracy'.
# NOTE(review): a missing config_name yields NaN from str.contains, which is truthy
# and would be labelled 'accproto' -- presumably config_name is never missing.
runsXsweeps['optimization'] = runsXsweeps['config_name'].str.contains('accproto').apply(lambda x: 'accproto' if x else 'accuracy')
# Run counts per (model, optimization, len).
runsXsweeps.groupby(['model', 'optimization', 'len']).count()[['name_run']]

In [126]:
# Run counts for the accproto-optimized sweeps: rows are (backbone, activation_function),
# columns are (model, dataset) after unstacking index levels 0 and 1.
runsXsweeps[runsXsweeps['optimization'] == 'accproto'].groupby(['model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']].unstack([0, 1])

Unnamed: 0_level_0,Unnamed: 1_level_0,name_run
Unnamed: 0_level_1,model,vanilla
Unnamed: 0_level_2,dataset,cub200
backbone,activation_function,Unnamed: 2_level_3
densenet161,cosine,158
densenet161,l2,123
resnet50[pretraining=inaturalist],cosine,188
resnet50[pretraining=inaturalist],l2,164
vgg19,cosine,177
vgg19,l2,159


In [127]:
# Same count table for accuracy-optimized long runs; NaN marks combinations with no runs.
runsXsweeps[(runsXsweeps['optimization'] == 'accuracy') & (runsXsweeps['len'] == 'long')].groupby(['model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']].unstack([0, 1])

Unnamed: 0_level_0,Unnamed: 1_level_0,name_run,name_run,name_run,name_run
Unnamed: 0_level_1,model,vanilla,vanilla,vanilla,vanilla
Unnamed: 0_level_2,dataset,cars_cropped,cub200,cub200_cropped,dogs
backbone,activation_function,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
densenet161,cosine,134.0,153.0,190.0,90.0
densenet161,l2,81.0,104.0,132.0,86.0
resnet50,cosine,132.0,179.0,259.0,120.0
resnet50,l2,130.0,143.0,213.0,113.0
resnet50[pretraining=inaturalist],cosine,,195.0,314.0,
resnet50[pretraining=inaturalist],l2,,173.0,239.0,
vgg19,cosine,133.0,169.0,215.0,121.0
vgg19,l2,89.0,145.0,199.0,121.0


In [128]:
# Count table for accuracy-optimized short runs; the row levels are swapped to
# (activation_function, backbone) and sorted for readability.
runsXsweeps[(runsXsweeps['optimization'] == 'accuracy') & (runsXsweeps['len'] == 'short')].groupby(['model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']].unstack([0, 1]).swaplevel(0,1).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,name_run,name_run,name_run,name_run,name_run,name_run,name_run,name_run,name_run,name_run,name_run,name_run
Unnamed: 0_level_1,model,deformable,deformable,prototree,prototree,st,st,tesnet,tesnet,vanilla,vanilla,vanilla,vanilla
Unnamed: 0_level_2,dataset,cub200,dogs,cub200,dogs,cub200,dogs,cub200,dogs,cars_cropped,cub200,cub200_cropped,dogs
activation_function,backbone,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
cosine,densenet161,167.0,81.0,27.0,15.0,99.0,50.0,88.0,68.0,61.0,165.0,98.0,50.0
cosine,resnet50,,167.0,,68.0,,76.0,,76.0,,,97.0,87.0
cosine,resnet50[pretraining=inaturalist],139.0,,41.0,,135.0,,125.0,,77.0,238.0,,
cosine,vgg19,106.0,114.0,51.0,31.0,64.0,61.0,75.0,101.0,55.0,147.0,103.0,71.0
exp_l2,densenet161,,,26.0,14.0,,,,,,,,
exp_l2,resnet50,,,,23.0,,,,,,,,
exp_l2,resnet50[pretraining=inaturalist],,,144.0,,,,,,,,,
exp_l2,vgg19,,,47.0,22.0,,,,,,,,
l2,densenet161,136.0,58.0,,,57.0,46.0,62.0,31.0,42.0,44.0,58.0,64.0
l2,resnet50,,135.0,,,,59.0,,45.0,,,58.0,68.0


# Load Eval Data

In [None]:
import os
import glob
import pandas as pd

def load_results(directory_path):
    """Load and concatenate every ``*-results.pkl`` DataFrame in a directory.

    Each pickle is read with ``pd.read_pickle``, its index is moved into
    regular column(s) via ``reset_index()``, and a ``source_file`` column
    holding the file's base name is added. Files that fail to load are
    reported on stdout and skipped (best effort).

    Files are processed in sorted path order so that the row order of the
    combined frame is reproducible; ``glob.glob`` itself returns matches in
    arbitrary, filesystem-dependent order.

    NOTE: unpickling can execute arbitrary code -- only point this at
    result files you produced yourself.

    Args:
        directory_path: Directory that contains the ``*-results.pkl`` files.

    Returns:
        The concatenated DataFrame (fresh 0..n-1 index), or ``None`` when no
        file matches or none could be loaded.
    """
    # Sort: glob order is arbitrary and would make the output order machine-dependent.
    pkl_files = sorted(glob.glob(os.path.join(directory_path, "*-results.pkl")))

    if not pkl_files:
        print("No matching .pkl files found in the specified directory.")
        return None

    print(f"Found {len(pkl_files)} .pkl files to process.")

    all_dataframes = []

    for file_path in pkl_files:
        file_name = os.path.basename(file_path)
        try:
            # Load the DataFrame from the pickle file
            df = pd.read_pickle(file_path).reset_index()
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the whole load.
            print(f"Error loading {file_name}: {str(e)}")
            continue

        # Remember which file (and therefore which sweep) each row came from
        df['source_file'] = file_name
        all_dataframes.append(df)
        print(f"Loaded: {file_name} with {len(df)} rows")

    if not all_dataframes:
        print("No DataFrames were successfully loaded.")
        return None

    # Concatenate all dataframes
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    print(f"\nSuccessfully concatenated {len(all_dataframes)} DataFrames.")
    print(f"Combined DataFrame shape: {combined_df.shape}")

    return combined_df

# Load every per-sweep eval result file; `results` is None if nothing could be loaded.
results = load_results('analysis/eval/data')
results

In [None]:
# The sweep id is the file name with its "-results.pkl" suffix removed.
# The dot is escaped and the pattern anchored so that only the literal suffix
# matches; expand=False returns a Series rather than a one-column DataFrame.
results['source_sweep'] = results['source_file'].str.extract(r'^(.+)-results\.pkl$', expand=False)
results[['source_file', 'source_sweep']]

In [None]:
# Which accproto-optimized sweeps have eval results at all?
results[results['source_sweep'].isin(runsXsweeps[runsXsweeps['optimization'] == 'accproto']['sweep_id'])]['source_sweep'].unique()

In [None]:
# Eval rows with a recorded error:
# these are model overwrites loaded for the wrong dataset
results[~results['error'].isna()]

In [None]:
# Keep only eval rows without an error and rename columns so they line up with
# the W&B tables: source_sweep -> sweep_id, model -> run_id.
error_free_results = results[results['error'].isna()].rename(columns={'source_sweep': 'sweep_id', 'model': 'run_id'})
error_free_results

In [None]:
# Number of error-free eval rows per accproto-optimized sweep.
error_free_results[error_free_results['sweep_id'].isin(runsXsweeps[runsXsweeps['optimization'] == 'accproto']['sweep_id'])].groupby(['sweep_id']).count()[['run_id']]

In [None]:
# Attach eval results to each run, matching on (run name, sweep id).
# Join keys are passed as lists, the documented form for multi-column joins
# (a tuple can also denote a single tuple-valued column label).
# Inner join (the default): runs without an error-free eval row are dropped, and
# because run names are not unique within a sweep, one run can match several
# eval rows here -- those duplicates are resolved further below.
runsXsweepsXeval = pd.merge(
    runsXsweeps,
    error_free_results,
    left_on=['name_run', 'sweep_id'],
    right_on=['run_id', 'sweep_id'],
    suffixes=('_wandb', '_eval'),
)
runsXsweepsXeval

In [None]:
# Rows per configuration for accproto-optimized runs after merging with the eval data.
runsXsweepsXeval[runsXsweepsXeval['optimization'] == 'accproto'].groupby(['model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']]

In [None]:
# Rows per configuration for prototree runs after merging with the eval data.
runsXsweepsXeval[runsXsweepsXeval['model'] == 'prototree'].groupby(['model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']]

In [None]:
# Histogram of multiplicities: how many run names occur once, twice, ... in the merged table.
runsXsweepsXeval['name_run'].value_counts().value_counts()

In [139]:
# Histogram of multiplicities for best_model: a value > 1 means several merged
# rows point at the same saved model.
runsXsweepsXeval['best_model'].value_counts().value_counts()
# even though we have mostly positive identifications
# there are a number of models where name collisions occur, which means we need to
# disambiguate them (done in the sections that follow).

count
1    8128
2     354
3      29
4      12
5       3
9       1
6       1
Name: count, dtype: int64

## Match based on validation scores to remove duplicates

In [None]:
# correct errors for clashes that ended up jumping backbones

import os
import glob
import pandas as pd

def load_backbone_map(directory_path):
    """Load and concatenate every headerless ``*-backbones.csv`` file in a directory.

    Each file must have exactly five columns, which are named ``model_file``,
    ``arch_text``, ``backbone_text``, ``af_text`` and
    ``class_indicator_number``; a ``source_file`` column with the file's base
    name is added. Files that fail to load (including files with a different
    column count) are reported on stdout and skipped (best effort).

    Files are processed in sorted path order. ``glob.glob`` returns matches in
    arbitrary, filesystem-dependent order, which would otherwise make any
    later first-occurrence de-duplication on ``model_file`` non-reproducible.

    Args:
        directory_path: Directory that contains the ``*-backbones.csv`` files.

    Returns:
        The concatenated DataFrame (fresh 0..n-1 index), or ``None`` when no
        file matches or none could be loaded.
    """
    # Sort: glob order is arbitrary and would make the output order machine-dependent.
    csv_files = sorted(glob.glob(os.path.join(directory_path, "*-backbones.csv")))

    if not csv_files:
        print("No matching .csv files found in the specified directory.")
        return None

    print(f"Found {len(csv_files)} .csv files to process.")

    all_dataframes = []

    for file_path in csv_files:
        file_name = os.path.basename(file_path)
        try:
            # Load the DataFrame from the headerless CSV file; assigning the
            # column names raises (and the file is skipped) on a column-count mismatch.
            df = pd.read_csv(file_path, header=None)
            df.columns = ['model_file', 'arch_text', 'backbone_text', 'af_text', 'class_indicator_number']
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the whole load.
            print(f"Error loading {file_name}: {str(e)}")
            continue

        # Remember which file each row came from
        df['source_file'] = file_name
        all_dataframes.append(df)
        print(f"Loaded: {file_name} with {len(df)} rows")

    if not all_dataframes:
        print("No DataFrames were successfully loaded.")
        return None

    # Concatenate all dataframes
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    print(f"\nSuccessfully concatenated {len(all_dataframes)} DataFrames.")
    print(f"Combined DataFrame shape: {combined_df.shape}")

    return combined_df

# NOTE: this rebinds `results`, which held the eval results in the cells above.
results = load_backbone_map('analysis/eval/model-backbones')
# Keep one row per model file (drop_duplicates keeps the first occurrence).
results = results.drop_duplicates('model_file')
results
# error files are prototree sweeps that were relaunched

In [141]:
def recovered_backbone(txt):
    """Map a backbone description string to its canonical backbone name.

    The match is a case-insensitive substring test: 'vgg' -> 'vgg19',
    'resnet' -> 'resnet50', 'densenet' -> 'densenet161'.

    Args:
        txt: Backbone description text; may be a non-string (e.g. NaN from an
            empty CSV field).

    Returns:
        The canonical backbone name, or ``None`` when ``txt`` is not a string
        or matches none of the known backbones.
    """
    # An empty CSV field is read as float NaN; treat it as "unknown" instead of crashing.
    if not isinstance(txt, str):
        return None
    lowered = txt.lower()
    if 'vgg' in lowered:
        return 'vgg19'
    if 'resnet' in lowered:
        return 'resnet50'
    if 'densenet' in lowered:
        return 'densenet161'
    return None
    
def recovered_arch(txt):
    """Map a model class description to the architecture label used in this analysis.

    The match is a case-sensitive substring test. 'VanillaProtoPNet' maps to the
    combined label 'vanilla|tesnet' because the two cannot be told apart from
    the class name alone.

    Args:
        txt: Architecture description text; may be a non-string (e.g. NaN from
            an empty CSV field).

    Returns:
        One of 'vanilla|tesnet', 'deformable', 'prototree', 'st', or ``None``
        when ``txt`` is not a string or matches no known architecture.
    """
    # An empty CSV field is read as float NaN; `in` on a float would raise TypeError.
    if not isinstance(txt, str):
        return None
    if 'VanillaProtoPNet' in txt:
        return 'vanilla|tesnet'
    if 'DeformableProtoPNet' in txt:
        return 'deformable'
    if 'ProtoTree' in txt:
        return 'prototree'
    if 'STProtoPNet' in txt:
        return 'st'
    return None
    
def recover_af(txt):
    """Translate an activation class name into its short activation-function label.

    Only exact matches are recognised; any other value yields ``None``.
    """
    labels = {
        'L2Activation': 'l2',
        'CosPrototypeActivation': 'cosine',
        'ExpL2Activation': 'exp_l2',
    }
    return labels.get(txt)

# note that this does not make the pretraining clear
# (every resnet variant is recovered as plain 'resnet50')
results['backbone_recovered'] = results['backbone_text'].apply(recovered_backbone)
results['arch_recovered'] = results['arch_text'].apply(recovered_arch)
results['af_recovered'] = results['af_text'].apply(recover_af)

modeled_results = results.copy()

# 'vanilla|tesnet' rows are ambiguous: add one copy labelled 'tesnet' and one
# labelled 'vanilla' for each, so either label can be matched later on.
for model in ['tesnet', 'vanilla']:
    model_res = results[results['arch_recovered'] == 'vanilla|tesnet'].copy()
    model_res['arch_recovered'] = model
    modeled_results = pd.concat([modeled_results, model_res])

# Drop the original ambiguous rows; only their two labelled copies remain.
modeled_results = modeled_results[modeled_results['arch_recovered'] != 'vanilla|tesnet']
modeled_results[['arch_recovered', 'backbone_recovered', 'af_recovered', 'class_indicator_number']].value_counts(dropna=False)

arch_recovered  backbone_recovered  af_recovered  class_indicator_number
vanilla         resnet50            cosine        200                       1220
tesnet          resnet50            cosine        200                       1220
vanilla         resnet50            l2            200                        903
tesnet          resnet50            l2            200                        903
                vgg19               cosine        200                        782
                                                                            ... 
prototree       resnet50            exp_l2        202                          1
                                                  128                          1
                                                  127                          1
                                                  126                          1
vanilla         NaN                 cosine        10                           1
Name: count, Length: 139, dtype: int

In [146]:
# because of pruning, the prototree numbers are unreliable, but that's fine, because there aren't any name collisions on prototree
modeled_results[['class_indicator_number']].value_counts()

class_indicator_number
200                       10737
120                        2949
196                        1534
199                         138
201                          10
124                           9
126                           9
122                           8
121                           8
123                           8
125                           6
131                           6
202                           5
128                           3
130                           3
133                           2
149                           2
138                           2
10                            2
127                           2
132                           1
188                           1
274                           1
39                            1
54                            1
67                            1
92                            1
119                           1
186                           1
31                            1
163              

In [147]:
# Rows per recovered architecture; tesnet and vanilla are equal by construction
# (each ambiguous row was added once under each label).
modeled_results['arch_recovered'].value_counts(dropna=False)

arch_recovered
tesnet        6628
vanilla       6628
deformable    1095
st             818
prototree      290
Name: count, dtype: int64

In [None]:
# Attach the properties recovered from the saved model files to every merged row
# (best_model == model_file). Left join: rows without a backbone record are kept
# with NaN in the recovered columns. Model files listed under both 'tesnet' and
# 'vanilla' in modeled_results match twice, so such rows appear twice here.
runsXsweepsXevalXbackbone = runsXsweepsXeval.merge(modeled_results, left_on='best_model', right_on='model_file', suffixes=('', '_recovered'), how='left')
runsXsweepsXevalXbackbone

In [164]:
def backbone_equal(row):
    """Tell whether the configured backbone matches the one recovered from the model file.

    Any bracketed suffix of the configured name (e.g. ``[pretraining=...]``) is
    ignored. Yields ``pd.NA`` when either value is missing.
    """
    declared = row['backbone']
    recovered = row['backbone_recovered']
    if pd.isna(declared) or pd.isna(recovered):
        return pd.NA
    family, _, _ = declared.partition('[')
    return family == recovered

def arch_equal(row):
    """Tell whether the configured model matches the recovered architecture.

    Yields ``pd.NA`` when either value is missing.
    """
    declared = row['model']
    recovered = row['arch_recovered']
    either_missing = pd.isna(declared) or pd.isna(recovered)
    return pd.NA if either_missing else declared == recovered

def af_equal(row):
    """Tell whether the configured activation function matches the recovered one.

    Yields ``pd.NA`` when either value is missing.
    """
    declared = row['activation_function']
    recovered = row['af_recovered']
    either_missing = pd.isna(declared) or pd.isna(recovered)
    return pd.NA if either_missing else declared == recovered

# Class count -> dataset family.
# NOTE(review): not referenced by the code visible in this notebook section;
# dataset_from_classes hard-codes the same three class counts.
class_name_map = {
    200: 'cub200',
    120: 'dogs',
    196: 'cars',
}

def dataset_from_classes(row):
    """Check that the run's dataset is consistent with the model's class count.

    Prototree rows always pass. For other models, 200 classes must pair with
    cub200 / cub200_cropped, 120 with dogs and 196 with cars_cropped. Any other
    class count yields ``None``.
    """
    if row['model'] == 'prototree':
        return True

    n_classes = row['class_indicator_number']
    if n_classes == 200:
        return row['dataset'] in ['cub200', 'cub200_cropped']
    if n_classes == 120:
        return row['dataset'] == 'dogs'
    if n_classes == 196:
        return row['dataset'] == 'cars_cropped'
    return None


# Row-wise consistency flags between the W&B configuration and what was
# recovered from the saved model file.
runsXsweepsXevalXbackbone['backbone_equal'] = runsXsweepsXevalXbackbone.apply(backbone_equal, axis=1)
runsXsweepsXevalXbackbone['arch_equal'] = runsXsweepsXevalXbackbone.apply(arch_equal, axis=1)
runsXsweepsXevalXbackbone['af_equal'] = runsXsweepsXevalXbackbone.apply(af_equal, axis=1)
runsXsweepsXevalXbackbone['dataset_equal'] = runsXsweepsXevalXbackbone.apply(dataset_from_classes, axis=1)
# Row counts per flag combination (groupby drops rows whose flag is missing).
runsXsweepsXevalXbackbone.groupby(['arch_equal', 'backbone_equal', 'af_equal', 'dataset_equal']).count()[['name_run']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,name_run
arch_equal,backbone_equal,af_equal,dataset_equal,Unnamed: 4_level_1
False,False,False,True,46
False,False,True,True,197
False,True,False,True,14
False,True,True,True,6452
True,False,False,True,35
True,False,True,True,191
True,True,False,True,8
True,True,True,True,8671


In [165]:
# Keep only rows where all four consistency checks hold. The .copy() makes
# good_results an independent frame that columns can be added to later.
# NOTE(review): the flag columns can hold pd.NA / None -- confirm the mask
# treats those rows as excluded on the pandas version in use.
good_results = runsXsweepsXevalXbackbone[
    (runsXsweepsXevalXbackbone['arch_equal']) &
    (runsXsweepsXevalXbackbone['backbone_equal'])&
    (runsXsweepsXevalXbackbone['af_equal'])&
    (runsXsweepsXevalXbackbone['dataset_equal'])].copy()
# Histogram of how many retained rows share one best_model.
good_results['best_model'].value_counts().value_counts()
# still a number of models with duplicate identifications

count
1    8354
2     125
3      14
4       5
5       1
Name: count, dtype: int64

In [166]:
# Sanity check: model_path and best_model agree on every retained row.
(good_results['model_path'] == good_results['best_model']).value_counts()

True    8671
Name: count, dtype: int64

In [167]:
# Sanity check: model_path and the backbone map's model_file agree on every retained row.
(good_results['model_path'] == good_results['model_file']).value_counts()

True    8671
Name: count, dtype: int64

In [168]:
# Retained rows per (model, optimization, len) after the consistency filter.
good_results.groupby(['model', 'optimization', 'len']).count()[['name_run']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name_run
model,optimization,len,Unnamed: 3_level_1
deformable,accuracy,short,1117
prototree,accuracy,short,290
st,accuracy,short,824
tesnet,accuracy,short,713
vanilla,accproto,long,898
vanilla,accuracy,long,3096
vanilla,accuracy,short,1733


#### Filter based on accuracy

The only possible outstanding error is mixing up a tesnet model with a vanilla model or a model trained with the wrong resnet pretraining.
There's no way to distinguish them from the outside.
However, if there's only one model that meets all the criteria, then that must be the original model. Other than that, none of our results will be wrong if we don't allow any models with better accuracy on validation than in training to slip through.
There is, unfortunately, some rounding error in the accuracies recorded during training, so we have to have a little margin.

In [None]:
# Largest allowed |training-time score - re-evaluated score| for a candidate run
# to be accepted as the owner of a saved model (absorbs the rounding error in
# the scores recorded during training).
ACC_MATCH_TOLERANCE = 0.005


def _closest_candidates(candidates, diff_column, tolerance=ACC_MATCH_TOLERANCE):
    """Pick, for each ``model_path``, the candidate row(s) that own the model.

    Args:
        candidates: Rows to choose from; must contain ``model_path`` and
            ``diff_column``.
        diff_column: Column holding the absolute score difference.
        tolerance: Maximum difference (inclusive) for accepting a candidate
            when several rows compete for the same model.

    Returns:
        A list of DataFrames, one per accepted ``model_path`` (in sorted
        ``model_path`` order). A sole candidate is always accepted; otherwise
        the row(s) with the minimum difference are accepted if that minimum
        is within ``tolerance``. Exact ties keep every tied row.
    """
    kept = []
    for _, group in candidates.groupby('model_path'):
        if len(group) == 1:
            # A sole candidate must be the original run, even when its diff is
            # NaN (an equality test against NaN would silently drop it).
            kept.append(group)
            continue
        best_diff = group[diff_column].min()
        if best_diff <= tolerance:
            kept.append(group[group[diff_column] == best_diff])
    return kept


good_results['eval_acc_diff'] = (good_results['best[prototypes_embedded]/eval/accuracy'] - good_results['val.accuracy']).abs()
good_results['eval_acc_proto_diff'] = (good_results['best[prototypes_embedded]/eval/acc_proto_score'] - good_results['val.acc_proto_score']).abs()

# Each run is matched on the metric its sweep optimized. Both metrics use the
# same inclusive tolerance (previously `<=` for accuracy but `<` for accproto).
selected = (
    _closest_candidates(good_results[good_results['optimization'] == 'accuracy'], 'eval_acc_diff')
    + _closest_candidates(good_results[good_results['optimization'] == 'accproto'], 'eval_acc_proto_diff')
)

# Concatenate once instead of growing a DataFrame inside the loop (quadratic,
# and concatenating onto an empty frame has deprecated dtype semantics).
closest_matches = pd.concat(selected) if selected else pd.DataFrame()
closest_df = closest_matches
closest_df

In [172]:
# Rows per (model, optimization, len) after the closest-match selection.
closest_df.groupby(['model', 'optimization', 'len']).count()[['name_run']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name_run
model,optimization,len,Unnamed: 3_level_1
deformable,accuracy,short,1072
prototree,accuracy,short,290
st,accuracy,short,797
tesnet,accuracy,short,683
vanilla,accproto,long,891
vanilla,accuracy,long,2975
vanilla,accuracy,short,1692


In [173]:
# The same counts broken down by dataset, backbone and activation function.
closest_df.groupby(['optimization', 'len', 'model', 'dataset', 'backbone', 'activation_function']).count()[['name_run']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,name_run
optimization,len,model,dataset,backbone,activation_function,Unnamed: 6_level_1
accproto,long,vanilla,cub200,densenet161,cosine,135
accproto,long,vanilla,cub200,densenet161,l2,120
accproto,long,vanilla,cub200,resnet50[pretraining=inaturalist],cosine,184
accproto,long,vanilla,cub200,resnet50[pretraining=inaturalist],l2,142
accproto,long,vanilla,cub200,vgg19,cosine,175
...,...,...,...,...,...,...
accuracy,short,vanilla,dogs,densenet161,l2,41
accuracy,short,vanilla,dogs,resnet50,cosine,85
accuracy,short,vanilla,dogs,resnet50,l2,43
accuracy,short,vanilla,dogs,vgg19,cosine,71


## Export

In [163]:
# Write the selected rows to the final CSV; index=False leaves the DataFrame index out of the file.
closest_df.to_csv('analysis/wandb-runsXsweepsXeval.csv', index=False)