In [2]:
# Common code block with reused functions and imports
import os
import warnings

import pandas as pd
from tabulate import tabulate

# Root folders of the three test runs. create_dataframe() labels them as:
#   test_1 -> zero-shot (labels)
#   test_2 -> zero-shot (descriptions)
#   test_3 -> few-shot (labels)
test_1_path, test_2_path, test_3_path = 'test_1', 'test_2', 'test_3'

def get_accuracy(folder_path, dataset_name):
    """Collect the accuracy of every model evaluated on one dataset.

    Looks in ``<folder_path>/<dataset_name>/`` for one sub-folder per model and
    reads ``summary_metrics.csv`` from each. The accuracy is taken from the
    ``Accuracy`` column of the first row whose ``Model`` column equals the
    sub-folder name.

    Args:
        folder_path: Root directory of one test run.
        dataset_name: Name of the dataset sub-folder inside ``folder_path``.

    Returns:
        dict mapping model folder name -> accuracy value exactly as stored in
        the CSV. The dict is empty when the dataset folder does not exist.
        Model folders without a usable summary are left out: a missing
        ``summary_metrics.csv`` is skipped silently, while a summary lacking
        the required columns or a matching row is skipped with a warning.
    """
    dataset_dir = os.path.join(folder_path, dataset_name)
    accuracy_dict = {}

    if not os.path.isdir(dataset_dir):
        return accuracy_dict

    # Sort so the result does not depend on the arbitrary order of os.listdir.
    for model_folder in sorted(os.listdir(dataset_dir)):
        summary_file = os.path.join(dataset_dir, model_folder, 'summary_metrics.csv')
        if not os.path.isfile(summary_file):
            continue

        summary = pd.read_csv(summary_file)
        if 'Model' not in summary.columns or 'Accuracy' not in summary.columns:
            warnings.warn(
                f"{summary_file}: missing 'Model' or 'Accuracy' column; skipping",
                stacklevel=2,
            )
            continue

        matches = summary.loc[summary['Model'] == model_folder, 'Accuracy']
        if matches.empty:
            # Previously this case raised IndexError and aborted the whole cell.
            warnings.warn(
                f"{summary_file}: no row with Model == {model_folder!r}; skipping",
                stacklevel=2,
            )
            continue

        accuracy_dict[model_folder] = matches.iloc[0]

    return accuracy_dict

def create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy):
    """Assemble the per-model accuracy table for one dataset.

    Rows are the models present in ``test_1_accuracy``; a model that appears
    only in the other two dictionaries is not included. A model missing from
    ``test_2_accuracy`` or ``test_3_accuracy`` gets an empty string in that
    column.

    Row order: alphabetical, then 'gpt-4o-mini' and 'gpt-4o' are moved to the
    bottom (in that order) and 'clip-vit-base-patch32' is moved to the top.

    Returns:
        DataFrame indexed by 'Model' with three columns: zero-shot (labels)
        from test 1, few-shot (labels) from test 3 and zero-shot
        (descriptions) from test 2.
    """
    model_names = list(test_1_accuracy)

    table = (
        pd.DataFrame({
            'Model': model_names,
            'Test 1': [test_1_accuracy[name] for name in model_names],
            'Test 1 (few-shot)': [test_3_accuracy.get(name, '') for name in model_names],
            'Test 2': [test_2_accuracy.get(name, '') for name in model_names],
        })
        .set_index('Model')
        .sort_index()
    )

    # Work out the final row order on a plain list, then reindex once.
    row_order = list(table.index)
    for trailing in ('gpt-4o-mini', 'gpt-4o'):
        if trailing in row_order:
            row_order.remove(trailing)
            row_order.append(trailing)

    leading = 'clip-vit-base-patch32'
    if leading in row_order:
        row_order.remove(leading)
        row_order.insert(0, leading)

    if row_order != list(table.index):
        table = table.reindex(row_order)

    # Column headers; the few-shot header links to the dataset's README.
    table.columns = [
        'zero-shot (labels)',
        f'few-shot[*](dataset/{dataset_name}-data/few-shot/README.md) (labels)',
        'zero-shot (descriptions)',
    ]

    return table

def calculate_baseline(dataset_name, dataset_root='dataset'):
    """Compute the overall baseline accuracy from a dataset's ``baseline.csv``.

    The file is read from ``<dataset_root>/<dataset_name>-data/baseline.csv``.
    It must contain the columns 'Class Name', 'Recall' and '# Test Images'.
    The row whose 'Class Name' is 'Mean' is ignored. Per-class true positives
    are reconstructed as ``recall * # Test Images`` and the accuracy is the
    sum of true positives divided by the total number of test images.

    Args:
        dataset_name: Dataset name used to build the ``<name>-data`` folder.
        dataset_root: Directory that contains the ``<name>-data`` folders.

    Returns:
        The accuracy as a float in [0, 1], or None when the baseline file does
        not exist or lists no test images.
    """
    baseline_path = os.path.join(dataset_root, f'{dataset_name}-data', 'baseline.csv')
    if not os.path.exists(baseline_path):
        return None

    baseline_df = pd.read_csv(baseline_path)
    per_class = baseline_df[baseline_df['Class Name'] != 'Mean']

    # read_csv parses a column as numeric when no value carries a '%' sign or
    # a thousands separator, and the .str accessor then raises AttributeError.
    # Casting to str first makes both layouts work. Recall values are treated
    # as percentages either way.
    recall = per_class['Recall'].astype(str).str.rstrip('%').astype('float') / 100.0
    test_images = (
        per_class['# Test Images']
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype('int')
    )

    total_test_images = test_images.sum()
    if total_test_images == 0:
        # Nothing to average over; avoid a NaN from 0 / 0.
        return None

    # TP = recall * (TP + FN) = recall * # Test Images
    total_true_positives = (test_images * recall).sum()

    # accuracy = sum(TP) / sum(# Test Images)
    return float(total_true_positives / total_test_images)

In [3]:
# Dataset: ArtDL
dataset_name = "ArtDL"

# One accuracy dictionary per test run, keyed by model name
test_1_accuracy, test_2_accuracy, test_3_accuracy = (
    get_accuracy(run_path, dataset_name)
    for run_path in (test_1_path, test_2_path, test_3_path)
)

# Build the results table
df = create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy)

# Append a 'Baseline' row (first column only) when a baseline file exists
baseline_accuracy = calculate_baseline(dataset_name)
if baseline_accuracy is not None:
    df.loc['Baseline'] = [f"{baseline_accuracy:.2%}", "", ""]

# Emit a Markdown heading followed by the table and two blank lines
print(f"## {dataset_name} Results")
print(df.to_markdown(), end="\n\n\n")

## ArtDL Results
| Model                     | zero-shot (labels)   | few-shot[*](dataset/ArtDL-data/few-shot/README.md) (labels)   | zero-shot (descriptions)   |
|:--------------------------|:---------------------|:--------------------------------------------------------------|:---------------------------|
| clip-vit-base-patch32     | 16.15%               | 21.41%                                                        | 31.55%                     |
| clip-vit-base-patch16     | 25.64%               | 29.13%                                                        | 28.70%                     |
| clip-vit-large-patch14    | 30.58%               | 31.71%                                                        | 44.31%                     |
| siglip-base-patch16-512   | 48.71%               | 55.90%                                                        | 68.19%                     |
| siglip-large-patch16-384  | 54.45%               | 53.49%                                                

In [4]:
# Dataset: ICONCLASS
dataset_name = "ICONCLASS"

# One accuracy dictionary per test run, keyed by model name
test_1_accuracy, test_2_accuracy, test_3_accuracy = (
    get_accuracy(run_path, dataset_name)
    for run_path in (test_1_path, test_2_path, test_3_path)
)

# Build the results table (no baseline row is added for this dataset)
df = create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy)

# Emit a Markdown heading followed by the table and two blank lines
print(f"## {dataset_name} Results")
print(df.to_markdown(), end="\n\n\n")

## ICONCLASS Results
| Model                     | zero-shot (labels)   | few-shot[*](dataset/ICONCLASS-data/few-shot/README.md) (labels)   | zero-shot (descriptions)   |
|:--------------------------|:---------------------|:------------------------------------------------------------------|:---------------------------|
| clip-vit-base-patch32     | 23.48%               | 29.56%                                                            | 26.35%                     |
| clip-vit-base-patch16     | 28.89%               | 32.60%                                                            | 26.35%                     |
| clip-vit-large-patch14    | 39.36%               | 42.74%                                                            | 34.29%                     |
| siglip-base-patch16-512   | 43.41%               | 42.23%                                                            | 31.59%                     |
| siglip-large-patch16-384  | 49.16%               | 49.49%                    

In [7]:
# Dataset: IconArt
# NOTE(review): the results are looked up (and the heading is printed) under
# the name "wikidata" - confirm that this is the IconArt dataset.
dataset_name = "wikidata"

# One accuracy dictionary per test run, keyed by model name
test_1_accuracy, test_2_accuracy, test_3_accuracy = (
    get_accuracy(run_path, dataset_name)
    for run_path in (test_1_path, test_2_path, test_3_path)
)

# Build the results table (no baseline row is added for this dataset)
df = create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy)

# Emit a Markdown heading followed by the table and two blank lines
print(f"## {dataset_name} Results")
print(df.to_markdown(), end="\n\n\n")

## wikidata Results
| Model                     | zero-shot (labels)   | few-shot[*](dataset/wikidata-data/few-shot/README.md) (labels)   | zero-shot (descriptions)   |
|:--------------------------|:---------------------|:-----------------------------------------------------------------|:---------------------------|
| clip-vit-base-patch32     | 40.95%               |                                                                  | 42.20%                     |
| clip-vit-base-patch16     | 48.33%               |                                                                  | 42.90%                     |
| clip-vit-large-patch14    | 54.46%               |                                                                  | 50.84%                     |
| siglip-base-patch16-512   | 56.55%               |                                                                  | 45.40%                     |
| siglip-large-patch16-384  | 60.31%               |                                  

In [6]:
# Template for adding a new dataset
# The code below is wrapped in a triple-quoted string so it never runs.
# To use it, copy the lines between the quotes into a new cell and replace
# [NEW_DATASET_NAME] with the dataset's folder name.
# Because the string is the last expression in the cell, its repr is echoed
# as the cell output.
# NOTE(review): this is not a raw string, so the "\n" inside print("\n") is
# stored as a real line break; the string value itself is therefore not
# runnable code. Copy from the cell source, not from the string value.
"""
# Dataset: [NEW_DATASET_NAME]
dataset_name = "[NEW_DATASET_NAME]"
test_1_accuracy = get_accuracy(test_1_path, dataset_name)
test_2_accuracy = get_accuracy(test_2_path, dataset_name)
test_3_accuracy = get_accuracy(test_3_path, dataset_name)

# Create DataFrame
df = create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy)

# Add baseline if available
baseline_accuracy = calculate_baseline(dataset_name)
if baseline_accuracy is not None:
    df.loc['Baseline'] = [f"{baseline_accuracy:.2%}", "", ""]

# Format and display
print(f"## {dataset_name} Results")
print(df.to_markdown())
print("\n")
"""


'\n# Dataset: [NEW_DATASET_NAME]\ndataset_name = "[NEW_DATASET_NAME]"\ntest_1_accuracy = get_accuracy(test_1_path, dataset_name)\ntest_2_accuracy = get_accuracy(test_2_path, dataset_name)\ntest_3_accuracy = get_accuracy(test_3_path, dataset_name)\n\n# Create DataFrame\ndf = create_dataframe(dataset_name, test_1_accuracy, test_2_accuracy, test_3_accuracy)\n\n# Add baseline if available\nbaseline_accuracy = calculate_baseline(dataset_name)\nif baseline_accuracy is not None:\n    df.loc[\'Baseline\'] = [f"{baseline_accuracy:.2%}", "", ""]\n\n# Format and display\nprint(f"## {dataset_name} Results")\nprint(df.to_markdown())\nprint("\n")\n'