# SETUP

## Repo Installation

In [None]:
import sys

# Submodlib for DISTIL; utilize oom_fix branch instead of default pip installation.
# Each `!` line runs in its own shell, so every step after the clone re-enters the
# checkout with `cd submodlib && ...`.
!git clone https://github.com/decile-team/submodlib.git
!cd submodlib && git checkout oom_fix
!cd submodlib && pip install -r requirements.txt
!cd submodlib && python setup.py bdist_wheel
!cd submodlib && pip install .

# DISTIL
# The repository is cloned (not pip-installed); appending the clone to sys.path is what
# lets the `from distil...` imports in the next cell resolve.
# NOTE(review): the hard-coded path assumes the clone lands in /content (Colab's default
# working directory) -- confirm when running elsewhere.
!git clone https://github.com/decile-team/distil.git
sys.path.append("/content/distil/")

# Plotting utilities
# LaTeX toolchain for Matplotlib: the plot-styling cell later in this notebook sets
# text.usetex to True.
!sudo apt-get update
!sudo apt update
!sudo apt-get install texlive-latex-recommended
!sudo apt install texlive-latex-extra
!sudo apt install dvipng
!sudo apt install cm-super

## Imports

In [None]:
import copy
import csv
import gc
import json
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import re
import sklearn
import submodlib
import sys
import time
import torch
import torch.multiprocessing as mp
import torch.optim as optim
import zipfile

from distil.active_learning_strategies.badge import BADGE
from distil.active_learning_strategies.entropy_sampling import EntropySampling
from distil.active_learning_strategies.least_confidence_sampling import LeastConfidenceSampling
from distil.active_learning_strategies.margin_sampling import MarginSampling
from distil.active_learning_strategies.partition_strategy import PartitionStrategy as ALPartitionStrategy
from distil.active_learning_strategies.random_sampling import RandomSampling
from distil.active_learning_strategies.strategy import Strategy
from distil.utils.models import MnistNet, ResNet18
from distil.utils.train_helper import data_train
from distil.utils.utils import LabeledToUnlabeledDataset

from PIL import Image

from scipy.io import loadmat

from google.colab import drive
from google.colab import files
drive.mount("/content/drive")

from torch import nn, Tensor
from torch.utils.data import ConcatDataset, DataLoader, Dataset, Subset

from torchvision import datasets, transforms
from torchvision.datasets.utils import download_and_extract_archive
from torchvision._internally_replaced_utils import load_state_dict_from_url

from typing import Type, Any, Callable, Union, List, Optional

## Saving Parameters

In [None]:
# Mount Google Drive, then re-point mount_point_directory at the "MyDrive" folder inside
# the mount. (The imports cell also calls drive.mount on "/content/drive".)
mount_point_directory = "/content/drive/"
drive.mount(mount_point_directory)
mount_point_directory = os.path.join(mount_point_directory, "MyDrive")

# `google_drive_directory` is reused as a scratch variable: it is reassigned before each
# join below, so after this cell it holds only the last value (the model folder).

# Drive folder joined into base_save_directory.
google_drive_directory = "auto_labeling_experiments/results/"
base_save_directory = os.path.join(mount_point_directory, google_drive_directory)

# Drive folder joined into base_labeling_directory.
google_drive_directory = "AL_HC_Auto/Human_Expts/"
base_labeling_directory = os.path.join(mount_point_directory, google_drive_directory)

# Drive folder joined into model_directory.
google_drive_directory = "auto_labeling_experiments/model/"
model_directory = os.path.join(mount_point_directory, google_drive_directory)

# Local (non-Drive) root used for dataset files.
dataset_root_directory = "/content/"

# Only the model directory is created here; the other two Drive folders are not.
os.makedirs(model_directory, exist_ok=True)

## Pre-Experiment

In [None]:
# Shared settings dictionary for the experiments.
# NOTE(review): presumably consumed by DISTIL's training helper and selection strategies;
# confirm each key's meaning against the DISTIL documentation.
args = dict(
    islogs=False,
    optimizer='sgd',
    isverbose=True,
    isreset=True,
    max_accuracy=0.99,
    n_epoch=300,
    lr=0.001,
    device='cuda',
    batch_size=64,
    thread_count=3,
    metric='cosine',
    embedding_type='gradients',
    gradType='bias_linear',
)

# EXPERIMENT

## Definitions

### Evaluation Utilities

In [None]:
def get_label_counts(dataset, nclasses, batch_size=64):
    """
    Tally how many samples of each class a dataset contains.

    Args:
        dataset: map-style dataset yielding (data, label) pairs.
        nclasses (int): number of classes; labels are compared against 0..nclasses-1,
            anything outside that range is not counted.
        batch_size (int): batch size used while scanning the dataset.

    Returns:
        list[int]: entry ``c`` is the number of samples whose label equals ``c``.
    """
    loader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
    per_class_totals = [0] * nclasses

    with torch.no_grad():
        for _, batch_labels in loader:
            for class_id in range(nclasses):
                # Number of entries in this batch carrying the current class label.
                per_class_totals[class_id] += int((batch_labels == class_id).sum())

    return per_class_totals

def get_labels(dataset, batch_size=64):
    """
    Collect every label of a dataset, in dataset order, into one tensor.

    The dataset is scanned batch-wise without shuffling and the per-batch label tensors
    are concatenated. This avoids building a Python list with one 0-d tensor per sample
    and converting it element by element.

    Args:
        dataset: map-style dataset yielding (data, label) pairs.
        batch_size (int): batch size used while scanning the dataset.

    Returns:
        torch.Tensor: labels of all samples; an empty tensor if the dataset is empty.
    """
    dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

    # as_tensor is a no-op for the tensors the default collate function produces.
    label_batches = [torch.as_tensor(labels) for _, labels in dataloader]

    if not label_batches:
        # torch.cat rejects an empty list; keep the empty-tensor result for this case.
        return torch.tensor([])

    return torch.cat(label_batches)

### Experiment Fixture Creation

In [None]:
def get_tiny_imagenet(dataset_root_path):
    """
    Download, extract and re-arrange TinyImageNet under ``dataset_root_path``.

    Steps (each one is skipped when its result is already on disk, so the function can be
    called repeatedly):
      1. download ``tiny-imagenet-200.zip`` with ``wget`` if the archive is missing;
      2. extract it if the ``tiny-imagenet-200`` folder is missing;
      3. move every validation image from ``val/images`` into ``val/<class>/images`` as
         listed in ``val/val_annotations.txt``, then remove the emptied ``val/images``.

    Args:
        dataset_root_path (str): directory that holds (or will hold) the archive and the
            extracted ``tiny-imagenet-200`` folder.

    Raises:
        subprocess.CalledProcessError: if the ``wget`` download fails.
    """
    # Function-scope import: subprocess is not part of the notebook's import cell.
    import subprocess

    # Download and extract TinyImageNet if it isn't already.
    filepath = os.path.join(dataset_root_path, "tiny-imagenet-200.zip")
    if not os.path.exists(filepath):
        # Argument list (no shell) so the path is never interpreted by a shell;
        # check=True surfaces a failed download here rather than as a missing file later.
        subprocess.run(["wget", "-P", dataset_root_path, "http://cs231n.stanford.edu/tiny-imagenet-200.zip"], check=True)

    dataset_path = os.path.join(dataset_root_path, "tiny-imagenet-200")
    if not os.path.exists(dataset_path):
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(dataset_root_path)

    # TinyImageNet has a test set, but its labels are not available (following good practice).
    # Hence, we must evaluate on the validation set. We prepare the validation set according to
    # https://towardsdatascience.com/pytorch-ignite-classifying-tiny-imagenet-with-efficientnet-e5b1768e5e8f
    # so that PyTorch's ImageFolder class can be used.
    validation_dir = os.path.join(dataset_path, 'val')

    # Create image filename to class dictionary from the tab-separated annotations file
    val_image_filename_to_class_dict = {}
    with open(os.path.join(validation_dir, 'val_annotations.txt'), 'r') as fp:
        for line in fp:
            words = line.split('\t')
            if len(words) < 2:
                # Blank or malformed line: nothing to map.
                continue
            val_image_filename_to_class_dict[words[0]] = words[1]

    # Move each image into its own class folder
    old_val_img_dir = os.path.join(validation_dir, 'images')
    for img, folder in val_image_filename_to_class_dict.items():
        newpath = os.path.join(validation_dir, folder, 'images')
        os.makedirs(newpath, exist_ok=True)
        old_img_path = os.path.join(old_val_img_dir, img)
        # Already-moved images (from an earlier call) no longer exist at the old location.
        if os.path.exists(old_img_path):
            os.rename(old_img_path, os.path.join(newpath, img))
    if os.path.exists(old_val_img_dir):
        os.rmdir(old_val_img_dir)

def get_experiment_fixture(dataset_root_path, dataset_name, seed_set_size, model_name, model_base_path, init_model_train_args):
    """
    Build the datasets, the class-balanced seed split and the initial model of an experiment.

    Args:
        dataset_root_path (str): directory the datasets are downloaded to / read from.
        dataset_name (str): one of "CIFAR10", "CIFAR100", "MNIST", "TinyImageNet".
        seed_set_size (int): requested size of the labeled seed set. The seed set holds
            ``seed_set_size // nclasses`` points per class, so it is smaller than requested
            when the size is not a multiple of the class count.
        model_name (str): "resnet18" or "mnistnet".
        model_base_path (str): directory holding cached initial-model checkpoints, stored
            under the file name "{dataset_name}_{model_name}_{seed_set_size}".
        init_model_train_args: arguments handed to ``data_train`` when the initial model
            has to be trained.

    Returns:
        tuple: (full_train_dataset, test_dataset, initial_train_idx, model, nclasses) where
        ``initial_train_idx`` lists the seed-set indices into ``full_train_dataset``.

    Raises:
        ValueError: for an unsupported ``dataset_name`` or ``model_name``.

    Note:
        The global NumPy RNG is re-seeded with 42, so the seed split is the same on every
        call with the same dataset and seed size.
    """

    # Load the dataset
    if dataset_name == "CIFAR10":

        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
        test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

        full_train_dataset = datasets.CIFAR10(dataset_root_path, download=True, train=True, transform=train_transform)
        test_dataset = datasets.CIFAR10(dataset_root_path, download=True, train=False, transform=test_transform)

        nclasses = 10 # NUM CLASSES HERE

    elif dataset_name == "CIFAR100":

        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))])
        test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))])

        full_train_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=True, transform=train_transform)
        test_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=False, transform=test_transform)

        nclasses = 100 # NUM CLASSES HERE

    elif dataset_name == "MNIST":

        image_dim=28
        train_transform = transforms.Compose([transforms.Resize((image_dim, image_dim)), transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        test_transform = transforms.Compose([transforms.Resize((image_dim, image_dim)), transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

        full_train_dataset = datasets.MNIST(dataset_root_path, download=True, train=True, transform=train_transform)
        test_dataset = datasets.MNIST(dataset_root_path, download=True, train=False, transform=test_transform)

        nclasses = 10 # NUM CLASSES HERE

    elif dataset_name == "TinyImageNet":

        get_tiny_imagenet(dataset_root_path)

        train_transform = transforms.Compose([transforms.RandomCrop(64, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
        test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) # ImageNet mean/std

        # Use val as test
        train_path = os.path.join(dataset_root_path, "tiny-imagenet-200", "train")
        test_path = os.path.join(dataset_root_path, "tiny-imagenet-200", "val")
        full_train_dataset = datasets.ImageFolder(train_path, transform=train_transform)
        test_dataset = datasets.ImageFolder(test_path, transform=test_transform)

        nclasses = 200

    else:
        # Fail with a clear message instead of a NameError on `nclasses` further down.
        raise ValueError(F"Unsupported dataset_name {dataset_name!r}; expected CIFAR10, CIFAR100, MNIST or TinyImageNet")

    if model_name == "resnet18":
        model = ResNet18(num_classes=nclasses)
    elif model_name == "mnistnet":
        model = MnistNet()
    else:
        raise ValueError("Add model implementation")

    # Seed the rng used in dataset splits
    np.random.seed(42)

    # Retrieve the labels of the training set (this iterates the whole training dataset once)
    train_labels = get_labels(full_train_dataset)

    # Derive a list of indices that will represent the training set indices. The rest will represent the unlabeled set indices.
    per_class_size = seed_set_size // nclasses
    initial_train_idx = []
    for cls in range(nclasses):

        # Sample random points per class to form a balanced seed
        cls_idx = torch.where(train_labels==cls)[0]
        chosen_idx = np.random.choice(cls_idx, size=per_class_size, replace=False)
        initial_train_idx.extend(chosen_idx)

    # See if a model has already been trained for this fixture.
    checkpoint_name = F"{dataset_name}_{model_name}_{seed_set_size}"
    model_save_path = os.path.join(model_base_path, checkpoint_name)

    if os.path.isfile(model_save_path):
        print("Found Initial Model")
        state_dict = torch.load(model_save_path)
        model.load_state_dict(state_dict)
    else:
        print("Training Initial Model...")
        # Make sure the checkpoint can be written before spending time on training.
        os.makedirs(model_base_path, exist_ok=True)
        init_trainer = data_train(Subset(full_train_dataset, initial_train_idx), model, init_model_train_args)
        model = init_trainer.train(None)
        torch.save(model.state_dict(), model_save_path)

    return full_train_dataset, test_dataset, initial_train_idx, model, nclasses

### Datasets

In [None]:
class CarsDataset(Dataset):
    """
    Stanford Cars-196 exposed as a PyTorch ``Dataset``. Skeleton adapted from
    https://github.com/dtiarks/pytorch_cars.
    """

    def __init__(self, root_directory, train=True, download=False, transform=None):
        """
        Args:
            root_directory (string): Directory that contains the "cars196" folder.
            train (bool): Use the train annotations/images when truthy, else the test ones.
            download (bool): Fetch and unpack every archive whose file is not already
                present in the "cars196" folder.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        cars_root = os.path.join(root_directory, "cars196")

        # Download the dataset if needed.
        if download:
            # (url, archive file whose presence means "already fetched")
            sources = (
                ("http://ai.stanford.edu/~jkrause/car196/cars_train.tgz", "cars_train.tgz"),
                ("http://ai.stanford.edu/~jkrause/car196/cars_test.tgz", "cars_test.tgz"),
                ("https://ai.stanford.edu/~jkrause/cars/car_devkit.tgz", "car_devkit.tgz"),
            )
            for url, marker_name in sources:
                if not os.path.exists(os.path.join(cars_root, marker_name)):
                    download_and_extract_archive(url, cars_root)

        # Annotation file + image folder depend on the requested split
        if train:
            annotation_file, image_folder = "cars_train_annos.mat", "cars_train"
        else:
            annotation_file, image_folder = "cars_test_annos.mat", "cars_test"
        self.data_dir = os.path.join(cars_root, image_folder)

        self.full_data_set = loadmat(os.path.join(cars_root, "devkit", annotation_file))
        self.car_annotations = self.full_data_set['annotations'][0]

        class_meta = loadmat(os.path.join(cars_root, "devkit", "cars_meta.mat"))
        self.car_names = np.array(class_meta['class_names'][0])

        self.transform = transform

    def __len__(self):
        """Number of annotated images in the selected split."""
        return len(self.car_annotations)

    def __getitem__(self, idx):
        """Return ``(image, car_class)`` for annotation ``idx``; the class is zero-indexed."""
        record = self.car_annotations[idx]

        # The last annotation field is read as the file name, the one before it as the class.
        image = Image.open(os.path.join(self.data_dir, record[-1][0]))
        car_class = record[-2][0][0] - 1 # zero-index the class

        if self.transform:
            image = self.transform(image)

        return image, car_class
    
class BirdsDataset(Dataset):
    """
    Caltech-UCSD Birds (CUB-200-2011) exposed as a PyTorch ``Dataset``.

    The split, file names and class labels are read from the text files shipped in the
    "CUB_200_2011" folder; each of them holds space-separated ``<image_id> <value>`` rows.
    """

    def __init__(self, root_directory, train=True, download=False, transform=None):
        """
        Args:
            root_directory (string): Directory that contains the "caltech_birds" folder.
            train (bool): Keep the images flagged as training images when truthy,
                otherwise the remaining ones.
            download (bool): Fetch and unpack the archive if "CUB_200_2011" is missing.
            transform (callable, optional): Applied to each image before it is returned.
        """

        birds_root = os.path.join(root_directory, "caltech_birds")
        cub_root = os.path.join(birds_root, "CUB_200_2011")

        # Download if needed
        if download:
            sources = (("http://www.vision.caltech.edu/visipedia-data/CUB-200-2011/CUB_200_2011.tgz", "CUB_200_2011"),)
            for url, marker_name in sources:
                if not os.path.exists(os.path.join(birds_root, marker_name)):
                    download_and_extract_archive(url, birds_root)

        # image id -> path relative to the images folder
        with open(os.path.join(cub_root, "images.txt"), "r") as map_file:
            id_to_filename_map = {int(image_id): rel_path
                                  for image_id, rel_path in csv.reader(map_file, delimiter = " ")}

        # image id -> zero-based class (the file stores labels starting at 1)
        with open(os.path.join(cub_root, "image_class_labels.txt"), "r") as map_file:
            id_to_class_map = {int(image_id): int(class_label) - 1
                               for image_id, class_label in csv.reader(map_file, delimiter = " ")}

        # ids of the requested split: keep an id when its train flag agrees with `train`
        with open(os.path.join(cub_root, "train_test_split.txt"), "r") as split_file:
            split_subset = [int(image_id)
                            for image_id, is_train_image in csv.reader(split_file, delimiter = " ")
                            if bool(int(is_train_image)) == bool(train)]

        # Parallel lists of file paths and classes, in split-file order
        image_folder_root = os.path.join(cub_root, "images")
        self.filepaths = [os.path.join(image_folder_root, id_to_filename_map[image_id]) for image_id in split_subset]
        self.classes = [id_to_class_map[image_id] for image_id in split_subset]

        self.transform = transform

    def __getitem__(self, index):
        """Return ``(image, label)`` for position ``index`` of the selected split."""

        img_name = self.filepaths[index]
        label = self.classes[index]
        image = Image.open(img_name)

        if self.transform is None:
            return image, label

        return self.transform(image), label

    def __len__(self):
        """Number of images in the selected split."""
        return len(self.classes)

class DogsDataset(Dataset):
    """
    Stanford Dogs exposed as a PyTorch ``Dataset``.

    File names and labels come from "train_list.mat" / "test_list.mat" in the
    "stanford_dogs" folder; images are read from its "Images" sub-folder.
    """

    def __init__(self, root_directory, train=True, download=False, transform=None):
        """
        Args:
            root_directory (string): Directory that contains the "stanford_dogs" folder.
            train (bool): Read "train_list.mat" when truthy, else "test_list.mat".
            download (bool): Fetch and unpack every archive whose file is not already
                present in the "stanford_dogs" folder.
            transform (callable, optional): Applied to each image before it is returned.
        """

        dogs_root = os.path.join(root_directory, "stanford_dogs")

        # Download if needed
        if download:
            # (url, archive file whose presence means "already fetched")
            sources = (
                ("http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar","images.tar"),
                ("http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar","annotation.tar"),
                ("http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar","lists.tar"),
            )
            for url, marker_name in sources:
                if not os.path.exists(os.path.join(dogs_root, marker_name)):
                    download_and_extract_archive(url, dogs_root)

        list_file = "train_list.mat" if train else "test_list.mat"
        dataset_mat = loadmat(os.path.join(dogs_root, list_file))

        # Each 'file_list' entry wraps the relative image path two levels deep.
        self.filepaths = [os.path.join(dogs_root, "Images", entry[0][0]) for entry in dataset_mat['file_list']]

        # Stored labels start at 1; shift them to start at 0.
        self.labels = [entry[0] - 1 for entry in dataset_mat['labels']]

        self.transform = transform

    def __getitem__(self, index):
        """Return ``(image, label)`` for position ``index`` of the selected split."""

        img_name = self.filepaths[index]
        label = self.labels[index]
        image = Image.open(img_name)

        if self.transform is None:
            return image, label

        return self.transform(image), label

    def __len__(self):
        """Number of images in the selected split."""
        return len(self.labels)

# PLOTTING

## Definitions

### Default Plot Styling

In [None]:
# Render all text through LaTeX.
matplotlib.rcParams['text.usetex'] = True

# The preamble is a single rcParam: every assignment replaces the previous value, so all
# packages/commands have to go into one value. It is given as one string (newer Matplotlib
# releases no longer accept a list here) and without commas, since older releases split
# string values on commas.
matplotlib.rcParams['text.latex.preamble'] = r"\usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\boldmath"

# Hide the top and right axes spines.
matplotlib.rcParams['axes.spines.right'] = False
matplotlib.rcParams['axes.spines.top'] = False
plt.rc('axes', linewidth=1)

# Fonts and tick label sizes.
plt.rc('font', family='serif')
plt.rc('font', weight='bold')
plt.rc('xtick', labelsize=18)
plt.rc('ytick', labelsize=18)

# Figure size (inches) and resolution.
figdim = (12,6)
figdpi = 120

# Opacity of shaded bands and font sizes of plot annotations.
shade_alpha = 0.2
axis_label_font_size = 18
legend_font_size = 18
plot_title_font_size = 18

### Load Experiment Results

#### Labeling Experiments

In [None]:
def get_labeling_experiment_results(base_labeling_directory, dataset_name):
    """
    Compute correct-vs-verify labeling time ratios from the summary CSVs of a dataset.

    Every file in the dataset's folder whose name contains "summary" is read. Only rows
    with exactly four columns and a first column of '0' or '1' are used: the third and
    fourth columns are taken as the average and median time; flag '1' marks the
    "correct" timings and flag '0' the "verify" timings. If a flag occurs several times
    in one file, the last occurrence wins.

    Args:
        base_labeling_directory (str): directory holding one folder per dataset.
        dataset_name (str): one of "uc_merced", "svhn", "stl10", "cifar10".

    Returns:
        tuple[list[float], list[float]]: (average_ratios, median_ratios), one entry per
        usable summary file, in sorted file-name order. Each entry is
        time-to-correct / time-to-verify.

    Raises:
        ValueError: if ``dataset_name`` is not one of the supported names.
    """

    folder_names = {"uc_merced": "uc_merced", "svhn": "SVHN", "stl10": "STL10", "cifar10": "CIFAR-10"}
    if dataset_name not in folder_names:
        raise ValueError(F"Unknown dataset_name {dataset_name!r}; expected one of {sorted(folder_names)}")

    # Get path of directory containing all csv files for that dataset
    dataset_csv_path = os.path.join(base_labeling_directory, folder_names[dataset_name])

    average_ratios = []
    median_ratios = []

    # Sorted so the order of the returned ratios does not depend on the file system.
    for file_in_path in sorted(os.listdir(dataset_csv_path)):

        # Ignore files that aren't summaries
        if "summary" not in file_in_path:
            continue

        file_path = os.path.join(dataset_csv_path, file_in_path)

        # flag -> (average time, median time); rebuilt for every file so that values from
        # a previous file can never leak into this one.
        timings = {}
        with open(file_path, "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                if len(row) != 4 or row[0] not in ('0', '1'):
                    continue

                print(file_in_path, row)
                timings[row[0]] = (float(row[2]), float(row[3]))

        if '0' not in timings or '1' not in timings:
            # Without both rows no ratio can be formed for this file.
            print(F"Skipping {file_in_path}: missing verify ('0') or correct ('1') summary row")
            continue

        avg_time_to_verify, med_time_to_verify = timings['0']
        avg_time_to_correct, med_time_to_correct = timings['1']

        average_ratios.append(avg_time_to_correct / avg_time_to_verify)
        median_ratios.append(med_time_to_correct / med_time_to_verify)

    return average_ratios, median_ratios

#### Method Experiments

In [None]:
def get_experiment_results(auto_label_load_directory, dataset_name, al_strategy, seed_size, b_config):
    """
    Load the experiment JSON files of one (dataset, AL strategy, seed size) setting and
    group the runs of each experiment configuration.

    File names are split on "_" (with "highest_confidence" kept as a single field); the
    fields are read by position:
        0 dataset, 1 al_strategy, 2 human_correct_strategy, 3 auto_assign_strategy,
        4-6 the three b values, 7 seed_size, 8 rounds, 10 the "True"/other adaptive flag.
    Field 9 is not read here.

    Args:
        auto_label_load_directory (str): directory containing the result files.
        dataset_name (str): required value of field 0.
        al_strategy (str): required value of field 1.
        seed_size (int): required value of field 7.
        b_config: collection of (b, b, b, is_adaptive) tuples; files whose fields 4-6 and
            adaptive flag are not in it are ignored.

    Returns:
        dict: maps (human_correct_strategy, auto_assign_strategy, b, b, b, rounds,
        is_adaptive) to the list of loaded result dicts. Within each list, every dict is
        truncated to the smallest number of rounds any of them contains.
    """

    grouped_runs = {}

    for filename in os.listdir(auto_label_load_directory):

        # Get filename fields
        stem = filename.split(".")[0]
        if "highest_confidence" in filename:
            head_and_tail = stem.split("_highest_confidence_")
            fields = head_and_tail[0].split("_") + ["highest_confidence"] + head_and_tail[1].split("_")
        else:
            fields = stem.split("_")

        # Ensure filename fields match the fixed fields
        matches_fixed_fields = (dataset_name == fields[0]
                                and al_strategy == fields[1]
                                and seed_size == int(fields[7]))
        if not matches_fixed_fields:
            print(fields)
            continue

        # Ensure the b fields and the adaptive flag are in the b_config list
        b_values = (int(fields[4]), int(fields[5]), int(fields[6]))
        is_adaptive = fields[10] == "True"
        if b_values + (is_adaptive,) not in b_config:
            continue

        # The file is an experiment we want to include; runs are grouped by this key.
        group_key = (fields[2], fields[3]) + b_values + (int(fields[8]), is_adaptive)

        # Get the json, putting it in the experiment results dictionary.
        with open(os.path.join(auto_label_load_directory, filename), "r") as json_file:
            run = json.load(json_file)
        grouped_runs.setdefault(group_key, []).append(run)

    # (key, extra entries kept): per-round lists keep `rounds` entries, the two lists that
    # also hold the initial round keep one more.
    truncation_plan = (
        ('auto_assigned_selected_idx', 0),
        ('human_corrected_selected_idx', 0),
        ('active_learning_selected_idx', 0),
        ('auto_assigned_selection_matrices', 0),
        ('human_corrected_selection_matrices', 0),
        ('auto_assign_selection_times', 0),
        ('human_correct_selection_times', 0),
        ('al_selection_times', 0),
        ('test_accuracies', 1),
        ('set_sizes', 1),
        ('train_times', 0),
    )

    # In each dictionary, keep only as many rounds as all dicts have.
    for runs in grouped_runs.values():
        shared_rounds = min(len(run['set_sizes']) - 1 for run in runs) # -1 for initial round
        for run in runs:
            for result_key, extra in truncation_plan:
                run[result_key] = run[result_key][:shared_rounds + extra]

    return grouped_runs

### Averages/STDs of Experiments

**Averages/STDs of Lists**

In [None]:
def get_avg_std(list_of_lists):
    """
    Element-wise mean and sample standard deviation across several lists.

    Position ``i`` of the results is computed from the ``i``-th entries of all input
    lists. If the lists differ in length, only the positions present in every list are
    used (the results have the length of the shortest list).

    Args:
        list_of_lists: sequence of numeric lists, one per run.

    Returns:
        tuple[list, list]: (avg_list, std_list). The standard deviation uses the
        (n - 1) denominator; with a single input list it is all zeros, and with no
        input lists both results are empty.
    """

    num_lists = len(list_of_lists)
    if num_lists == 0:
        # Nothing to average; an empty result lets callers handle "no runs" gracefully.
        return [], []

    # One tuple per position, holding that position's value from every list.
    columns = list(zip(*list_of_lists))

    avg_list = [sum(column) / num_lists for column in columns]

    if num_lists == 1:
        return avg_list, [0.0 for _ in avg_list]

    # Sample standard deviation per position.
    std_list = [
        math.sqrt(sum((value - avg) * (value - avg) for value in column) / (num_lists - 1))
        for column, avg in zip(columns, avg_list)
    ]

    return avg_list, std_list

#### Get Average/STD of Test Accuracies

In [None]:
def get_avg_std_test_acc(list_of_exps):
    """
    Average and standard deviation of the test-accuracy curves of several runs.

    Args:
        list_of_exps: experiment result dicts, each holding a 'test_accuracies' list.

    Returns:
        The (avg_list, std_list) pair produced by ``get_avg_std`` over those lists.
    """

    accuracy_curves = [experiment['test_accuracies'] for experiment in list_of_exps]
    return get_avg_std(accuracy_curves)

#### Get Average/STD of % Correctly Auto-Labeled Points

In [None]:
def get_avg_std_correctly_auto_labeled_points(list_of_exps):
    """
    Average and standard deviation, per round, of the fraction of auto-assigned points
    that lie on the diagonal of the round's selection matrix.

    Args:
        list_of_exps: experiment result dicts, each holding a list of square matrices
            under 'auto_assigned_selection_matrices' (one matrix per round).

    Returns:
        The (avg_list, std_list) pair produced by ``get_avg_std`` over the per-run
        fraction lists. A round whose matrix sums to zero contributes 0.
    """

    per_experiment_fractions = []
    for experiment in list_of_exps:
        round_fractions = []
        for matrix in experiment['auto_assigned_selection_matrices']:
            # Diagonal entries: selection class equals the true class.
            diagonal_share = 0.
            for class_idx in range(len(matrix)):
                diagonal_share += matrix[class_idx][class_idx]

            # Normalise by the number of points in the matrix, unless it is empty.
            matrix_total = sum([sum(matrix_row) for matrix_row in matrix])
            if matrix_total > 0:
                diagonal_share /= matrix_total

            round_fractions.append(diagonal_share)
        per_experiment_fractions.append(round_fractions)

    return get_avg_std(per_experiment_fractions)

#### Get Average/STD of % Correctly Suggested Human-Corrected Points

In [None]:
def get_avg_std_correctly_suggested_human_corrected_points(list_of_exps):
    """
    Average and standard deviation, per round, of the fraction of human-corrected points
    that lie on the diagonal of the round's selection matrix.

    Args:
        list_of_exps: experiment result dicts, each holding a list of square matrices
            under 'human_corrected_selection_matrices' (one matrix per round).

    Returns:
        The (avg_list, std_list) pair produced by ``get_avg_std`` over the per-run
        fraction lists. A round whose matrix sums to zero contributes 0.
    """

    def diagonal_fraction(matrix):
        # Sum of the diagonal, divided by the sum of all entries when that sum is positive.
        on_diagonal = sum((matrix[i][i] for i in range(len(matrix))), 0.)
        everything = sum([sum(matrix_row) for matrix_row in matrix])
        return on_diagonal / everything if everything > 0 else on_diagonal

    fractions_per_run = [
        [diagonal_fraction(matrix) for matrix in run['human_corrected_selection_matrices']]
        for run in list_of_exps
    ]
    return get_avg_std(fractions_per_run)

#### Get Average/STD of % Correctly Suggested AL Points

In [None]:
def get_avg_std_correctly_suggested_al_points(list_of_exps, al_pseudo_dataset):
    """
    Average and standard deviation, per round, of how often the label suggested for an
    actively selected point matches the dataset's label.

    Args:
        list_of_exps: experiment result dicts. 'active_learning_selected_idx' holds one
            list per round of (selected_index, predicted_class) pairs.
        al_pseudo_dataset: indexable dataset returning (sample, label) pairs; only the
            label is used.

    Returns:
        The (avg_list, std_list) pair produced by ``get_avg_std`` over the per-run
        accuracy lists. A round without selected points has accuracy 0.0.
    """

    experiment_suggestion_accuracies = []
    for experiment in list_of_exps:

        experiment_al_suggestion_accuracy = []

        for round_selection in experiment['active_learning_selected_idx']:
            total_points_in_round = len(round_selection)
            total_correctly_suggested_points_in_round = 0
            for selected_index, predicted_class in round_selection:
                _, ground_truth_label = al_pseudo_dataset[selected_index]
                if ground_truth_label == predicted_class:
                    total_correctly_suggested_points_in_round += 1

            # An empty round would otherwise divide by zero; report 0.0, as the sibling
            # helpers do for empty selection matrices.
            if total_points_in_round > 0:
                al_suggestion_acc_in_round = total_correctly_suggested_points_in_round / total_points_in_round
            else:
                al_suggestion_acc_in_round = 0.0
            experiment_al_suggestion_accuracy.append(al_suggestion_acc_in_round)

        experiment_suggestion_accuracies.append(experiment_al_suggestion_accuracy)

    return get_avg_std(experiment_suggestion_accuracies)

#### Get Average/STD of Cumulative Corrections Made

In [None]:
def get_avg_std_total_corrections_needed(list_of_exps):
    """
    Average and standard deviation of the cumulative number of corrections per round.

    For every round the off-diagonal entries of the human-corrected selection matrix are
    summed; the value reported for a round is the running total up to and including it.

    Args:
        list_of_exps: experiment result dicts, each holding a list of square matrices
            under 'human_corrected_selection_matrices' (one matrix per round).

    Returns:
        The (avg_list, std_list) pair produced by ``get_avg_std`` over the per-run
        cumulative lists.
    """

    cumulative_per_run = []
    for run in list_of_exps:
        running_total = 0
        cumulative_corrections = []
        for matrix in run['human_corrected_selection_matrices']:
            # Row sum minus the diagonal entry = off-diagonal points of that row.
            running_total += sum(sum(matrix[i]) - matrix[i][i] for i in range(len(matrix)))
            cumulative_corrections.append(running_total)
        cumulative_per_run.append(cumulative_corrections)

    return get_avg_std(cumulative_per_run)

#### Get Average/STD of # Incorrectly Labeled Points versus Score

In [None]:
def get_avg_std_inc_label_score(exp_save_directory, dataset_root_directory, dataset_name, model_name, model_directory, selection_mode, seed_set_size, budget, alpha, beta, train_cap, per_exp_runs, args, round, budget_granularity=0.1, human_corrected=False):
    """
    For one round, count how many incorrectly labeled points fall at or below successive
    gain cutoffs, averaged over experiments.

    Per experiment, the selected (index, gain) pairs of the round are min-max normalized
    within their class row, pooled, sorted by normalized gain, and then cut at
    ``1 / budget_granularity`` equally spaced positions.

    Returns:
        tuple: (point_cutoffs, (avg_list, std_list)). ``point_cutoffs`` is the cutoff list
        of the last experiment processed (empty if there were none).

    NOTE(review): as written this function cannot run against the
    ``get_experiment_results`` defined in this notebook: that function takes five
    parameters (directory, dataset name, AL strategy, seed size, b_config) while the call
    below passes thirteen positional arguments, and it returns a dict keyed by tuples,
    whereas the loop below indexes each item like an experiment dict. TODO: confirm the
    intended call and update it.

    NOTE(review): the ``round`` parameter shadows the builtin of the same name, and the
    ``budget`` parameter is overwritten inside the loop.
    """

    full_dataset, test_dataset, init_train_idx, model, nclasses = get_experiment_fixture(dataset_root_directory, dataset_name, seed_set_size, model_name, model_directory, args)
    # NOTE(review): argument list does not match get_experiment_results' signature (see docstring).
    list_of_exps = get_experiment_results(exp_save_directory, dataset_root_directory, dataset_name, model_name, model_directory, selection_mode, seed_set_size, budget, alpha, beta, train_cap, per_exp_runs, args)

    list_of_lists = []
    point_cutoffs = []
    for exp in list_of_exps:
        # Despite the name, `selection_matrix` is iterated below as one row per class,
        # each row being a list of (selected index, gain) pairs.
        if human_corrected:
            selection_matrix = exp['human_corrected_selected_idx'][round]
        else:
            selection_matrix = exp['auto_assigned_selected_idx'][round]

        correctness_gain_list = []
        for sel_class, selection_row in enumerate(selection_matrix):
            if len(selection_row) == 0:
                continue

            max_gain = max(selection_row, key=lambda x:x[1])[1]
            min_gain = min(selection_row, key=lambda x:x[1])[1]

            for sel_idx, gain in selection_row:
                _, true_label = full_dataset[sel_idx]
                # Min-max normalization within the class row.
                # NOTE(review): divides by zero when all gains in the row are equal
                # (including a row with a single point) -- confirm whether that can occur.
                normalized_gain = (gain - min_gain) / (max_gain - min_gain)
                correctness_gain_list.append((sel_class == true_label, normalized_gain))

        # Ascending by normalized gain; the early `break` further down relies on this order.
        correctness_gain_list = sorted(correctness_gain_list, key=lambda x:x[1])
        budget = len(correctness_gain_list)
        num_budget_slices = int(1 / budget_granularity)
        point_cutoffs = [int(i * budget / num_budget_slices) for i in range(num_budget_slices + 1)]

        # NOTE(review): `total_computed_gain` is never read.
        total_computed_gain = 0.
        incorrectly_labeled_total_list = []
        for point_cutoff in point_cutoffs:
            # Translate the position cutoff into a gain threshold; -1 lies below every
            # normalized gain, so a cutoff of 0 points counts nothing.
            if point_cutoff == 0:
                gain_cutoff = -1
            else:
                gain_cutoff = correctness_gain_list[point_cutoff - 1][1]

            total_incorrect = 0

            # Count incorrect points whose gain is at or below the threshold (ties with the
            # threshold are included, so more than `point_cutoff` points may be scanned).
            for correct, norm_gain in correctness_gain_list:
                if norm_gain > gain_cutoff:
                    break
                if not correct:
                    total_incorrect += 1
            incorrectly_labeled_total_list.append(total_incorrect)
        list_of_lists.append(incorrectly_labeled_total_list)

    return point_cutoffs, get_avg_std(list_of_lists)                   

#### Get Labeling Cost over Experiment

In [None]:
def get_avg_labeling_costs(list_of_exps, cost_to_check_label, cost_to_assign_label, use_al_pseudo=False, al_pseudo_dataset=None):
    """
    Average the cumulative per-round labeling cost over several experiment runs.

    For every run a list is built that starts at 0 (the initial round costs
    nothing) and gains one cumulative entry per round. A round costs
    ``correct * cost_to_check_label + incorrect * cost_to_assign_label`` where:

    * the diagonal of the round's human-corrected selection matrix counts as
      correct and every off-diagonal entry counts as incorrect;
    * active-learning selections are compared against ``al_pseudo_dataset``
      when ``use_al_pseudo`` is set and the entries are (index, predicted class)
      pairs; otherwise every active-learning selection counts as incorrect.

    Args:
        list_of_exps: iterable of dicts providing the keys
            'human_corrected_selection_matrices' and 'active_learning_selected_idx'.
        cost_to_check_label: cost charged for each correctly labeled point.
        cost_to_assign_label: cost charged for each incorrectly labeled point.
        use_al_pseudo: whether to use the predicted classes stored with the
            active-learning selections.
        al_pseudo_dataset: indexable returning (item, ground-truth label);
            required when ``use_al_pseudo`` is True.

    Returns:
        The first of the two values returned by get_avg_std() on the per-run lists.

    Raises:
        ValueError: if ``use_al_pseudo`` is True but no dataset was passed.
    """
    if al_pseudo_dataset is None and use_al_pseudo:
        raise ValueError("Need to pass dataset as well")

    per_run_cumulative_costs = []
    for exp in list_of_exps:
        round_matrices = exp['human_corrected_selection_matrices']
        al_selections = exp['active_learning_selected_idx']

        cumulative_costs = [0]
        running_total = 0

        for round_num, matrix in enumerate(round_matrices):
            num_correct = 0
            num_incorrect = 0

            # Diagonal entries are correct labels; the rest of each row is incorrect.
            for class_idx in range(len(matrix)):
                diagonal_count = matrix[class_idx][class_idx]
                num_correct += diagonal_count
                num_incorrect += sum(matrix[class_idx]) - diagonal_count

            round_al_selection = al_selections[round_num]
            if len(round_al_selection) > 0:
                has_usable_pseudo_labels = len(round_al_selection[0]) == 2 and use_al_pseudo
                if not has_usable_pseudo_labels:
                    # Without pseudo-label info every AL point is charged the assignment cost.
                    num_incorrect += len(round_al_selection)
                else:
                    # Compare each predicted class against the ground truth.
                    for selected_index, predicted_class in round_al_selection:
                        _, ground_truth_label = al_pseudo_dataset[selected_index]
                        if ground_truth_label == predicted_class:
                            num_correct += 1
                        else:
                            num_incorrect += 1

            running_total += num_correct * cost_to_check_label + num_incorrect * cost_to_assign_label
            cumulative_costs.append(running_total)

        per_run_cumulative_costs.append(cumulative_costs)

    average_labeling_costs, _ = get_avg_std(per_run_cumulative_costs)
    return average_labeling_costs

#### Determine AL Round Cutoff

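We purposefully run more AL rounds than necessary. The AL curves are then truncated just after the first round whose cumulative labeling cost exceeds the largest final labeling cost of any non-AL configuration.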

In [None]:
def determine_al_round_cutoff(experiment_dictionary, full_train_dataset, c_v, c_a):
    """
    Determine how many AL-only rounds to keep when plotting against non-AL methods.

    Args:
        experiment_dictionary: mapping whose keys are 7-tuples
            (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive)
            and whose values are lists of experiment results. Keys with
            b2 == 0 and b3 == 0 are treated as AL-only configurations.
        full_train_dataset: dataset forwarded to get_avg_labeling_costs() for
            checking the AL pseudo-labels.
        c_v: cost of checking a label (forwarded as cost_to_check_label).
        c_a: cost of assigning a label (forwarded as cost_to_assign_label).

    Returns:
        A pair (cutoff, cutoff) of identical slice end indices. Slicing with
        [:cutoff] keeps every AL round up to and including the first one whose
        average cumulative labeling cost exceeds the largest final labeling
        cost of any non-AL configuration. If no AL round ever exceeds that
        cost, the cutoff is the full number of available entries.

    Raises:
        ValueError: if experiment_dictionary has no AL-only configuration.
    """

    # Get max labeling cost of non-AL method.
    max_labeling_cost = -float('inf')
    for experiment_key in experiment_dictionary:
        (_, _, _, b2, b3, _, _) = experiment_key

        if b2 == 0 and b3 == 0:
            continue

        labeling_costs = get_avg_labeling_costs(experiment_dictionary[experiment_key], c_v, c_a, True, full_train_dataset)
        max_labeling_cost = max(max_labeling_cost, labeling_costs[-1])

    # Find the first AL round whose labeling cost exceeds the max labeling cost.
    fallback_cutoff = None
    for experiment_key in experiment_dictionary:
        (_, _, _, b2, b3, _, _) = experiment_key

        if b2 != 0 or b3 != 0:
            continue

        labeling_costs = get_avg_labeling_costs(experiment_dictionary[experiment_key], c_v, c_a, True, full_train_dataset)

        for cutoff_index in range(len(labeling_costs)):
            if labeling_costs[cutoff_index] > max_labeling_cost:
                return cutoff_index + 1, cutoff_index + 1

        # This AL configuration never exceeds the max cost; remember that all of
        # its rounds can be kept in case no other AL configuration yields a cutoff.
        if fallback_cutoff is None:
            fallback_cutoff = len(labeling_costs)

    if fallback_cutoff is None:
        # Previously this path fell through and returned None, which made callers
        # fail with an opaque unpacking error.
        raise ValueError("No AL-only configuration (b2 == 0 and b3 == 0) found in experiment_dictionary")

    return fallback_cutoff, fallback_cutoff

### Labeling Efficiency

In [None]:
def find_cusp(labeling_cost, acc_list, target_acc):
    """
    Estimate the labeling cost at which target_acc is first reached.

    The first index whose accuracy is >= target_acc is located, and the cost is
    linearly interpolated between that entry and the one before it.

    Returns:
        labeling_cost[0] if the very first accuracy already reaches the target,
        None if no accuracy reaches the target, otherwise the interpolated cost.
    """
    # Position of the first accuracy that meets the target (None when it is never met).
    reached_at = next((idx for idx, acc in enumerate(acc_list) if acc >= target_acc), None)

    if reached_at is None:
        return None

    # Corner case: nothing precedes the first entry, so there is nothing to interpolate.
    if reached_at == 0:
        return labeling_cost[0]

    before = reached_at - 1

    # Fraction of the way from the previous accuracy to the reaching accuracy.
    acc_ratio = (target_acc - acc_list[before]) / (acc_list[reached_at] - acc_list[before])

    return acc_ratio * labeling_cost[reached_at] + (1 - acc_ratio) * labeling_cost[before]

def produce_label_efficiency_data(base_labeling_cost, base_method_accuracies, compared_labeling_cost, compared_method_accuracies, samples_on_x_axis):
    """
    Compute the label efficiency of a compared method relative to a base method.

    A list of target accuracies is generated between the lowest accuracy seen in
    either method and the smaller of the two methods' best accuracies. For each
    target, the labeling cost each method needs to reach it is obtained with
    find_cusp() (linear interpolation); the efficiency is base cost / compared cost.

    Args:
        base_labeling_cost: labeling costs of the base method.
        base_method_accuracies: accuracies of the base method, aligned with its costs.
        compared_labeling_cost: labeling costs of the compared method.
        compared_method_accuracies: accuracies of the compared method, aligned with its costs.
        samples_on_x_axis: number of target accuracies to generate.

    Returns:
        (target_acc_list, labeling_efficiencies), two lists of equal length.
        An efficiency is 1.0 when both required costs are 0, float('inf') when
        only the compared method needs zero cost, and float('nan') when
        find_cusp() reports that a method never reaches the target.
    """

    # To get label efficiency data, we first need to calculate a range of accuracies that are shared across all methods.

    # Calculate the min of the max accuracies
    max_lower_bound = min(max(base_method_accuracies), max(compared_method_accuracies))

    # Calculate the min accuracy obtained
    min_acc = min(min(base_method_accuracies), min(compared_method_accuracies))

    # Calculate range of accuracies to generate
    high_low_acc_diff = max_lower_bound - min_acc

    # Generate a list whose granularity depends on the samples on the x_axis.
    if samples_on_x_axis == 1:
        # A single sample has no spacing; dividing by (samples_on_x_axis - 1) would raise ZeroDivisionError.
        target_acc_list = [min_acc]
    else:
        target_acc_grain = high_low_acc_diff / (samples_on_x_axis - 1) - 0.000000001  # subtract a very, very tiny amount to prevent floating point issues
        target_acc_list = [min_acc + x * (target_acc_grain) for x in range(samples_on_x_axis)]

    # For each such accuracy, calculate the corresponding labeling costs needed to reach that accuracy.
    # Note: find_cusp() uses linear interpolation to find labeling costs.
    labeling_efficiencies = []
    for target_accuracy in target_acc_list:
        base_required_labeling_cost = find_cusp(base_labeling_cost, base_method_accuracies, target_accuracy)
        comp_required_labeling_cost = find_cusp(compared_labeling_cost, compared_method_accuracies, target_accuracy)

        if base_required_labeling_cost is None or comp_required_labeling_cost is None:
            # The target is unreachable for one of the methods, so the ratio is undefined.
            labeling_efficiencies.append(float('nan'))
        elif comp_required_labeling_cost == 0:
            # 0 / 0 is indeterminate and reported as 1; a positive base cost over a zero
            # compared cost is unbounded (this used to raise ZeroDivisionError).
            labeling_efficiencies.append(1. if base_required_labeling_cost == 0 else float('inf'))
        else:
            labeling_efficiencies.append(base_required_labeling_cost / comp_required_labeling_cost)

    return target_acc_list, labeling_efficiencies

## Plot Parameters

**Fixed Parameters**

In [None]:
# Directory handed to the torchvision dataset constructors as their root.
dataset_root_path = "/content/datasets"

# Draw colors passed as matplotlib `color=` values. Two key shapes are used:
#   (al_strategy, uses_suggestions)                            -> AL-only curves
#   (al_strategy, human_correct_strategy, auto_assign_strategy) -> combined-method curves
color_dictionary = {}
color_dictionary[("badge", False)] = (0, 0, 1)
color_dictionary[("badge", True)] = (0, 0.5, 1)
color_dictionary[("entropy", False)] = (0.6, 0.2, 0.2)
color_dictionary[("entropy", True)] = (0.8, 0.4, 0.2)
color_dictionary[("badge", "logdetmi", "highest_confidence")] = (0, 1, 1)
color_dictionary[("entropy", "logdetmi", "highest_confidence")] = (1, 0.6, 0.2)

# Number of target accuracies sampled when producing label-efficiency curves.
label_eff_acc_granularity = 30

acronym = "Clarifier"

**$b_1$,$b_2$, $b_3$, $adaptive$ Configs**

In [None]:
# Each entry is a (b_1, b_2, b_3, is_adaptive) tuple.
# NOTE(review): the plotting cells further down assign their own b_configs, replacing this value.
b_configs = [(60,400,140,False)]

**$c_a,c_v$ Settings**

In [None]:
# (c_a, c_v) pairs: cost of assigning a label vs. cost of verifying a suggested label.
cost_configs = [(assign_cost, 1) for assign_cost in (4, 6, 8)]

## SUGGESTED LABEL VERIFICATION

### Ratio Summary

In [None]:
def avg(my_list):
    """Return the arithmetic mean of my_list (an empty list raises ZeroDivisionError)."""
    total = sum(my_list)
    count = len(my_list)
    return total / count

# Load, per dataset, the two ratio lists returned by get_labeling_experiment_results():
# one of average ratios and one of median ratios.
# NOTE(review): what exactly each ratio compares is defined in get_labeling_experiment_results — confirm there.
dataset_name = "cifar10"
cifar10_average_ratios, cifar10_median_ratios = get_labeling_experiment_results(base_labeling_directory, dataset_name)

dataset_name = "svhn"
svhn_average_ratios, svhn_median_ratios = get_labeling_experiment_results(base_labeling_directory, dataset_name)

dataset_name = "stl10"
stl10_average_ratios, stl10_median_ratios = get_labeling_experiment_results(base_labeling_directory, dataset_name)

dataset_name = "uc_merced"
uc_merced_average_ratios, uc_merced_median_ratios = get_labeling_experiment_results(base_labeling_directory, dataset_name)

# One group of two bars (average, median) per dataset; x holds the group centers.
labels = [r"\textbf{CIFAR-10}", r"\textbf{SVHN}", r"\textbf{STL-10}", r"\textbf{UC Merced}"]
bar_width = 0.35
x = np.arange(len(labels))

figsize = (7.5,7.5)
comparison_fig, ax = plt.subplots(figsize=figsize, dpi=120)

# Collapse each dataset's ratio list to a single number; order matches `labels`.
average_of_averages = [avg(cifar10_average_ratios), avg(svhn_average_ratios), avg(stl10_average_ratios), avg(uc_merced_average_ratios)]
average_of_medians = [avg(cifar10_median_ratios), avg(svhn_median_ratios), avg(stl10_median_ratios), avg(uc_merced_median_ratios)]

# Shift the two bars of each group half a bar width left/right of the group center.
rects1 = ax.bar(x - bar_width/2, average_of_averages, bar_width, label="Average")
rects2 = ax.bar(x + bar_width/2, average_of_medians, bar_width, label="Median")

ax.set_ylabel(r"\textbf{Average of Ratios}", fontsize=axis_label_font_size)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.spines["top"].set_visible(True)
ax.spines["right"].set_visible(True)
ax.legend(fontsize=legend_font_size)

# Write each bar's value (rounded to one decimal) slightly above the bar.
for i in x:
    plt.text(i - bar_width/2, average_of_averages[i] + 0.02, r"\textbf{" + str(round(average_of_averages[i],1)) + r"}", ha = "center", fontsize=axis_label_font_size)
    plt.text(i + bar_width/2, average_of_medians[i] + 0.02, r"\textbf{" + str(round(average_of_medians[i],1)) + r"}", ha = "center", fontsize=axis_label_font_size)

comparison_fig.tight_layout()

### Strawman Gains

In [None]:
# Strawman comparison on CIFAR-10: for each AL strategy, plot test accuracy against labeling cost
# with and without using the AL model's label suggestions (left panel), and the resulting
# label efficiency of using suggestions over not using them (right panel).
nrows = 1
ncols = 2

# c_a: cost of assigning a label; c_v: cost of verifying a suggested label.
c_a = 3
c_v = 1

figsize = (5.5 * ncols, 6.5 * nrows)
# squeeze=False always returns a 2-D grid of Axes, so axes[row][col] indexing works even with one row.
comparison_fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=120, squeeze=False, gridspec_kw = {'top':0.55,'wspace':0.3,'hspace':0.3})

# Note: Figures are not to have titles. The caption serves this purpose.

# Set error bar alpha
shade_alpha = 0.25

# Maps (al_strategy, uses_suggestions) to the plotted line used as its legend handle.
legend_line_dictionary = {}

# =========================================================================================================================================================
#
# ====================================================================== CIFAR-10 =========================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset = "cifar10"
seed_size = 1000
b_configs = [(600,0,0,False)]

full_train_dataset = datasets.CIFAR10(dataset_root_path, download=True, train=True)
test_dataset = datasets.CIFAR10(dataset_root_path, download=True, train=False)
nclasses = 10 # NUM CLASSES HERE

# Set the subplot title, y-axis, x-axis
axes[0][0].set_title(r"\textbf{CIFAR-10}", fontsize=plot_title_font_size)
axes[0][0].set_xlabel(r"\textbf{Labeling Cost (" + F"{c_a}," + F"{c_v})" + r"}", fontsize=axis_label_font_size)
axes[0][0].set_ylabel(r"\textbf{Accuracy}", fontsize=axis_label_font_size)

axes[0][1].set_title(r"\textbf{CIFAR-10}", fontsize=plot_title_font_size)
axes[0][1].set_xlabel(r"\textbf{Test Accuracy}", fontsize=axis_label_font_size)
axes[0][1].set_ylabel(r"\textbf{Label Efficiency}", fontsize=axis_label_font_size)

# Both AL strategies go through the same plotting steps; only the strategy name
# (and therefore the results directory and the colors) differs.
for al_strategy in ("badge", "entropy"):

    # Each strategy needs a color for the no-suggestion curve and for the suggestion curve.
    if (al_strategy,False) not in color_dictionary or (al_strategy,True) not in color_dictionary:
        raise ValueError(F"Add color for {al_strategy}")
    no_suggestion_draw_color = color_dictionary[(al_strategy,False)]
    suggestion_draw_color = color_dictionary[(al_strategy,True)]

    # Get CIFAR-10 results.
    results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
    experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)

    for experiment_key, exp_results_list in experiment_results.items():

        # The test accuracies do not depend on how labeling is charged, so compute them once.
        average_acc, std = get_avg_std_test_acc(exp_results_list)
        lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
        upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

        without_pseudo_labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, False, full_train_dataset)
        with_pseudo_labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)

        # PLOT WITHOUT USING AL SUGGESTIONS (dashed line)
        line = axes[0][0].plot(without_pseudo_labeling_costs, average_acc, color=no_suggestion_draw_color, marker='o', linestyle="--")[0]
        axes[0][0].fill_between(without_pseudo_labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=no_suggestion_draw_color)
        legend_line_dictionary[(al_strategy,False)] = line

        # PLOT USING AL SUGGESTIONS (solid line)
        line = axes[0][0].plot(with_pseudo_labeling_costs, average_acc, color=suggestion_draw_color, marker='o')[0]
        axes[0][0].fill_between(with_pseudo_labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=suggestion_draw_color)
        legend_line_dictionary[(al_strategy,True)] = line

        # Labeling efficiency of using suggestions relative to not using them.
        plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(without_pseudo_labeling_costs, average_acc, with_pseudo_labeling_costs, average_acc, label_eff_acc_granularity)
        axes[0][1].plot(plot_test_accuracies, labeling_efficiencies, color=suggestion_draw_color, marker='o')

# Create the legend by obtaining the list of labels
label_list = []
line_list = []
for key in legend_line_dictionary:
    line_list.append(legend_line_dictionary[key])
    strategy_label = key[0].replace("_", "-").capitalize()
    pseudo_info = "w/ Suggest" if key[1] else "w/o Suggest"
    label_list.append(r"\textsc{" + F"{strategy_label} {pseudo_info}" + r"}")
comparison_fig.legend(line_list, label_list, loc="upper center", ncol=2, borderaxespad=3, fontsize=legend_font_size)

## EXPERIMENTS

### All Accuracies

Layout: Each dataset has 2 plots per setting of $b_1,b_2,b_3$: one for test accuracy, one for labeling efficiency. This gives a total of 6 combinations for a total of 12 plots. We can do a 3 row, 4 col figure for now.

In [None]:
# Subplot grid for the whole figure; the CIFAR-100 panels configured in this section occupy row 0.
nrows, ncols = 4, 4

figsize = (5.5 * ncols, 3.45 * nrows)
comparison_fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=120, gridspec_kw = {'top':0.92,'wspace':0.35,'hspace':0.53})

# Note: Figures are not to have titles. The caption serves this purpose.

# Opacity of the shaded (average - std, average + std) band drawn around each accuracy curve.
shade_alpha = 0.25

# Collects one plotted line per legend entry.
legend_line_dictionary = {}

# Seed random color generator
np.random.seed(42)

# =========================================================================================================================================================
#
# ===================================================================== CIFAR-100 =========================================================================
#
# =========================================================================================================================================================

# c_a: cost of assigning a label; c_v: cost of verifying a suggested label.
c_a = 4
c_v = 1

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset = "cifar100"
seed_size = 10000
b_configs = [(5000,0,0,False),
             (500,2500,2000,False)]

# Get CIFAR-100 dataset.
full_train_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=True)
test_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=False)
nclasses = 100 # NUM CLASSES HERE

# Set the subplot title, y-axis, x-axis.
# Columns 0 and 1 are the accuracy panels (BADGE, entropy); columns 2 and 3 are the
# label-efficiency panels (BADGE, entropy).
cifar100_panel_text = [("Labeling Cost", "Acc."),
                       ("Labeling Cost", "Acc."),
                       ("Test Accuracy", "Eff."),
                       ("Test Accuracy", "Eff.")]
for panel_column, (x_quantity, y_quantity) in enumerate(cifar100_panel_text):
    panel = axes[0][panel_column]
    panel.set_title(r"\textbf{CIFAR-100}", fontsize=plot_title_font_size)
    panel.set_xlabel(r"\textbf{" + x_quantity + " (" + F"{c_a}," + F"{c_v})" + r"}", fontsize=axis_label_font_size)
    panel.set_ylabel(r"\textbf{" + y_quantity + " (" + F"{500}," + F"{2500}," + F"{2000})" + r"}", fontsize=axis_label_font_size)

# ===== BADGE and ENTROPY =====
# Both AL strategies go through identical plotting steps. Each strategy draws its accuracy
# curves on one panel and its label-efficiency curves on another panel of the CIFAR-100 row.
for al_strategy, accuracy_axis, efficiency_axis in (("badge", axes[0][0], axes[0][2]),
                                                     ("entropy", axes[0][1], axes[0][3])):

    # Reset per strategy so a missing AL-only baseline is detected below instead of
    # silently reusing the key found for the previous strategy.
    al_key = None

    # Get CIFAR-100 results.
    results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
    experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
    suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)

    # Get the average labeling costs across experiments along with the test accuracies. Plot them.
    for (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive) in experiment_results:

        experiment_key = (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive)

        if b2 == 0 and b3 == 0:
            # AL-only baseline: drawn twice, once charging labels with suggestions and once without.
            if (al_strategy,False) not in color_dictionary or (al_strategy,True) not in color_dictionary:
                raise ValueError(F"Add color for {al_strategy}")
            no_suggestion_draw_color = color_dictionary[(al_strategy,False)]
            suggestion_draw_color = color_dictionary[(al_strategy,True)]

            exp_results_list = experiment_results[experiment_key]
            print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

            # The test accuracies are the same for both cost models, so compute them once.
            average_acc, std = get_avg_std_test_acc(exp_results_list)
            lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
            upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

            # Draw the suggest AL baseline
            labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
            line = accuracy_axis.plot(labeling_costs[:suggestion_al_round_cutoff], average_acc[:suggestion_al_round_cutoff], color=suggestion_draw_color, marker='o')[0]
            accuracy_axis.fill_between(labeling_costs[:suggestion_al_round_cutoff], lower_list[:suggestion_al_round_cutoff], upper_list[:suggestion_al_round_cutoff], alpha=shade_alpha, color=suggestion_draw_color)

            if (al_strategy, True) not in legend_line_dictionary:
                legend_line_dictionary[(al_strategy, True)] = line

            # Draw the no-suggest AL baseline
            labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, False, full_train_dataset)
            line = accuracy_axis.plot(labeling_costs[:no_suggestion_al_round_cutoff], average_acc[:no_suggestion_al_round_cutoff], color=no_suggestion_draw_color, marker='o')[0]
            accuracy_axis.fill_between(labeling_costs[:no_suggestion_al_round_cutoff], lower_list[:no_suggestion_al_round_cutoff], upper_list[:no_suggestion_al_round_cutoff], alpha=shade_alpha, color=no_suggestion_draw_color)

            if (al_strategy, False) not in legend_line_dictionary:
                legend_line_dictionary[(al_strategy, False)] = line

            # Remember the baseline; the label-efficiency curves below are computed relative to it.
            al_key = experiment_key
        else:
            # Combined method: a single curve, charged using suggestions, drawn over all rounds.
            if (al_strategy, human_correct_strategy, auto_assign_strategy) not in color_dictionary:
                raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
            draw_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

            exp_results_list = experiment_results[experiment_key]
            print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")
            labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
            average_acc, std = get_avg_std_test_acc(exp_results_list)
            lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
            upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

            line = accuracy_axis.plot(labeling_costs, average_acc, color=draw_color, marker='o')[0]
            accuracy_axis.fill_between(labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

            if (al_strategy, human_correct_strategy, auto_assign_strategy) not in legend_line_dictionary:
                legend_line_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)] = line

    # Do the same for the labeling efficiency. The reference is the AL-only baseline without suggestions.
    if al_key is None:
        raise ValueError(F"No AL-only configuration (b2 == 0 and b3 == 0) found for {al_strategy}")

    al_exp_results_list = experiment_results[al_key]
    al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
    al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

    for (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive) in experiment_results:

        exp_results_list = experiment_results[(human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive)]
        labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
        average_acc, _ = get_avg_std_test_acc(exp_results_list)

        if b2 == 0 and b3 == 0:
            # If this is the AL key, plot the gain with suggestion over no suggestion,
            # restricted to the rounds kept by the AL cutoff.
            suggestion_draw_color = color_dictionary[(al_strategy, True)]

            plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(al_labeling_costs[:suggestion_al_round_cutoff], al_average_acc[:suggestion_al_round_cutoff], labeling_costs[:suggestion_al_round_cutoff], average_acc[:suggestion_al_round_cutoff], label_eff_acc_granularity)

            efficiency_axis.plot(plot_test_accuracies, labeling_efficiencies, color=suggestion_draw_color, marker='o')
        else:
            # Combined method: gain over the no-suggestion AL baseline, using all rounds.
            draw_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

            plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(al_labeling_costs, al_average_acc, labeling_costs, average_acc, label_eff_acc_granularity)

            efficiency_axis.plot(plot_test_accuracies, labeling_efficiencies, color=draw_color, marker='o')

# =========================================================================================================================================================
#
# ======================================================================= BIRDS ===========================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset = "birds"
seed_size = 600
b_configs = [(600,0,0,False),
             (60,400,140,False)]
c_a = 4
c_v = 1

# Get CUB dataset.
full_train_dataset = BirdsDataset(dataset_root_path, download=True, train=True)
test_dataset = BirdsDataset(dataset_root_path, download=True, train=False)
nclasses = 200

# Set the subplot title, y-axis, x-axis for the four panels of row 1.
# The (b1, b2, b3) shown on the y-axis is read from the first configuration in b_configs
# with a non-zero b2 or b3, so the labels cannot drift from the configuration being plotted.
shown_b1, shown_b2, shown_b3 = next(cfg for cfg in b_configs if cfg[1] != 0 or cfg[2] != 0)[:3]
panel_title = r"\textbf{CUB-200-2011}"
cost_suffix = F"{c_a},{c_v})" + r"}"
budget_suffix = F"{shown_b1},{shown_b2},{shown_b3})" + r"}"

# Columns 0-1 plot accuracy against labeling cost; columns 2-3 plot efficiency against test accuracy.
panel_label_prefixes = [(r"\textbf{Labeling Cost (", r"\textbf{Acc. (")] * 2 + [(r"\textbf{Test Accuracy (", r"\textbf{Eff. (")] * 2
for panel_col, (xlabel_prefix, ylabel_prefix) in enumerate(panel_label_prefixes):
    panel = axes[1][panel_col]
    panel.set_title(panel_title, fontsize=plot_title_font_size)
    panel.set_xlabel(xlabel_prefix + cost_suffix, fontsize=axis_label_font_size)
    panel.set_ylabel(ylabel_prefix + budget_suffix, fontsize=axis_label_font_size)

# ===== BADGE =====
al_strategy = "badge"

# Get the CUB-200-2011 ("birds") results for this AL strategy and seed size.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
# The two cutoffs are used below as slice ends that truncate the AL baseline curves.
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)
# For BADGE, both cutoffs are pulled in by one further round.
suggestion_al_round_cutoff -= 1
no_suggestion_al_round_cutoff -= 1

# Accuracy-vs-labeling-cost panel for this strategy.
accuracy_ax = axes[1][0]

# Get the average labeling costs across experiments along with the test accuracies. Plot them.
for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # Every curve needs a pre-registered color; fail before drawing if one is missing.
    # Each entry of `curves` is (color/legend key, cost computed with suggestions?, slice end).
    if is_al_baseline:
        if (al_strategy, False) not in color_dictionary or (al_strategy, True) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}")
        # The AL baseline is drawn twice: costed with suggestions, then without.
        curves = [((al_strategy, True), True, suggestion_al_round_cutoff),
                  ((al_strategy, False), False, no_suggestion_al_round_cutoff)]
        # Remembered for the labeling-efficiency section that follows.
        al_key = experiment_key
    else:
        if (al_strategy, human_correct_strategy, auto_assign_strategy) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
        # Non-baseline configurations are drawn in full (slice end None).
        curves = [((al_strategy, human_correct_strategy, auto_assign_strategy), True, None)]

    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

    # Accuracy statistics do not depend on how the cost is computed, so compute them once per configuration.
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(mean - spread) for (mean, spread) in zip(average_acc, std)]
    upper_list = [(mean + spread) for (mean, spread) in zip(average_acc, std)]

    for curve_key, with_suggestion, cutoff in curves:
        curve_color = color_dictionary[curve_key]
        labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)

        line = accuracy_ax.plot(labeling_costs[:cutoff], average_acc[:cutoff], color=curve_color, marker='o')[0]
        accuracy_ax.fill_between(labeling_costs[:cutoff], lower_list[:cutoff], upper_list[:cutoff], alpha=shade_alpha, color=curve_color)

        # Only the first line drawn for a given key is kept for the legend.
        if curve_key not in legend_line_dictionary:
            legend_line_dictionary[curve_key] = line


# Labeling-efficiency panel: each configuration is compared against the AL baseline,
# whose reference costs are computed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[1][2]

for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # The AL baseline is drawn in its "with suggestion" color; every other
    # configuration uses the color registered for its strategy combination.
    if is_al_baseline:
        curve_color = color_dictionary[(al_strategy, True)]
    else:
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if is_al_baseline:
        # Gain of suggestion over no suggestion: both curves are truncated at the suggestion cutoff.
        # NOTE(review): the no-suggestion reference is also cut at the *suggestion* cutoff - confirm intended.
        rounds_kept = slice(suggestion_al_round_cutoff)
        efficiency_inputs = (al_labeling_costs[rounds_kept], al_average_acc[rounds_kept], labeling_costs[rounds_kept], average_acc[rounds_kept])
    else:
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# ===== ENTROPY =====
al_strategy = "entropy"
# Drop the previous strategy's baseline key: if no AL baseline is found in the loop below,
# the later experiment_results[al_key] lookup raises NameError instead of reusing a stale key.
del al_key

# Get the CUB-200-2011 ("birds") results for this AL strategy and seed size.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
# The two cutoffs are used below as slice ends that truncate the AL baseline curves.
# NOTE(review): unlike the BADGE section, the cutoffs are not decremented here - confirm intended.
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)

# Accuracy-vs-labeling-cost panel for this strategy.
accuracy_ax = axes[1][1]

# Get the average labeling costs across experiments along with the test accuracies. Plot them.
for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # Every curve needs a pre-registered color; fail before drawing if one is missing.
    # Each entry of `curves` is (color/legend key, cost computed with suggestions?, slice end).
    if is_al_baseline:
        if (al_strategy, False) not in color_dictionary or (al_strategy, True) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}")
        # The AL baseline is drawn twice: costed with suggestions, then without.
        curves = [((al_strategy, True), True, suggestion_al_round_cutoff),
                  ((al_strategy, False), False, no_suggestion_al_round_cutoff)]
        # Remembered for the labeling-efficiency section that follows.
        al_key = experiment_key
    else:
        if (al_strategy, human_correct_strategy, auto_assign_strategy) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
        # Non-baseline configurations are drawn in full (slice end None).
        curves = [((al_strategy, human_correct_strategy, auto_assign_strategy), True, None)]

    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

    # Accuracy statistics do not depend on how the cost is computed, so compute them once per configuration.
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(mean - spread) for (mean, spread) in zip(average_acc, std)]
    upper_list = [(mean + spread) for (mean, spread) in zip(average_acc, std)]

    for curve_key, with_suggestion, cutoff in curves:
        curve_color = color_dictionary[curve_key]
        labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)

        line = accuracy_ax.plot(labeling_costs[:cutoff], average_acc[:cutoff], color=curve_color, marker='o')[0]
        accuracy_ax.fill_between(labeling_costs[:cutoff], lower_list[:cutoff], upper_list[:cutoff], alpha=shade_alpha, color=curve_color)

        # Only the first line drawn for a given key is kept for the legend.
        if curve_key not in legend_line_dictionary:
            legend_line_dictionary[curve_key] = line


# Labeling-efficiency panel: each configuration is compared against the AL baseline,
# whose reference costs are computed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[1][3]

for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # The AL baseline is drawn in its "with suggestion" color; every other
    # configuration uses the color registered for its strategy combination.
    if is_al_baseline:
        curve_color = color_dictionary[(al_strategy, True)]
    else:
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if is_al_baseline:
        # Gain of suggestion over no suggestion: both curves are truncated at the suggestion cutoff.
        # NOTE(review): the no-suggestion reference is also cut at the *suggestion* cutoff - confirm intended.
        rounds_kept = slice(suggestion_al_round_cutoff)
        efficiency_inputs = (al_labeling_costs[rounds_kept], al_average_acc[rounds_kept], labeling_costs[rounds_kept], average_acc[rounds_kept])
    else:
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# =========================================================================================================================================================
#
# ==================================================================== BIRDS PART 2 =======================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset = "birds"
seed_size = 600
b_configs = [(600,0,0,False),
             (60,300,240,False)]

c_a = 4
c_v = 1

# Get CUB dataset.
full_train_dataset = BirdsDataset(dataset_root_path, download=True, train=True)
test_dataset = BirdsDataset(dataset_root_path, download=True, train=False)
nclasses = 200

# Set the subplot title, y-axis, x-axis for the four panels of row 2.
# The (b1, b2, b3) shown on the y-axis is read from the first configuration in b_configs
# with a non-zero b2 or b3, so the labels cannot drift from the configuration being plotted.
shown_b1, shown_b2, shown_b3 = next(cfg for cfg in b_configs if cfg[1] != 0 or cfg[2] != 0)[:3]
panel_title = r"\textbf{CUB-200-2011}"
cost_suffix = F"{c_a},{c_v})" + r"}"
budget_suffix = F"{shown_b1},{shown_b2},{shown_b3})" + r"}"

# Columns 0-1 plot accuracy against labeling cost; columns 2-3 plot efficiency against test accuracy.
panel_label_prefixes = [(r"\textbf{Labeling Cost (", r"\textbf{Acc. (")] * 2 + [(r"\textbf{Test Accuracy (", r"\textbf{Eff. (")] * 2
for panel_col, (xlabel_prefix, ylabel_prefix) in enumerate(panel_label_prefixes):
    panel = axes[2][panel_col]
    panel.set_title(panel_title, fontsize=plot_title_font_size)
    panel.set_xlabel(xlabel_prefix + cost_suffix, fontsize=axis_label_font_size)
    panel.set_ylabel(ylabel_prefix + budget_suffix, fontsize=axis_label_font_size)

# ===== BADGE =====
al_strategy = "badge"

# Get the CUB-200-2011 ("birds") results for this AL strategy and seed size.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
# The two cutoffs are used below as slice ends that truncate the AL baseline curves.
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)
# For BADGE, both cutoffs are pulled in by one further round.
suggestion_al_round_cutoff -= 1
no_suggestion_al_round_cutoff -= 1

# Accuracy-vs-labeling-cost panel for this strategy.
accuracy_ax = axes[2][0]

# Get the average labeling costs across experiments along with the test accuracies. Plot them.
for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # Every curve needs a pre-registered color; fail before drawing if one is missing.
    # Each entry of `curves` is (color/legend key, cost computed with suggestions?, slice end).
    if is_al_baseline:
        if (al_strategy, False) not in color_dictionary or (al_strategy, True) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}")
        # The AL baseline is drawn twice: costed with suggestions, then without.
        curves = [((al_strategy, True), True, suggestion_al_round_cutoff),
                  ((al_strategy, False), False, no_suggestion_al_round_cutoff)]
        # Remembered for the labeling-efficiency section that follows.
        al_key = experiment_key
    else:
        if (al_strategy, human_correct_strategy, auto_assign_strategy) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
        # Non-baseline configurations are drawn in full (slice end None).
        curves = [((al_strategy, human_correct_strategy, auto_assign_strategy), True, None)]

    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

    # Accuracy statistics do not depend on how the cost is computed, so compute them once per configuration.
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(mean - spread) for (mean, spread) in zip(average_acc, std)]
    upper_list = [(mean + spread) for (mean, spread) in zip(average_acc, std)]

    for curve_key, with_suggestion, cutoff in curves:
        curve_color = color_dictionary[curve_key]
        labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)

        line = accuracy_ax.plot(labeling_costs[:cutoff], average_acc[:cutoff], color=curve_color, marker='o')[0]
        accuracy_ax.fill_between(labeling_costs[:cutoff], lower_list[:cutoff], upper_list[:cutoff], alpha=shade_alpha, color=curve_color)

        # Only the first line drawn for a given key is kept for the legend.
        if curve_key not in legend_line_dictionary:
            legend_line_dictionary[curve_key] = line


# Labeling-efficiency panel: each configuration is compared against the AL baseline,
# whose reference costs are computed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[2][2]

for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # The AL baseline is drawn in its "with suggestion" color; every other
    # configuration uses the color registered for its strategy combination.
    if is_al_baseline:
        curve_color = color_dictionary[(al_strategy, True)]
    else:
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if is_al_baseline:
        # Gain of suggestion over no suggestion: both curves are truncated at the suggestion cutoff.
        # NOTE(review): the no-suggestion reference is also cut at the *suggestion* cutoff - confirm intended.
        rounds_kept = slice(suggestion_al_round_cutoff)
        efficiency_inputs = (al_labeling_costs[rounds_kept], al_average_acc[rounds_kept], labeling_costs[rounds_kept], average_acc[rounds_kept])
    else:
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# ===== ENTROPY =====
al_strategy = "entropy"
# Drop the previous strategy's baseline key: if no AL baseline is found in the loop below,
# the later experiment_results[al_key] lookup raises NameError instead of reusing a stale key.
del al_key

# Get the CUB-200-2011 ("birds") results for this AL strategy and seed size.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
# The two cutoffs are used below as slice ends that truncate the AL baseline curves.
# NOTE(review): unlike the BADGE section, the cutoffs are not decremented here - confirm intended.
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)

# Accuracy-vs-labeling-cost panel for this strategy.
accuracy_ax = axes[2][1]

# Get the average labeling costs across experiments along with the test accuracies. Plot them.
for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # Every curve needs a pre-registered color; fail before drawing if one is missing.
    # Each entry of `curves` is (color/legend key, cost computed with suggestions?, slice end).
    if is_al_baseline:
        if (al_strategy, False) not in color_dictionary or (al_strategy, True) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}")
        # The AL baseline is drawn twice: costed with suggestions, then without.
        curves = [((al_strategy, True), True, suggestion_al_round_cutoff),
                  ((al_strategy, False), False, no_suggestion_al_round_cutoff)]
        # Remembered for the labeling-efficiency section that follows.
        al_key = experiment_key
    else:
        if (al_strategy, human_correct_strategy, auto_assign_strategy) not in color_dictionary:
            raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
        # Non-baseline configurations are drawn in full (slice end None).
        curves = [((al_strategy, human_correct_strategy, auto_assign_strategy), True, None)]

    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

    # Accuracy statistics do not depend on how the cost is computed, so compute them once per configuration.
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(mean - spread) for (mean, spread) in zip(average_acc, std)]
    upper_list = [(mean + spread) for (mean, spread) in zip(average_acc, std)]

    for curve_key, with_suggestion, cutoff in curves:
        curve_color = color_dictionary[curve_key]
        labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)

        line = accuracy_ax.plot(labeling_costs[:cutoff], average_acc[:cutoff], color=curve_color, marker='o')[0]
        accuracy_ax.fill_between(labeling_costs[:cutoff], lower_list[:cutoff], upper_list[:cutoff], alpha=shade_alpha, color=curve_color)

        # Only the first line drawn for a given key is kept for the legend.
        if curve_key not in legend_line_dictionary:
            legend_line_dictionary[curve_key] = line


# Labeling-efficiency panel: each configuration is compared against the AL baseline,
# whose reference costs are computed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[2][3]

for experiment_key, exp_results_list in experiment_results.items():
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = experiment_key
    is_al_baseline = (b2 == 0 and b3 == 0)

    # The AL baseline is drawn in its "with suggestion" color; every other
    # configuration uses the color registered for its strategy combination.
    if is_al_baseline:
        curve_color = color_dictionary[(al_strategy, True)]
    else:
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]

    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if is_al_baseline:
        # Gain of suggestion over no suggestion: both curves are truncated at the suggestion cutoff.
        # NOTE(review): the no-suggestion reference is also cut at the *suggestion* cutoff - confirm intended.
        rounds_kept = slice(suggestion_al_round_cutoff)
        efficiency_inputs = (al_labeling_costs[rounds_kept], al_average_acc[rounds_kept], labeling_costs[rounds_kept], average_acc[rounds_kept])
    else:
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# =========================================================================================================================================================
#
# ======================================================================== DOGS ===========================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset, seed_size = "dogs", 500
b_configs = [
    (1000, 0, 0, False),
    (100, 600, 300, False),
]

# Cost parameters: shown in the axis labels and passed to the labeling-cost helpers.
c_a, c_v = 4, 1

# Stanford Dogs train/test splits.
full_train_dataset = DogsDataset(dataset_root_path, download=True, train=True)
test_dataset = DogsDataset(dataset_root_path, download=True, train=False)
nclasses = 120

# Row 3 of the figure: columns 0-1 are accuracy-vs-cost panels, columns 2-3
# are efficiency-vs-accuracy panels. All four share the same title.
dogs_title = r"\textbf{Stanford Dogs}"
cost_axis_text = r"\textbf{Labeling Cost (" + F"{c_a}," + F"{c_v})" + r"}"
test_acc_axis_text = r"\textbf{Test Accuracy (" + F"{c_a}," + F"{c_v})" + r"}"
acc_axis_text = r"\textbf{Acc. (" + F"{100}," + F"{600}," + F"{300})" + r"}"
eff_axis_text = r"\textbf{Eff. (" + F"{100}," + F"{600}," + F"{300})" + r"}"

dogs_panel_labels = (
    (cost_axis_text, acc_axis_text),
    (cost_axis_text, acc_axis_text),
    (test_acc_axis_text, eff_axis_text),
    (test_acc_axis_text, eff_axis_text),
)
for column, (x_text, y_text) in enumerate(dogs_panel_labels):
    panel = axes[3][column]
    panel.set_title(dogs_title, fontsize=plot_title_font_size)
    panel.set_xlabel(x_text, fontsize=axis_label_font_size)
    panel.set_ylabel(y_text, fontsize=axis_label_font_size)

# ===== BADGE =====
al_strategy = "badge"

# Load the runs for the current dataset / AL strategy and determine where the
# AL-only baseline curves are truncated.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)

cost_acc_ax = axes[3][0]

# One accuracy-vs-labeling-cost curve (mean with a +/- std band) per configuration.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key

    if b2 == 0 and b3 == 0:
        # AL-only configuration: both baseline colors must already be registered.
        if not ((al_strategy, False) in color_dictionary and (al_strategy, True) in color_dictionary):
            raise ValueError(F"Add color for {al_strategy}")
        no_suggestion_draw_color = color_dictionary[(al_strategy,False)]
        suggestion_draw_color = color_dictionary[(al_strategy,True)]

        exp_results_list = experiment_results[exp_key]
        print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

        # The same runs are drawn twice: first costed with suggestions, then
        # without, each truncated at its own round cutoff.
        baseline_variants = (
            (True, suggestion_al_round_cutoff, suggestion_draw_color),
            (False, no_suggestion_al_round_cutoff, no_suggestion_draw_color),
        )
        for with_suggestion, round_cutoff, baseline_color in baseline_variants:
            labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)
            average_acc, std = get_avg_std_test_acc(exp_results_list)
            lower_list = [mean - spread for mean, spread in zip(average_acc, std)]
            upper_list = [mean + spread for mean, spread in zip(average_acc, std)]

            line = cost_acc_ax.plot(labeling_costs[:round_cutoff], average_acc[:round_cutoff], color=baseline_color, marker='o')[0]
            cost_acc_ax.fill_between(labeling_costs[:round_cutoff], lower_list[:round_cutoff], upper_list[:round_cutoff], alpha=shade_alpha, color=baseline_color)

            # Only the first line drawn for a key becomes its legend handle.
            legend_line_dictionary.setdefault((al_strategy, with_suggestion), line)

        # Remember the AL-only key; the efficiency panel uses it as its reference.
        al_key = exp_key
        continue

    # Any other configuration is keyed by its strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    if combo_key not in color_dictionary:
        raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
    draw_color = color_dictionary[combo_key]

    exp_results_list = experiment_results[exp_key]
    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [mean - spread for mean, spread in zip(average_acc, std)]
    upper_list = [mean + spread for mean, spread in zip(average_acc, std)]

    line = cost_acc_ax.plot(labeling_costs, average_acc, color=draw_color, marker='o')[0]
    cost_acc_ax.fill_between(labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    legend_line_dictionary.setdefault(combo_key, line)


# Labeling-efficiency panel. The reference curve is the AL-only run recorded
# in al_key, costed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[3][2]

for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key

    # Every configuration is costed with the suggestion flag set to True
    # before it is compared against the reference curve.
    exp_results_list = experiment_results[exp_key]
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if b2 == 0 and b3 == 0:
        # AL-only configuration: gain of suggestion over no suggestion,
        # with all four series truncated at the suggestion cutoff.
        curve_color = color_dictionary[(al_strategy, True)]
        round_cutoff = suggestion_al_round_cutoff
        efficiency_inputs = (
            al_labeling_costs[:round_cutoff],
            al_average_acc[:round_cutoff],
            labeling_costs[:round_cutoff],
            average_acc[:round_cutoff],
        )
    else:
        # Any other configuration is compared over its full length.
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# ===== ENTROPY =====
al_strategy = "entropy"
# Drop the previous strategy's AL-only key so a stale value cannot be reused below.
del al_key

# Load the runs for the current dataset / AL strategy and determine where the
# AL-only baseline curves are truncated.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
suggestion_al_round_cutoff, no_suggestion_al_round_cutoff = determine_al_round_cutoff(experiment_results, full_train_dataset, c_v, c_a)

cost_acc_ax = axes[3][1]

# One accuracy-vs-labeling-cost curve (mean with a +/- std band) per configuration.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key

    if b2 == 0 and b3 == 0:
        # AL-only configuration: both baseline colors must already be registered.
        if not ((al_strategy, False) in color_dictionary and (al_strategy, True) in color_dictionary):
            raise ValueError(F"Add color for {al_strategy}")
        no_suggestion_draw_color = color_dictionary[(al_strategy,False)]
        suggestion_draw_color = color_dictionary[(al_strategy,True)]

        exp_results_list = experiment_results[exp_key]
        print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")

        # The same runs are drawn twice: first costed with suggestions, then
        # without, each truncated at its own round cutoff.
        baseline_variants = (
            (True, suggestion_al_round_cutoff, suggestion_draw_color),
            (False, no_suggestion_al_round_cutoff, no_suggestion_draw_color),
        )
        for with_suggestion, round_cutoff, baseline_color in baseline_variants:
            labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, with_suggestion, full_train_dataset)
            average_acc, std = get_avg_std_test_acc(exp_results_list)
            lower_list = [mean - spread for mean, spread in zip(average_acc, std)]
            upper_list = [mean + spread for mean, spread in zip(average_acc, std)]

            line = cost_acc_ax.plot(labeling_costs[:round_cutoff], average_acc[:round_cutoff], color=baseline_color, marker='o')[0]
            cost_acc_ax.fill_between(labeling_costs[:round_cutoff], lower_list[:round_cutoff], upper_list[:round_cutoff], alpha=shade_alpha, color=baseline_color)

            # Only the first line drawn for a key becomes its legend handle.
            legend_line_dictionary.setdefault((al_strategy, with_suggestion), line)

        # Remember the AL-only key; the efficiency panel uses it as its reference.
        al_key = exp_key
        continue

    # Any other configuration is keyed by its strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    if combo_key not in color_dictionary:
        raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
    draw_color = color_dictionary[combo_key]

    exp_results_list = experiment_results[exp_key]
    print(F"RUN COUNT {dataset} {al_strategy} {human_correct_strategy} {auto_assign_strategy} {b1} {b2} {b3} {seed_size} {rounds} {is_adaptive}: {len(exp_results_list)}")
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [mean - spread for mean, spread in zip(average_acc, std)]
    upper_list = [mean + spread for mean, spread in zip(average_acc, std)]

    line = cost_acc_ax.plot(labeling_costs, average_acc, color=draw_color, marker='o')[0]
    cost_acc_ax.fill_between(labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    legend_line_dictionary.setdefault(combo_key, line)


# Labeling-efficiency panel. The reference curve is the AL-only run recorded
# in al_key, costed with the suggestion flag set to False.
al_exp_results_list = experiment_results[al_key]
al_labeling_costs = get_avg_labeling_costs(al_exp_results_list, c_v, c_a, False, full_train_dataset)
al_average_acc, _ = get_avg_std_test_acc(al_exp_results_list)

efficiency_ax = axes[3][3]

for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key

    # Every configuration is costed with the suggestion flag set to True
    # before it is compared against the reference curve.
    exp_results_list = experiment_results[exp_key]
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, True, full_train_dataset)
    average_acc, _ = get_avg_std_test_acc(exp_results_list)

    if b2 == 0 and b3 == 0:
        # AL-only configuration: gain of suggestion over no suggestion,
        # with all four series truncated at the suggestion cutoff.
        curve_color = color_dictionary[(al_strategy, True)]
        round_cutoff = suggestion_al_round_cutoff
        efficiency_inputs = (
            al_labeling_costs[:round_cutoff],
            al_average_acc[:round_cutoff],
            labeling_costs[:round_cutoff],
            average_acc[:round_cutoff],
        )
    else:
        # Any other configuration is compared over its full length.
        curve_color = color_dictionary[(al_strategy, human_correct_strategy, auto_assign_strategy)]
        efficiency_inputs = (al_labeling_costs, al_average_acc, labeling_costs, average_acc)

    plot_test_accuracies, labeling_efficiencies = produce_label_efficiency_data(*efficiency_inputs, label_eff_acc_granularity)
    efficiency_ax.plot(plot_test_accuracies, labeling_efficiencies, color=curve_color, marker='o')

# =========================================================================================================================================================
#
# =================================================================== FINISH PLOT =========================================================================
#
# =========================================================================================================================================================

# Build the legend from the line handles recorded while plotting.
# Two-element keys are (al_strategy, uses_suggestion) baselines; every other
# key is treated as (al_strategy, human_correct_strategy, auto_assign_strategy).
line_list = list(legend_line_dictionary.values())
label_list = []
for key in legend_line_dictionary:
    if len(key) != 2:
        al_strategy = key[0].replace("_", "-").capitalize()
        human_correct_strategy = key[1].replace("_", "-").capitalize()
        auto_assign_strategy = key[2].replace("_", "-").capitalize()
        # NOTE(review): `acronym` is not defined in this part of the notebook;
        # presumably a global set in an earlier cell -- confirm.
        label_list.append(r"\textsc{" + F"{al_strategy} w/ {acronym}" + r"}")
        continue

    suggest_suffix = r" w/ Suggest}" if key[1] else r" w/o Suggest}"
    label_list.append(r"\textsc{" + F"{key[0].capitalize()}" + suggest_suffix)

# Hand-picked legend order.
# NOTE(review): indexes positions 0-5, so it assumes exactly six legend
# entries in a specific insertion order -- confirm whenever configs change.
permutation = [1,5,0,4,2,3]
ordered_labels, ordered_lines = [], []
for position in permutation:
    ordered_labels.append(label_list[position])
    ordered_lines.append(line_list[position])
label_list, line_list = ordered_labels, ordered_lines

comparison_fig.legend(line_list, label_list, loc="upper center", ncol=3, borderaxespad=0, fontsize=legend_font_size)

### Suggested Label Accuracy

Currently, a 2-row, 3-column figure showcasing the suggestion accuracy of each component (AL, SMI, and auto-labeling) on Stanford Dogs (top row) and CUB-200-2011 (bottom row).

In [None]:
# Figure grid: one row per dataset, one column per accuracy panel.
nrows, ncols = 2, 3

figsize = (5.5 * ncols, 4.5 * nrows)
grid_layout = {'top':0.90,'wspace':0.35,'hspace':0.35}
comparison_fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=120, gridspec_kw=grid_layout)

# Note: Figures are not to have titles. The caption serves this purpose.

# Opacity of the shaded +/- std bands.
shade_alpha = 0.25

# Maps a strategy key to the line used as its legend handle.
legend_line_dictionary = {}

# =========================================================================================================================================================
#
# ====================================================================== DOGS ==========================================================================
#
# =========================================================================================================================================================

# b_configs entries have the form (b_1, b_2, b_3, is_adaptive).
dataset, seed_size = "dogs", 500
b_configs = [(100, 600, 300, False)]

c_a, c_v = 4, 1

# Stanford Dogs train/test splits.
full_train_dataset = DogsDataset(dataset_root_path, download=True, train=True)
test_dataset = DogsDataset(dataset_root_path, download=True, train=False)
nclasses = 120

# Row 0: every panel shares the title and x label; only the y-label prefix
# differs from column to column.
dogs_budget_suffix = F"{100}," + F"{600}," + F"{300})" + r"}"
dogs_ylabel_prefixes = (r"\textbf{AL Acc. (", r"\textbf{SMI Acc. (", r"\textbf{Auto Acc. (")
for column, ylabel_prefix in enumerate(dogs_ylabel_prefixes):
    panel = axes[0][column]
    panel.set_title(r"\textbf{Stanford Dogs}", fontsize=plot_title_font_size)
    panel.set_xlabel(r"\textbf{Round}", fontsize=axis_label_font_size)
    panel.set_ylabel(ylabel_prefix + dogs_budget_suffix, fontsize=axis_label_font_size)

# ===== BADGE =====
al_strategy = "badge"

# Load the runs for the current dataset / AL strategy.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)

# For every configuration, draw the per-round mean (with a +/- std band) of
# three statistics, one per panel of row 0.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key
    exp_results_list = experiment_results[exp_key]

    # Color assigned to this strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    draw_color = color_dictionary[combo_key]

    # x axis: rounds 1..R, with R taken from the first run's 'set_sizes' length minus one.
    round_count = len(exp_results_list[0]['set_sizes']) - 1
    round_numbers = list(range(1, round_count + 1))

    # (panel, statistic) pairs, evaluated left to right.
    panel_sources = (
        (axes[0][0], lambda runs: get_avg_std_correctly_suggested_al_points(runs, full_train_dataset)),
        (axes[0][1], get_avg_std_correctly_suggested_human_corrected_points),
        (axes[0][2], get_avg_std_correctly_auto_labeled_points),
    )
    for panel, compute_accuracy in panel_sources:
        mean_acc, std_acc = compute_accuracy(exp_results_list)
        lower_list = [mean - spread for mean, spread in zip(mean_acc, std_acc)]
        upper_list = [mean + spread for mean, spread in zip(mean_acc, std_acc)]
        line = panel.plot(round_numbers, mean_acc, color=draw_color, marker='o')[0]
        panel.fill_between(round_numbers, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    # `line` now refers to the last panel's curve; it becomes the legend handle.
    legend_line_dictionary.setdefault(combo_key, line)

# ===== ENTROPY =====
al_strategy = "entropy"

# Load the runs for the current dataset / AL strategy.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)

# For every configuration, draw the per-round mean (with a +/- std band) of
# three statistics, one per panel of row 0.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key
    exp_results_list = experiment_results[exp_key]

    # Color assigned to this strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    draw_color = color_dictionary[combo_key]

    # x axis: rounds 1..R, with R taken from the first run's 'set_sizes' length minus one.
    round_count = len(exp_results_list[0]['set_sizes']) - 1
    round_numbers = list(range(1, round_count + 1))

    # (panel, statistic) pairs, evaluated left to right.
    panel_sources = (
        (axes[0][0], lambda runs: get_avg_std_correctly_suggested_al_points(runs, full_train_dataset)),
        (axes[0][1], get_avg_std_correctly_suggested_human_corrected_points),
        (axes[0][2], get_avg_std_correctly_auto_labeled_points),
    )
    for panel, compute_accuracy in panel_sources:
        mean_acc, std_acc = compute_accuracy(exp_results_list)
        lower_list = [mean - spread for mean, spread in zip(mean_acc, std_acc)]
        upper_list = [mean + spread for mean, spread in zip(mean_acc, std_acc)]
        line = panel.plot(round_numbers, mean_acc, color=draw_color, marker='o')[0]
        panel.fill_between(round_numbers, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    # `line` now refers to the last panel's curve; it becomes the legend handle.
    legend_line_dictionary.setdefault(combo_key, line)

# =========================================================================================================================================================
#
# ======================================================================= BIRDS ===========================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset, seed_size = "birds", 600
b_configs = [(60, 400, 140, False)]

# CUB train/test splits.
full_train_dataset = BirdsDataset(dataset_root_path, download=True, train=True)
test_dataset = BirdsDataset(dataset_root_path, download=True, train=False)
nclasses = 200

# Row 1: every panel shares the title and x label; only the y-label prefix
# differs from column to column.
birds_budget_suffix = F"{60}," + F"{400}," + F"{140})" + r"}"
birds_ylabel_prefixes = (r"\textbf{AL Acc. (", r"\textbf{SMI Acc. (", r"\textbf{Auto Acc. (")
for column, ylabel_prefix in enumerate(birds_ylabel_prefixes):
    panel = axes[1][column]
    panel.set_title(r"\textbf{CUB-200-2011}", fontsize=plot_title_font_size)
    panel.set_xlabel(r"\textbf{Round}", fontsize=axis_label_font_size)
    panel.set_ylabel(ylabel_prefix + birds_budget_suffix, fontsize=axis_label_font_size)

# ===== BADGE =====
al_strategy = "badge"

# Load the runs for the current dataset / AL strategy.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)

# For every configuration, draw the per-round mean (with a +/- std band) of
# three statistics, one per panel of row 1.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key
    exp_results_list = experiment_results[exp_key]

    # Color assigned to this strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    draw_color = color_dictionary[combo_key]

    # x axis: rounds 1..R, with R taken from the first run's 'set_sizes' length minus one.
    round_count = len(exp_results_list[0]['set_sizes']) - 1
    round_numbers = list(range(1, round_count + 1))

    # (panel, statistic) pairs, evaluated left to right.
    panel_sources = (
        (axes[1][0], lambda runs: get_avg_std_correctly_suggested_al_points(runs, full_train_dataset)),
        (axes[1][1], get_avg_std_correctly_suggested_human_corrected_points),
        (axes[1][2], get_avg_std_correctly_auto_labeled_points),
    )
    for panel, compute_accuracy in panel_sources:
        mean_acc, std_acc = compute_accuracy(exp_results_list)
        lower_list = [mean - spread for mean, spread in zip(mean_acc, std_acc)]
        upper_list = [mean + spread for mean, spread in zip(mean_acc, std_acc)]
        line = panel.plot(round_numbers, mean_acc, color=draw_color, marker='o')[0]
        panel.fill_between(round_numbers, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    # `line` now refers to the last panel's curve; it becomes the legend handle.
    legend_line_dictionary.setdefault(combo_key, line)

# ===== ENTROPY =====
al_strategy = "entropy"

# Load the runs for the current dataset / AL strategy.
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)

# For every configuration, draw the per-round mean (with a +/- std band) of
# three statistics, one per panel of row 1.
for exp_key in experiment_results:
    human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive = exp_key
    exp_results_list = experiment_results[exp_key]

    # Color assigned to this strategy combination.
    combo_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    draw_color = color_dictionary[combo_key]

    # x axis: rounds 1..R, with R taken from the first run's 'set_sizes' length minus one.
    round_count = len(exp_results_list[0]['set_sizes']) - 1
    round_numbers = list(range(1, round_count + 1))

    # (panel, statistic) pairs, evaluated left to right.
    panel_sources = (
        (axes[1][0], lambda runs: get_avg_std_correctly_suggested_al_points(runs, full_train_dataset)),
        (axes[1][1], get_avg_std_correctly_suggested_human_corrected_points),
        (axes[1][2], get_avg_std_correctly_auto_labeled_points),
    )
    for panel, compute_accuracy in panel_sources:
        mean_acc, std_acc = compute_accuracy(exp_results_list)
        lower_list = [mean - spread for mean, spread in zip(mean_acc, std_acc)]
        upper_list = [mean + spread for mean, spread in zip(mean_acc, std_acc)]
        line = panel.plot(round_numbers, mean_acc, color=draw_color, marker='o')[0]
        panel.fill_between(round_numbers, lower_list, upper_list, alpha=shade_alpha, color=draw_color)

    # `line` now refers to the last panel's curve; it becomes the legend handle.
    legend_line_dictionary.setdefault(combo_key, line)

# Build the legend from the line handles recorded while plotting, keeping
# the order in which the keys were first inserted.
line_list = list(legend_line_dictionary.values())
label_list = []
for key in legend_line_dictionary:
    if type(key) is str:
        # Plain string key: use it directly as the label text.
        label_list.append(r"\textsc{" + F"{key.capitalize()}" + r"}")
        continue

    # Otherwise the key is treated as (al_strategy, human_correct_strategy, auto_assign_strategy).
    al_strategy = key[0].replace("_", "-").capitalize()
    human_correct_strategy = key[1].replace("_", "-").capitalize()
    auto_assign_strategy = key[2].replace("_", "-").capitalize()
    # NOTE(review): `acronym` is not defined in this part of the notebook;
    # presumably a global set in an earlier cell -- confirm.
    label_list.append(r"\textsc{" + F"{al_strategy} w/ {acronym}" + r"}")

comparison_fig.legend(line_list, label_list, loc="upper center", ncol=2, borderaxespad=0, fontsize=legend_font_size)

### Ablation


In [None]:
use_al_pseudo = True

# Figure grid for the ablation study.
nrows, ncols = 2, 4
figsize = (5.5 * ncols, 5.5 * nrows)
grid_layout = {'top':0.92,'wspace':0.3,'hspace':0.3}
comparison_fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, dpi=120, gridspec_kw=grid_layout)

# Note: Figures are not to have titles. The caption serves this purpose.

# Opacity of the shaded +/- std bands.
shade_alpha = 0.25

# Maps a strategy key to the line used as its legend handle.
legend_line_dictionary = {}

# Fix NumPy's global seed so any randomly generated colors are repeatable.
np.random.seed(42)

# =========================================================================================================================================================
#
# ===================================================================== CIFAR-100 =========================================================================
#
# =========================================================================================================================================================

# Note: b_configs is a list of tuples of the form (b_1, b_2, b_3, is_adaptive).
dataset = "cifar100"
seed_size = 5000
b_configs = [(0,3500,500,False),
             (1000,0,500,False),
             (1000,3500,0,False),
             (0,3500,0,False)]
base_b_config = [(1000,3500,500,False)]
c_a = 2
c_v = 1

# Get CIFAR-100 dataset.
full_train_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=True)
test_dataset = datasets.CIFAR100(dataset_root_path, download=True, train=False)
nclasses = 100 # NUM CLASSES HERE
    
column_map = {(0,3500,500): 0,
             (1000,0,500):  1,
             (1000,3500,0): 2,
             (0,3500,0):    3}

# Set the subplot title, y-axis, x-axis for every panel of the first row.
# Each entry pairs the panel's ablation name with the (b_1, b_2, b_3) budgets shown on its y-axis.
row_xlabel = r"\textbf{Labeling Cost (" + F"{c_a},{c_v})" + r"}"
row_panels = [
    ("No AL", (0, 3500, 500)),
    ("No SMI", (1000, 0, 500)),
    ("No Auto", (1000, 3500, 0)),
    ("Only SMI", (0, 3500, 0)),
]
for panel_ax, (panel_name, panel_budgets) in zip(axes[0], row_panels):
    budget_text = ",".join(str(budget) for budget in panel_budgets)
    panel_ax.set_title(r"\textbf{" + panel_name + r" (CIFAR-100)}", fontsize=plot_title_font_size)
    panel_ax.set_xlabel(row_xlabel, fontsize=axis_label_font_size)
    panel_ax.set_ylabel(r"\textbf{ACC (" + budget_text + r")}", fontsize=axis_label_font_size)

al_strategy = "entropy"

# Get CIFAR-100 results: the ablated runs (b_configs) and the full base configuration (base_b_config).
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
base_experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, base_b_config)

# First row: for each ablated configuration, plot its accuracy-vs-labeling-cost curve (solid black)
# and overlay the base configuration's curve (dashed, colored) on the same panel.
for experiment_key, ablated_results_list in experiment_results.items():
    (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive) = experiment_key

    color_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    if color_key not in color_dictionary:
        raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
    draw_color = color_dictionary[color_key]

    # The panel column is determined by the ablated budgets alone.
    col_num = column_map[(b1,b2,b3)]
    panel = axes[0][col_num]

    # Ablated curve with a shaded band of average_acc +/- std.
    labeling_costs = get_avg_labeling_costs(ablated_results_list, c_v, c_a, use_al_pseudo, full_train_dataset)
    average_acc, std = get_avg_std_test_acc(ablated_results_list)
    lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
    upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

    panel.plot(labeling_costs, average_acc, color=(0,0,0), marker='o')
    panel.fill_between(labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=(0,0,0))

    # Draw base method
    # Despite its name, max_base_cost is the final labeling cost of the ablated curve;
    # the base curve is truncated so that it does not extend past it.
    max_base_cost = labeling_costs[-1]
    exp_results_list = base_experiment_results[list(base_experiment_results.keys())[0]]
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, use_al_pseudo, full_train_dataset)

    # Default to showing the whole base curve. Without this default, a base curve that never
    # exceeds max_base_cost would leave max_display unbound (NameError on the first panel)
    # or silently reuse the cut-off computed for the previous panel.
    max_display = len(labeling_costs)
    for i, labeling_cost in enumerate(labeling_costs):
        if labeling_cost > max_base_cost:
            # For the b2 == 0 configuration one additional base point is displayed.
            # NOTE(review): the c_a = 6 row does not apply this extra point - confirm intentional.
            max_display = i + 1 if b2 == 0 else i
            break
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
    upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

    panel.plot(labeling_costs[:max_display], average_acc[:max_display], color=draw_color, marker='o', linestyle="--")
    panel.fill_between(labeling_costs[:max_display], lower_list[:max_display], upper_list[:max_display], alpha=shade_alpha, color=draw_color)

# Labeling-cost parameters used for the second row of panels.
c_a, c_v = 6, 1

# Set the subplot title, y-axis, x-axis for every panel of the second row.
# Each entry pairs the panel's ablation name with the (b_1, b_2, b_3) budgets shown on its y-axis.
row_xlabel = r"\textbf{Labeling Cost (" + F"{c_a},{c_v})" + r"}"
row_panels = [
    ("No AL", (0, 3500, 500)),
    ("No SMI", (1000, 0, 500)),
    ("No Auto", (1000, 3500, 0)),
    ("Only SMI", (0, 3500, 0)),
]
for panel_ax, (panel_name, panel_budgets) in zip(axes[1], row_panels):
    budget_text = ",".join(str(budget) for budget in panel_budgets)
    panel_ax.set_title(r"\textbf{" + panel_name + r" (CIFAR-100)}", fontsize=plot_title_font_size)
    panel_ax.set_xlabel(row_xlabel, fontsize=axis_label_font_size)
    panel_ax.set_ylabel(r"\textbf{ACC (" + budget_text + r")}", fontsize=axis_label_font_size)

al_strategy = "entropy"

# Get CIFAR-100 results: the ablated runs (b_configs) and the full base configuration (base_b_config).
results_directory = os.path.join(base_save_directory, dataset, al_strategy, str(seed_size))
experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, b_configs)
base_experiment_results = get_experiment_results(results_directory, dataset, al_strategy, seed_size, base_b_config)

# Second row: for each ablated configuration, plot its accuracy-vs-labeling-cost curve (solid black)
# and overlay the base configuration's curve (dashed, colored) on the same panel.
# The first line of each kind is also recorded for the figure legend.
for experiment_key, ablated_results_list in experiment_results.items():
    (human_correct_strategy, auto_assign_strategy, b1, b2, b3, rounds, is_adaptive) = experiment_key

    color_key = (al_strategy, human_correct_strategy, auto_assign_strategy)
    if color_key not in color_dictionary:
        raise ValueError(F"Add color for {al_strategy}+{human_correct_strategy}+{auto_assign_strategy}")
    draw_color = color_dictionary[color_key]

    # The panel column is determined by the ablated budgets alone.
    col_num = column_map[(b1,b2,b3)]
    panel = axes[1][col_num]

    # Ablated curve with a shaded band of average_acc +/- std.
    labeling_costs = get_avg_labeling_costs(ablated_results_list, c_v, c_a, use_al_pseudo, full_train_dataset)
    average_acc, std = get_avg_std_test_acc(ablated_results_list)
    lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
    upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

    line = panel.plot(labeling_costs, average_acc, color=(0,0,0), marker='o')[0]
    panel.fill_between(labeling_costs, lower_list, upper_list, alpha=shade_alpha, color=(0,0,0))

    # Keep only the first ablated line per strategy combination for the legend.
    if color_key not in legend_line_dictionary:
        legend_line_dictionary[color_key] = line

    # Draw base method
    # Despite its name, max_base_cost is the final labeling cost of the ablated curve;
    # the base curve is truncated so that it does not extend past it.
    max_base_cost = labeling_costs[-1]
    exp_results_list = base_experiment_results[list(base_experiment_results.keys())[0]]
    labeling_costs = get_avg_labeling_costs(exp_results_list, c_v, c_a, use_al_pseudo, full_train_dataset)

    # Default to showing the whole base curve. Without this default, a base curve that never
    # exceeds max_base_cost would leave max_display unbound (NameError on the first panel)
    # or silently reuse the cut-off computed for the previous panel.
    max_display = len(labeling_costs)
    for i, labeling_cost in enumerate(labeling_costs):
        if labeling_cost > max_base_cost:
            max_display = i
            break
    average_acc, std = get_avg_std_test_acc(exp_results_list)
    lower_list = [(x-y) for (x,y) in zip(average_acc,std)]
    upper_list = [(x+y) for (x,y) in zip(average_acc,std)]

    line = panel.plot(labeling_costs[:max_display], average_acc[:max_display], color=draw_color, marker='o', linestyle="--")[0]
    panel.fill_between(labeling_costs[:max_display], lower_list[:max_display], upper_list[:max_display], alpha=shade_alpha, color=draw_color)

    # Keep only the first base line for the legend.
    if "base" not in legend_line_dictionary:
        legend_line_dictionary["base"] = line

# Create the figure legend: the base method first, followed by the ablated variant.
# The order is built explicitly rather than through a hard-coded index permutation, which
# raised IndexError when fewer than two entries had been registered (e.g. no results were
# plotted) and depended on the insertion order of legend_line_dictionary.
base_label = r"\textsc{Entropy w/ " + F"{acronym}" + r"}"
ablated_label = r"\textsc{Ablated Entropy w/ " + F"{acronym}" + r"}"

label_list = []
line_list = []
if "base" in legend_line_dictionary:
    label_list.append(base_label)
    line_list.append(legend_line_dictionary["base"])
for key, legend_line in legend_line_dictionary.items():
    if key == "base":
        continue
    # Every ablated curve shares one label, so a single legend entry
    # (the first one registered) represents all of them.
    label_list.append(ablated_label)
    line_list.append(legend_line)
    break

comparison_fig.legend(line_list, label_list, loc="upper center", ncol=2, borderaxespad=0, fontsize=legend_font_size)