In [3]:
import glob
import itertools
import os.path as osp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
# Experiment grid. Each value below is used verbatim as a component of the log
# directory path, so the strings must match the on-disk directory names exactly.
all_models = ["resnet18", "resnet34", "xception"]
all_datasets = ["cifar10"]
# NOTE(review): "rmsprobe" looks like a misspelling of "rmsprop"; confirm against
# the actual log directory names before changing it.
all_optim = ["adam", "sgd", "rmsprobe", "sparseadam"]
all_initialization = ["pretrain", "kaiming_normal"]
all_lr_scheduler = ["reduceLR", "none", "cosine_annealingLR"]
# Kept as strings (not floats) because they are interpolated into "np=..." /
# "ns=..." / "lr=..." path segments.
all_noise_injection = ["0.0", "0.03", "0.07", "0.13"]
all_noise_sparsity = ["0.0", "0.2", "0.4", "0.6"]
all_lr_rate = ["0.001", "0.1"]
all_folds = [str(fold_id) for fold_id in range(3)]
all_phase = ["train", "validation"]
# Number of epochs scanned per run; epoch directories are addressed as 000, 001, ...
all_epochs = 5
device = "cuda:0"

In [5]:
def argmax_list(lst):
    """Return the position of the largest element of ``lst``.

    Ties resolve to the earliest position, because ``max`` keeps the first
    maximal candidate it encounters.

    Raises:
        ValueError: If ``lst`` is empty (raised by ``max`` on an empty range).
    """
    positions = range(len(lst))
    return max(positions, key=lambda pos: lst[pos])

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_density(path="img.png", dataframes=[], target_col=None, colors=[], labels=[], title=""):
    """Overlay one filled KDE curve per dataframe on a single axes and save it.

    Args:
        path: Output image file, passed to ``Figure.savefig``.
        dataframes: Sequence of DataFrames; ``df[target_col]`` of each one is plotted.
        target_col: Name of the column whose density is estimated.
        colors: Curve colors, indexed in parallel with ``dataframes``.
        labels: Legend labels, indexed in parallel with ``dataframes``.
        title: Title drawn above the axes.

    Raises:
        IndexError: If ``colors`` or ``labels`` is shorter than ``dataframes``.
    """
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless; they are kept to leave the signature untouched.

    # Apply the theme before the axes exist so they pick up the darkgrid style.
    sns.set_theme(style="darkgrid")
    fig, ax = plt.subplots()  # a fresh figure for every plot
    try:
        for i, df in enumerate(dataframes):
            sns.kdeplot(df[target_col], fill=True, color=colors[i], label=labels[i], ax=ax)
        ax.legend()
        ax.set_title(title)
        fig.savefig(path)
    finally:
        # Always release the figure: this function is called once per run in a
        # large grid loop, and a failed kdeplot/savefig would otherwise leave
        # the figure open and accumulating in memory.
        plt.close(fig)

In [None]:
# For every hyper-parameter combination, gather the per-sample predictions logged
# across all phases, epochs and folds, count how often each sample was predicted
# as the label it was logged with, and save a density plot of that count for
# samples whose logged label equals `true_label` versus those where it differs.

# NOTE(review): absolute, machine-specific path; consider moving it to the config cell.
root = "/home/vision/Repo/cleanset/logs"
iterations_log = None
for dataset_name in all_datasets:
    dataset_info = pd.read_csv(f"dataset/{dataset_name}/info.csv")
    # NOTE(review): the previous version evaluated
    # `dataset_info[dataset_info['phase'] == 'train']` here and discarded the
    # result, so no filtering ever happened. If only training rows are meant to
    # be merged below, assign the filtered frame; it is left unfiltered here to
    # preserve the current output.

    # itertools.product varies the right-most factor fastest, i.e. the same
    # visiting order as the equivalent nested for-loops.
    for (model_name, optim, initialization, lr_scheduler,
         noise_injection, noise_sparsity, lr_rate) in itertools.product(
            all_models, all_optim, all_initialization, all_lr_scheduler,
            all_noise_injection, all_noise_sparsity, all_lr_rate):
        run_dir = osp.join(root, dataset_name, model_name, optim, initialization, lr_scheduler,
                           f"np={noise_injection}", f"ns={noise_sparsity}", f"lr={lr_rate}")

        # Collect the per-(phase, epoch, fold) frames and concatenate once at the
        # end; appending to a DataFrame inside the loop copies it every time.
        run_logs = []
        for phase in all_phase:
            for epoch_idx in range(all_epochs):
                epoch = f"{epoch_idx:03d}"  # epoch directories are zero-padded: 000, 001, ...
                for fold in all_folds:
                    glob_regex = osp.join(run_dir, fold, phase, epoch, '*.pd')
                    log_files = sorted(glob.glob(glob_regex))
                    if not log_files:
                        continue  # this fold/phase/epoch was not logged for this run

                    # NOTE(review): read_pickle can execute arbitrary code; only
                    # point this at log files produced by trusted training runs.
                    iterations_log = pd.concat(
                        [pd.read_pickle(file_path) for file_path in log_files],
                        axis=0, ignore_index=True)

                    # Reduce each probability vector to its arg-max position, then
                    # drop the columns that are not needed for the analysis.
                    iterations_log['prediction'] = iterations_log['proba'].apply(argmax_list)
                    iterations_log = iterations_log.drop(columns=['loss', 'proba'])

                    iterations_log['phase'] = phase
                    iterations_log['fold'] = fold
                    iterations_log['epoch'] = epoch
                    run_logs.append(iterations_log)

        if not run_logs:
            continue  # nothing was logged for this configuration
        samples_training_data = pd.concat(run_logs, axis=0, ignore_index=True)
        if len(samples_training_data) == 0:
            continue

        # Number of (phase, epoch, fold) entries in which the prediction matched
        # the logged label, per sample.
        samples_training_data['correctness'] = samples_training_data['label'] == samples_training_data['prediction']
        correctness_per_sample = samples_training_data.groupby(['sample', 'label'])['correctness'].sum().reset_index()

        # Attach the reference label from the dataset info file.
        # assumes dataset_info['index'] uses the same ids as the logged 'sample' column (TODO confirm)
        merged_df = pd.merge(correctness_per_sample, dataset_info[['index', 'true_label']],
                             left_on='sample', right_on='index', how='inner')
        true_label_sample = merged_df[merged_df['label'] == merged_df['true_label']]
        wrong_label_sample = merged_df[merged_df['label'] != merged_df['true_label']]

        plot_density(
            path=osp.join(run_dir, "correctness.png"),
            dataframes=[true_label_sample, wrong_label_sample],
            target_col='correctness', colors=['g', 'r'],
            labels=['true labels', 'wrong labels'],
            title=f" {dataset_name} {model_name} {optim} {initialization} {lr_scheduler} | np={noise_injection} ns={noise_sparsity}")
                                    
