# Import Dependencies

In [1]:
import glob
import itertools
import os
import os.path as osp

import numpy as np
import pandas as pd

from configs.general import \
                    MODELS, DATASETS, OPTIMS,\
                    INITIALIZATIONS, LR_SCHEDULERS, NPS, NSS,\
                    PHASES, EPOCHS, FOLDS, DEVICE,\
                    DATA_FILTERING_POLICIES, DATA_RETRIEVAL_POLICIES, \
                    EXPERIMENT_COLS, EXPERIMENT_BASE_DIR, EXPERIMENT_INFO_PATH,\
                    FILTERING_EXPERIMENT_INFO_PATH, FILTERING_EXPERIMENT_BASE_DIR, FILTERING_EXPERIMENT_COLS, DROPOUTS, TRANSFORM_LEVELS


# Create base experiments

In [6]:
# Build the grid of base experiments: one row per hyper-parameter combination.
# Rows are collected in a list and converted to a DataFrame once, instead of
# growing the frame with the private (and quadratic) `DataFrame._append`.

# Each optimizer carries its own list of learning rates, so flatten them into
# (optim, lr) pairs before taking the cartesian product with the other axes.
optim_lr_pairs = [(optim, lr) for optim, LRS in OPTIMS for lr in LRS]

# `itertools.product` varies the right-most axis fastest, which reproduces the
# row order of the equivalent nested `for` loops.
grid = itertools.product(
    DATASETS, MODELS, optim_lr_pairs, LR_SCHEDULERS, INITIALIZATIONS,
    NPS, NSS, DROPOUTS, TRANSFORM_LEVELS,
)

rows = [
    {
        "dataset": dataset,
        "model": model,
        "dropout": f"drp={dropout}",
        "optim": optim,
        "lr": f"lr={lr}",
        "lr_scheduler": lr_ch,
        "init": init,
        "transform": transform,
        "np": f"np={_np}",
        "ns": f"ns={ns}",
    }
    for dataset, model, (optim, lr), lr_ch, init, _np, ns, dropout, transform in grid
    # Combinations where np is "0.0" but ns is not "0.0" are excluded.
    if not (_np == "0.0" and ns != "0.0")
]
table = pd.DataFrame(rows)

# Bookkeeping columns shared by every experiment; results start out empty.
table['folds'] = FOLDS
table['epochs'] = EPOCHS
table['done'] = False
table['valid'] = False
table['test_acc'] = None
table['train_loss'] = None
table['validation_loss'] = None

In [7]:
# Number of experiment rows generated by the grid above.
len(table)

3744

In [8]:
import hashlib
def hash_row_data(row):
    """Return a short, deterministic identifier for ``row``.

    ``row`` is rendered with ``str()``, encoded as UTF-8 and hashed with
    SHA-256; the first 10 hexadecimal characters of the digest are returned.
    """
    digest = hashlib.sha256(str(row).encode('utf-8'))
    return digest.hexdigest()[:10]

# Hash the EXPERIMENT_COLS values of every row and use that hash as the row index.
row_hashes = table[EXPERIMENT_COLS].apply(hash_row_data, axis=1)
table = table.assign(index=row_hashes).set_index('index')

In [9]:
# Distinct learning-rate labels present for each optimizer.
result = table.groupby('optim')['lr'].unique()
result

optim
adam    [lr=0.001, lr=0.0001]
sgd         [lr=0.1, lr=0.01]
Name: lr, dtype: object

In [10]:
# Distinct ns labels present for each np label.
result = table.groupby('np')['ns'].unique()
result

np
np=0.0                             [ns=0.0]
np=0.03    [ns=0.0, ns=0.2, ns=0.4, ns=0.6]
np=0.07    [ns=0.0, ns=0.2, ns=0.4, ns=0.6]
np=0.13    [ns=0.0, ns=0.2, ns=0.4, ns=0.6]
Name: ns, dtype: object

In [11]:
# Print the distinct values held by every column, followed by the row count.
for column, values in table.items():
    print(f"Column: {column}: {values.unique()}")
print(len(table))

Column: dataset: ['cifar10' 'cifar100' 'mnist']
Column: model: ['resnet18' 'resnet34' 'xception']
Column: dropout: ['drp=0' 'drp=0.3']
Column: optim: ['adam' 'sgd']
Column: lr: ['lr=0.001' 'lr=0.0001' 'lr=0.1' 'lr=0.01']
Column: lr_scheduler: ['none']
Column: init: ['pretrain' 'kaiming_normal']
Column: transform: ['default' 'intermediate']
Column: np: ['np=0.0' 'np=0.03' 'np=0.07' 'np=0.13']
Column: ns: ['ns=0.0' 'ns=0.2' 'ns=0.4' 'ns=0.6']
Column: folds: [3]
Column: epochs: [15]
Column: done: [False]
Column: valid: [False]
Column: test_acc: [None]
Column: train_loss: [None]
Column: validation_loss: [None]
3744


In [12]:
# table.to_csv('experiments.csv')

# Update experiments status

In [None]:
# Re-scan the experiment output tree: an experiment is flagged as done when its
# directory exists and contains exactly FOLDS entries. Only the in-memory copy
# is modified here.
old_experiments = pd.read_csv(EXPERIMENT_INFO_PATH, index_col='index')
experiments = old_experiments.copy()

for index, row in experiments.iterrows():
    path_parts = [str(row[col]) for col in EXPERIMENT_COLS]
    experiment_dir = osp.join(EXPERIMENT_BASE_DIR, *path_parts)
    if not osp.isdir(experiment_dir):
        continue
    if len(os.listdir(experiment_dir)) == FOLDS:
        experiments.loc[index, 'done'] = True

# Compare the done counts before and after the scan.
done_before = (old_experiments['done'] == True).sum()
done_after = (experiments['done'] == True).sum()
print("total experiments:", len(old_experiments))
print("already done:", done_before)
print("required to update:", done_after - done_before)


In [3]:
# Write the `experiments` frame to EXPERIMENT_INFO_PATH as CSV.
experiments.to_csv(EXPERIMENT_INFO_PATH)

# Update filtering experiments status

In [None]:
# Re-scan the filtering-experiment output tree: an experiment is flagged as done
# when its directory exists and contains exactly FOLDS entries. Only the
# in-memory copy is modified here.
old_experiments = pd.read_csv(FILTERING_EXPERIMENT_INFO_PATH, index_col='index')
experiments = old_experiments.copy()

for index, row in experiments.iterrows():
    path_parts = [str(row[col]) for col in FILTERING_EXPERIMENT_COLS]
    experiment_dir = osp.join(FILTERING_EXPERIMENT_BASE_DIR, *path_parts)
    if not osp.isdir(experiment_dir):
        continue
    if len(os.listdir(experiment_dir)) == FOLDS:
        experiments.loc[index, 'done'] = True

# Compare the done counts before and after the scan.
done_before = (old_experiments['done'] == True).sum()
done_after = (experiments['done'] == True).sum()
print("total experiments:", len(old_experiments))
print("already done:", done_before)
print("required to update:", done_after - done_before)


In [None]:
# Write the `experiments` frame to FILTERING_EXPERIMENT_INFO_PATH as CSV.
experiments.to_csv(FILTERING_EXPERIMENT_INFO_PATH)

# Store last-epoch validation loss and train loss in the dataframe for done experiments

In [None]:
# Load the experiment table from EXPERIMENT_INFO_PATH, using its 'index' column
# as the row index, and preview the first rows.
experiments = pd.read_csv(EXPERIMENT_INFO_PATH, index_col='index')
experiments.head()

In [None]:
# Select experiments that are done but still lack a train or validation loss.
is_done = experiments['done'] == True
loss_missing = experiments['validation_loss'].isna() | experiments['train_loss'].isna()
target_experiments = experiments[is_done & loss_missing]
print('number of experiments: ', len(target_experiments))
target_experiments.head()

In [None]:
# Back-fill the last-epoch mean loss of every fold for the selected experiments.
# Each loss cell receives a space-separated string with one value per fold.

# The loss columns may be read back from CSV as float (all NaN); cast them to
# object so that assigning strings does not rely on pandas' implicit upcasting.
experiments = experiments.astype({'validation_loss': object, 'train_loss': object})

for index, row in target_experiments.iterrows():
    experiment_dir = osp.join(EXPERIMENT_BASE_DIR, *[str(row[col]) for col in EXPERIMENT_COLS])
    if not osp.isdir(experiment_dir):
        continue
    if len(os.listdir(experiment_dir)) != FOLDS:
        continue

    lossie = {'train': [], 'validation': []}
    for fold in range(FOLDS):
        fold_experiment_dir = osp.join(experiment_dir, str(fold))
        for phase in PHASES:
            last_epoch_experiment_dir = osp.join(fold_experiment_dir, phase, f"{EPOCHS - 1:03d}")
            log_paths = sorted(glob.glob(osp.join(last_epoch_experiment_dir, '*.pd')))
            if len(log_paths) == 0:
                print(f"Train for this experiment is not complete \n{row}")
                continue
            # NOTE(review): read_pickle can execute code embedded in the file;
            # only point this at logs produced by your own training runs.
            # The mean is taken over this phase's samples only, so train and
            # validation losses are never mixed together.
            phase_losses = pd.concat(
                [pd.read_pickle(file_path)['loss'] for file_path in log_paths],
                ignore_index=True,
            )
            mean_loss = round(float(phase_losses.mean()), 3)
            # setdefault tolerates phases other than train/validation in PHASES.
            lossie.setdefault(phase, []).append(mean_loss)
    print(lossie)

    # Every fold must have contributed both losses; otherwise leave the row
    # untouched so it is picked up again once training has finished.
    if len(lossie['validation']) != FOLDS or len(lossie['train']) != FOLDS:
        print(f"Skipping {index}: losses are missing for at least one fold")
        continue

    experiments.loc[index, 'validation_loss'] = ' '.join(str(loss) for loss in lossie['validation'])
    experiments.loc[index, 'train_loss'] = ' '.join(str(loss) for loss in lossie['train'])


In [None]:
# Display every experiment currently flagged as done.
experiments.loc[experiments['done'] == True]

In [8]:
# Write the `experiments` frame to EXPERIMENT_INFO_PATH as CSV.
experiments.to_csv(EXPERIMENT_INFO_PATH)