In [1]:
import pandas as pd
import pymongo
from pprint import pprint
# Connect to client
from pymongo import MongoClient
client = MongoClient('localhost', 27017)

In [2]:
# list databases
print('databases:', client.list_database_names())

databases: ['MNISTdebug', 'SACRED_DB', 'admin', 'config', 'local', 'sacred', 'sacred_db_1', 'sacred_db_template', 'sacred_mnist_example']


In [3]:
# use database of our experiment
db = client['sacred_db_1']
print('Collections of {} db: {}'.format(db.name, db.list_collection_names()))

Collections of sacred_db_1 db: ['runs', 'omniboard.settings', 'fs.files', 'fs.chunks', 'metrics']


## show record of one experiment

In [4]:
exp = list(db.runs.find())[-1]
pprint(exp.keys())
print('Example of a "document" that represents a single experiment:')
pprint(exp)

dict_keys(['_id', 'experiment', 'format', 'command', 'host', 'start_time', 'config', 'meta', 'status', 'resources', 'artifacts', 'captured_out', 'info', 'heartbeat', 'result', 'stop_time'])
Example of a "document" that represents a single experiment:
{'_id': 35,
 'artifacts': [],
 'captured_out': "INFO - mnist_cnn - Running command 'my_main'\n"
                 'INFO - mnist_cnn - Started run with ID "35"\n'
                 'x_train shape: (60000, 28, 28, 1)\n'
                 '60000 train samples\n'
                 '10000 test samples\n'
                 'Metal device set to: Apple M1 Pro\n'
                 '\n'
                 'systemMemory: 16.00 GB\n'
                 'maxCacheSize: 5.33 GB\n'
                 '\n'
                 '2022-05-01 11:38:23.080192: I '
                 'tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] '
                 'Could not identify NUMA node of platform GPU ID 0, '
                 'defaulting to 0. Your kern

In [20]:
exp['info']

{'metrics': [{'id': '626dd34b1efeb44eb15cf07b', 'name': 'loss'},
  {'id': '626dd34b1efeb44eb15cf07d', 'name': 'accuracy'},
  {'id': '626dd34b1efeb44eb15cf07f', 'name': 'val_loss'}]}

## show configs

In [5]:
print('Example of configurations of several experiments:\n')
for i, exp in enumerate(list(db.runs.find())[0:3]):
    pprint(exp['config'])
    print('')

Example of configurations of several experiments:

{'params_selected': {'batch_size': 128,
                     'epochs': 3,
                     'img_cols': 28,
                     'img_rows': 28,
                     'num_classes': 10},
 'seed': 325123068}

{'params_selected': {'batch_size': 128,
                     'epochs': 3,
                     'img_cols': 28,
                     'img_rows': 28,
                     'num_classes': 10},
 'seed': 58624823}

{'batch_size': 128, 'epochs': 3, 'num_classes': 10, 'seed': 396457322}



## pandas+

In [6]:
from collections import OrderedDict
import pandas as pd
import re

def slice_dict(d, keys):
    """ Returns a dictionary ordered and sliced by given keys
        keys can be a list, or a CSV string
    """
    if isinstance(keys, str):
        keys = keys[:-1] if keys[-1] == ',' else keys
        keys = re.split(', |[, ]', keys)

    return dict((k, d[k]) for k in keys)

def sacred_to_df(db_runs, mongo_query=None, ):
    """
    db_runs is usually db.runs
    returns a dataframe that summarizes the experiments, where 
    config and info fields are flattened to their keys.
    Summary DF contains the following columns:
    _id, experiment.name, **config, result, **info, status, start_time
    """
    # get all experiment according to mongo query and represent as a pandas DataFrame    
    df = pd.DataFrame(list(db_runs.find(mongo_query)))

    # Take only the interesting columns
    df = df.loc[:, '_id, experiment, config, result, info, status, start_time'.split(', ')]

    def _summerize_experiment(s):
        """
        Take only the 
        """
        o = OrderedDict()
        o['_id'] = s['_id']
        o['name']=s['experiment']['name']
        o.update(s['config'])
        for key, val in s['info'].items():
            if key != 'metrics':
                o[key] = val 

        o.update(slice_dict(s.to_dict(), 'result, status, start_time'))
        return pd.Series(o)
    
    sum_list = []
    for ix, s in df.iterrows():
        sum_list.append(_summerize_experiment(s))
    df_summary = pd.DataFrame(sum_list).set_index('_id')
    
    return df_summary

In [16]:
# Get the COMPLETED experiments with dim<=100 and val. accuracy > 85%
query = 'status=="COMPLETED"'
df_summary = sacred_to_df(db.runs).query(query)
# Sort them in descending order (best performer is first).
#df_summary = df_summary.sort_values('val_acc', ascending=False)
display(df_summary)

Unnamed: 0_level_0,name,params_selected,seed,result,status,start_time,batch_size,epochs,num_classes
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
26,mnist_cnn,"{'batch_size': 128, 'epochs': 3, 'img_cols': 2...",325123068,,COMPLETED,2022-04-30 21:45:26.074,,,
27,mnist_cnn,"{'batch_size': 128, 'epochs': 3, 'img_cols': 2...",58624823,,COMPLETED,2022-04-30 23:20:45.164,,,
28,mnist_cnn,,396457322,,COMPLETED,2022-05-01 00:24:33.646,128.0,3.0,10.0
30,mnist_cnn,,123881562,,COMPLETED,2022-05-01 12:14:19.625,128.0,3.0,10.0
32,mnist_cnn,"{'batch_size': 128, 'epochs': 3, 'img_cols': 2...",682302499,,COMPLETED,2022-05-01 12:18:00.727,,,
33,mnist_cnn,,1427569,,COMPLETED,2022-05-01 14:46:16.907,128.0,3.0,10.0
34,mnist_cnn,,114285423,,COMPLETED,2022-05-01 14:48:14.020,128.0,3.0,10.0
35,mnist_cnn,"{'batch_size': 128, 'epochs': 3, 'img_cols': 2...",901688824,,COMPLETED,2022-05-01 15:38:22.787,,,
