# `mle-monitor`: Lightweight Resource Monitoring
### Author: [@RobertTLange](https://twitter.com/RobertTLange) [Last Update: October 2021] [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobertTLange/mle-monitor/blob/main/examples/getting_started.ipynb)

In [1]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

try:
    import mle_monitor
except:
    !pip install -q mle-monitor
    import mle_monitor

# Pillar I: `MLEProtocol`

In [2]:
from mle_monitor import MLEProtocol

# Local file backing the protocol database.
PROTOCOL_FNAME = "mle_protocol.db"

# Open the protocol stored in that file; according to the package author's
# note, a new protocol is created if the file doesn't exist yet.
protocol = MLEProtocol(protocol_fname=PROTOCOL_FNAME)

In [3]:
# Meta data describing a single experiment that should be logged.
experiment_data = dict(
    purpose="Test Protocol",
    project_name="MNIST",
    exec_resource="local",
    experiment_dir="log_dir",
    experiment_type="hyperparameter-search",
    base_fname="main.py",
    config_fname="base_config.json",
    num_seeds=5,
    num_total_jobs=10,
    num_jobs_per_batch=5,
    num_job_batches=2,
    time_per_job="00:05:00",  # days-hours-minutes
    num_cpus=2,
    num_gpus=1,
)

# Register the experiment and keep the identifier handed back by `add`;
# it is the key used for every later lookup/update in this notebook.
e_id = protocol.add(experiment_data, save=False)

# Echo the stored record: it contains the fields above plus entries added
# by the protocol itself (e.g. git_hash, e-hash, job_status, start/stop time).
protocol.get(e_id)

{'purpose': 'Test Protocol',
 'project_name': 'MNIST',
 'exec_resource': 'local',
 'experiment_dir': 'log_dir',
 'experiment_type': 'hyperparameter-search',
 'base_fname': 'main.py',
 'config_fname': 'base_config.json',
 'num_seeds': 5,
 'num_total_jobs': 10,
 'num_jobs_per_batch': 5,
 'num_job_batches': 2,
 'time_per_job': '00:05:00',
 'num_cpus': 2,
 'num_gpus': 1,
 'git_hash': '7b551b3488eb9b8a9b053e5e4d0ef3b7ed324f65',
 'loaded_config': [{'train_config': {'lrate': 0.1},
   'model_config': {'num_layers': 5},
   'log_config': {'time_to_track': ['step_counter'],
    'what_to_track': ['loss'],
    'time_to_print': ['step_counter'],
    'what_to_print': ['loss'],
    'print_every_k_updates': 10,
    'overwrite_experiment_dir': 1}}],
 'e-hash': '001881ef046f5150291e672a57ca7090',
 'retrieved_results': False,
 'stored_in_cloud': False,
 'report_generated': False,
 'job_status': 'running',
 'start_time': '10/29/2021 14:06:57',
 'duration': '00:10:00',
 'stop_time': '10/30/2021 00:06:57'}

In [4]:
# Print a summary of the last experiments. The return value is kept in
# `sub_df` (presumably a DataFrame, going by the name — TODO confirm).
sub_df = protocol.summary()

# ... and a more detailed version. Note that this rebinds `sub_df`, so only
# the result of the full summary remains available afterwards.
sub_df = protocol.summary(full=True)

In [5]:
# Update some element in the database: set the "exec_resource" field of
# experiment `e_id` to "slurm-cluster".
# NOTE(review): save=False is passed to every mutating call in this notebook —
# presumably it keeps the change in memory without writing the protocol file;
# confirm against MLEProtocol.
protocol.update(e_id, "exec_resource", "slurm-cluster", save=False)

# Abort the experiment - changes status (the following cell reads it back
# as 'aborted').
protocol.abort(e_id, save=False)
# Re-run the summary so the updated resource and status are reflected.
sub_df = protocol.summary()

In [6]:
# Get the status of the experiment. Following the abort call this evaluates
# to 'aborted'; a freshly added experiment is recorded with job_status
# 'running'.
protocol.status(e_id)

'aborted'

In [8]:
# Get the monitoring data - used later in dashboard.
# monitor() yields four values. Judging by the output displayed below:
#   total_data - string counts over the whole protocol (by job status, by
#                execution resource, reports generated, stored, retrieved)
#   last_data  - id, directory, type, script and config of one experiment
#                (presumably the most recent one — TODO confirm)
#   time_data  - job/batch counts plus start/stop time and estimated duration
# protocol_table is not displayed here — presumably a renderable table for
# the dashboard; TODO confirm.
total_data, last_data, time_data, protocol_table = protocol.monitor()
# Show the three dictionaries as the cell output.
total_data, last_data, time_data

({'total': '1',
  'run': '0',
  'done': '0',
  'aborted': '1',
  'sge': '0',
  'slurm': '1',
  'gcp': '0',
  'local': '0',
  'report_gen': '0',
  'gcs_stored': '0',
  'retrieved': '0'},
 {'e_id': '1',
  'e_dir': 'log_dir',
  'e_type': 'hyperparameter-search',
  'e_script': 'main.py',
  'e_config': 'base_config.json',
  'report_gen': False},
 {'total_jobs': 10,
  'total_batches': 2,
  'jobs_per_batch': 5,
  'time_per_batch': '00:05:00',
  'start_time': '10/29/2021 14:05:26',
  'stop_time': '10/30/2021 00:05:26',
  'est_duration': '00:10:00'})

In [None]:
# TODO: Sync your protocol with a GCS bucket (placeholder - this cell does not contain an example yet)

# Pillar II: `MLEResource`

In [None]:
from mle_monitor import MLEResource

# Resource monitor constructed with its default settings.
resource = MLEResource()

# Poll the resource once and keep the result. Previously monitor() was called
# twice and the first call was wrapped in a bare `len(...)` expression that
# was not the last line of the cell, so its value was never shown.
resource_data = resource.monitor()
print(f"monitor() returned {len(resource_data)} entries")

# Last expression of the cell -> displayed as the cell output.
resource_data

# Pillar III: `MLEDashboard`

In [None]:
from mle_monitor import MLEDashboard

# Build the dashboard from the `protocol` and `resource` objects created in
# the earlier cells — those cells must have been run first.
dashboard = MLEDashboard(protocol, resource)

In [None]:
# Get a static snapshot of the protocol & resource utilisation.
# The return value is not bound to a name; the call is made for what it
# renders in the cell output.
dashboard.snapshot()