# Experiment 1: Typical use case
This is representative of a normal use case on METASPACE, which makes it suitable for head-to-head comparisons. Time on the higher-spec PC used for initial data capture is often limited because it is a shared resource, so the analysis will usually be performed on scientists’ or students’ lower-spec laptops.

### METRICS TO BENCHMARK
* Performance:
    * **Metric:** Total processing time up to downloading the results dataframe
    
        **Goal:** Faster than serverful METASPACE (including or excluding cluster start time)

    * **Metric:** Latency for retrieving all images of target ions.
    
        **Goal:** Similar to or faster than METASPACE’s Python client

* Capability:
    * **Metric:** Peak memory usage on client.
    
        **Goal:** Capable of running on a low-spec PC with 8GB RAM, so ~6GB max usage

* Cost:
    * **Metric:** Cloud provider cost
    
        **Goal:** Similar price or cheaper than METASPACE (including or excluding cluster start time)

    * **Metric:** Developer time
    
        **Goal:** Less annual time required to manage cloud infrastructure than METASPACE

# Notebook setup

In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes
%load_ext autoreload
%autoreload 2

In [None]:
# Turn off Jedi-based tab completion and render matplotlib figures inline
%config Completer.use_jedi = False
%matplotlib inline

In [None]:
# Display the interpreter path and installation prefix this kernel runs on,
# to confirm they are the Python and library path we want to use
import sys
sys.executable, sys.prefix

In [None]:
# Install PyWren-IBM if needed
try:
    import pywren_ibm_cloud as pywren
except ModuleNotFoundError:    
    # Install the pinned 1.0.10 release with this kernel's own interpreter,
    # then retry the import
    !{sys.executable} -m pip install -U pywren-ibm-cloud==1.0.10
    import pywren_ibm_cloud as pywren

# Display the imported version as the cell output
pywren.__version__

In [None]:
# Install psutil if needed (it is used further down to report memory usage)
try:
    import psutil
except ModuleNotFoundError:    
    # Install with this kernel's own interpreter, then retry the import
    !{sys.executable} -m pip install -U psutil
    import psutil

In [None]:
# Work around the notebook hitting the "too many open files" limit by raising
# this process's soft limit on open file descriptors.
import resource
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('Before:', soft, hard)

# Raise the soft limit up to the hard limit. The hard limit itself can only be
# raised by sudo users, so it is passed back unchanged.
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('After:', soft, hard)

In [None]:
import io
import json
import os
import pickle
import sys
from collections import defaultdict
from datetime import datetime
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
from cpyImagingMSpec import measure_of_chaos
from matplotlib import pyplot as plt
from pyImagingMSpec.image_measures import isotope_image_correlation, isotope_pattern_match
from scipy.sparse import coo_matrix

In [None]:
# Configure the root logger so INFO-level (and higher) messages are shown
import logging
logging.basicConfig(level=logging.INFO)

## Configuration

In [None]:
# Load the settings used by the cells below (e.g. config['ibm_cos']).
# The context manager closes the file handle, which a bare open() passed
# straight to json.load() would leave open.
with open('config.json') as config_file:
    config = json.load(config_file)

In [None]:
# Choose which dataset/database definition to benchmark by switching the
# active path; the file is opened in a context manager so it is closed again.
# input_config_path = 'metabolomics/input_config_small.json'
input_config_path = 'metabolomics/input_config_big.json'
# input_config_path = 'metabolomics/input_config_huge.json'
with open(input_config_path) as input_config_file:
    input_config = json.load(input_config_file)
input_data = input_config['dataset']
input_db = input_config['molecular_db']

# Override config to match METASPACE annotation settings
input_data['num_decoys'] = 20
input_db['adducts'] = ['+H','+Na','+K']
input_db['modifiers'] = ['']

In [None]:
import ibm_boto3
from ibm_botocore.client import Config
from ibm_botocore.client import ClientError
# S3-style COS client, built from the API key and endpoint stored under
# config['ibm_cos'] and using the 'oauth' signature version
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=config['ibm_cos']['api_key'],
    config=Config(signature_version='oauth'),
    endpoint_url=config['ibm_cos']['endpoint'],
)

# Setup (Not included in benchmark timings)

In [None]:
from annotation_pipeline.molecular_db import dump_mol_db, build_database, \
    calculate_centroids, get_formula_id_dfs, clean_formula_chunks

In [None]:
# Download commonly used mol DBs from METASPACE (add force=True to redownload if needed)
dump_mol_db(config, input_db['bucket'], 'metabolomics/db/mol_db1.pickle', 22) #HMDB-v4
dump_mol_db(config, input_db['bucket'], 'metabolomics/db/mol_db2.pickle', 19) #ChEBI-2018-01
dump_mol_db(config, input_db['bucket'], 'metabolomics/db/mol_db3.pickle', 24) #LipidMaps-2017-12-12
dump_mol_db(config, input_db['bucket'], 'metabolomics/db/mol_db4.pickle', 26) #SwissLipids-2018-02-02

In [None]:
# build_database returns two values, kept here as the formula count and the
# formula chunk keys; the chunk keys are passed on to calculate_centroids and
# clean_formula_chunks in the following cells
num_formulas, formula_chunk_keys = build_database(config, input_db)

In [None]:
# Use 0.001238 if isocalc_sigma is missing from the config, but it's better
# to get the actual value as it affects the results
isocalc_sigma = input_data['isocalc_sigma']
# The two returned values are kept as the centroids' shape and head
centroids_shape, centroids_head = calculate_centroids(config, input_db, formula_chunk_keys, isocalc_sigma)

In [None]:
# NOTE(review): presumably removes the intermediate formula chunks, which are
# no longer needed once the centroids exist — confirm in molecular_db
clean_formula_chunks(config, input_db, formula_chunk_keys)

# Benchmark

In [None]:
import os
import psutil # "pip install psutil" if needed

# Resident set size of this kernel process, converted from bytes to MiB,
# recorded before the benchmarked cells run
rss_bytes_before = psutil.Process(os.getpid()).memory_info().rss
memory_usage_mb = rss_bytes_before / 2**20
print(f'Memory usage before: {memory_usage_mb:.0f}MB')

In [None]:
# Wall-clock start of the benchmarked section; the total duration is
# computed from this value once the pipeline results have been fetched
start_time = datetime.now()
print('start', start_time)

### Upload dataset

In [None]:
import os
from annotation_pipeline_v2.utils import upload_to_cos

In [None]:
# Walk the dataset directory and upload every file found to the dataset
# bucket, passing the local path for both path arguments of upload_to_cos
for dir_path, _subdirs, file_names in os.walk(input_data['path']):
    for file_name in file_names:
        local_path = f'{dir_path}/{file_name}'
        print(local_path)
        upload_to_cos(cos_client, local_path, input_config['dataset']['bucket'], local_path)

### Run Annotation Pipeline

In [None]:
# Download centroids.pickle to local machine
resp = cos_client.get_object(Bucket=input_db['bucket'], Key=input_db['centroids_pandas'])
with open(input_db['centroids_pandas'], 'wb') as f:
    f.write(resp['Body'].read())

In [None]:
from annotation_pipeline_v2.pipeline import Pipeline

In [None]:
# Create the annotation pipeline from the loaded config and input config
pipeline = Pipeline(config, input_config)

In [None]:
# Each pipeline stage runs in its own cell under %time, so a timing is
# printed per stage
%time pipeline.load_ds()

In [None]:
%time pipeline.segment_ds()

In [None]:
%time pipeline.segment_centroids()

In [None]:
%time pipeline.annotate()

In [None]:
%time pipeline.run_fdr()

In [None]:
# results_df keeps the value returned by get_results()
%time results_df = pipeline.get_results()

In [None]:
# Stop the clock and report the wall-clock span since start_time was taken
finish_time = datetime.now()
elapsed = finish_time - start_time
print('start', start_time)
print('finish', finish_time)
print('duration', elapsed)

In [None]:
import resource
import sys

# Current resident set size of this kernel process, converted from bytes to MiB
memory_usage_mb = psutil.Process(os.getpid()).memory_info().rss / 2**20
print(f'Memory usage after: {memory_usage_mb:.0f}MB')

# The benchmark goal is stated as *peak* client memory. The current RSS above
# under-reports that whenever memory was released before this cell ran, so
# also report the process high-water mark. ru_maxrss covers the whole
# lifetime of the kernel process (setup cells included) and is expressed in
# bytes on macOS but in KiB on Linux.
peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
peak_memory_usage_mb = peak_rss / (2**20 if sys.platform == 'darwin' else 2**10)
print(f'Peak memory usage: {peak_memory_usage_mb:.0f}MB')

# Benchmark Part 2
Comparing the time required to retrieve all images

In [None]:
#Install metaspace2020 if needed (May require kernel restart)
try:
    from metaspace.sm_annotation_utils import SMInstance
except ModuleNotFoundError:    
    !{sys.executable} -m pip install -U metaspace2020
    from metaspace.sm_annotation_utils import SMInstance

In [None]:
# Time image retrieval through this pipeline; this is the figure compared
# against the METASPACE client in the next cell
%time images_dict = pipeline.get_images()

In [None]:
sm = SMInstance()
# Select the METASPACE dataset that corresponds to the input config in use;
# keep the active line in sync with the input_config file chosen earlier.
# TODO: (EMBL) Reprocess these datasets with the other DBs, because currently they only have HMDB-v4
#ds = sm.dataset(id='2016-09-22_11h16m11s') # For input_config_small.json
ds = sm.dataset(id='2016-09-21_16h06m52s') # For input_config_big.json
#ds = sm.dataset(id='2016-09-21_16h06m49s') # For input_config_huge.json
# Time the retrieval of annotation images through the METASPACE Python
# client, called with fdr=0.5
%time metaspace_client_images = ds.all_annotation_images(fdr=0.5)