# Experiment 1: Typical use case
This is representative of a normal use case on METASPACE, which makes it suitable for head-to-head comparisons. The higher-spec PC used for initial data capture is a shared resource with limited time available, so the analysis is usually performed on scientists’ or students’ lower-spec laptops.

### METRICS TO BENCHMARK
* Performance:
    * **Metric:** Total processing time up to downloading the results dataframe
    
        **Goal:** Faster than serverful METASPACE (including or excluding cluster start time)

    * **Metric:** Latency for retrieving all images of target ions.
    
        **Goal:** Similar to or faster than METASPACE’s Python client

* Capability:
    * **Metric:** Peak memory usage on client.
    
        **Goal:** Capable of running on a low-spec PC with 8 GB of RAM, so ~6 GB max usage

* Cost:
    * **Metric:** Cloud provider cost
    
        **Goal:** Similar price or cheaper than METASPACE (including or excluding cluster start time)

    * **Metric:** Developer time
    
        **Goal:** Less annual time required to manage cloud infrastructure than METASPACE

# Notebook setup
Run `python3 setup.py install` to install all requirements for the annotation pipeline project.

In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Turn off Jedi-based tab completion in IPython
%config Completer.use_jedi = False
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Raise the per-process open-file limit to work around the notebook running
# into "too many open files".
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('Before:', soft, hard)

# Only the soft limit is raised; hard limits can be raised only by sudo users.
# The requested value is capped at the hard limit (setrlimit raises ValueError
# when soft > hard), and a soft limit that is already higher, or unlimited,
# is never lowered.
wanted_soft = 10000 if hard == resource.RLIM_INFINITY else min(10000, hard)
if soft != resource.RLIM_INFINITY and soft < wanted_soft:
    resource.setrlimit(resource.RLIMIT_NOFILE, (wanted_soft, hard))
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('After:', soft, hard)

In [None]:
# Show the Python interpreter and installation prefix this kernel is using
# (the tuple is the cell's displayed result)
import sys
sys.executable, sys.prefix

In [None]:
# Install PyWren-IBM if needed
# `sys` is not imported in this cell; it must already be in the notebook namespace.
# The `!` shell escape runs pip through the kernel's own interpreter (sys.executable)
# and pins the package to version 1.0.10.
try:
    import pywren_ibm_cloud as pywren
except ModuleNotFoundError:    
    !{sys.executable} -m pip install -U pywren-ibm-cloud==1.0.10
    import pywren_ibm_cloud as pywren

# Display the version that was actually imported
pywren.__version__

In [None]:
# Configure the root logger so that messages of level INFO and above are emitted
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Apply a default socket timeout (in seconds) so that CF requests fail instead
# of hanging when they don't get a response
import socket

previous_timeout = socket.getdefaulttimeout()
print('Previous timeout:', previous_timeout)
socket.setdefaulttimeout(60)

## Configuration

In [None]:
import json

# Load the pipeline configuration. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage collection.
with open('config.json') as config_file:
    config = json.load(config_file)

In [None]:
# Select the input configuration to benchmark by leaving exactly one path uncommented
#input_config_path = 'metabolomics/input_config_small.json'
input_config_path = 'metabolomics/input_config_big.json'
#input_config_path = 'metabolomics/input_config_huge.json'

# Read the file inside a context manager so the handle is closed deterministically
with open(input_config_path) as input_config_file:
    input_config = json.load(input_config_file)
input_data = input_config['dataset']
input_db = input_config['molecular_db']

# Override config to match METASPACE annotation settings
input_data['num_decoys'] = 20
input_db['modifiers'] = ['']

# Setup (Not included in benchmark timings)

In [None]:
from annotation_pipeline.molecular_db import dump_mol_db, build_database, \
    calculate_centroids, get_formula_id_dfs

In [None]:
# Download commonly used mol DBs from METASPACE (add force=True to redownload if needed)
# Each call passes the config, the DB bucket name, a target object key and a numeric ID;
# the trailing comment names the database that ID refers to.
# NOTE(review): the numeric IDs are presumably METASPACE molecular DB ids — confirm.
dump_mol_db(config, config['storage']['db_bucket'], 'metabolomics/db/mol_db1.pickle', 22) #HMDB-v4
dump_mol_db(config, config['storage']['db_bucket'], 'metabolomics/db/mol_db2.pickle', 19) #ChEBI-2018-01
dump_mol_db(config, config['storage']['db_bucket'], 'metabolomics/db/mol_db3.pickle', 24) #LipidMaps-2017-12-12
dump_mol_db(config, config['storage']['db_bucket'], 'metabolomics/db/mol_db4.pickle', 26) #SwissLipids-2018-02-02

In [None]:
# Build the molecular database from the `molecular_db` section of the input config.
# This is part of setup and is not included in the benchmark timings.
build_database(config, input_db)

In [None]:
# Take the `polarity` and `isocalc_sigma` settings from the dataset config and
# pass them, with the molecular DB settings, to the centroid calculation.
# This is part of setup and is not included in the benchmark timings.
polarity = input_data['polarity']
isocalc_sigma = input_data['isocalc_sigma']
calculate_centroids(config, input_db, polarity, isocalc_sigma)

# Benchmark

In [None]:
import os
import psutil
import pandas as pd
from datetime import datetime
# Resident set size (RSS) of this kernel process, converted from bytes to MiB
# (2**20 bytes), recorded as the baseline before the benchmark runs
memory_usage_mb = psutil.Process(os.getpid()).memory_info().rss / 2**20
print(f'Memory usage before: {memory_usage_mb:.0f}MB')

In [None]:
from annotation_pipeline.utils import init_pywren_stats
# Initialise PyWren statistics collection before the benchmark run.
# NOTE(review): presumably this creates the 'stats.csv' file read later in the notebook — confirm.
init_pywren_stats()

### Run Annotation Pipeline

In [None]:
from annotation_pipeline.pipeline import Pipeline

In [None]:
# Create the annotation pipeline from the general config and the selected input config
pipeline = Pipeline(config, input_config)

In [None]:
# Timed section: run the whole pipeline and fetch the results dataframe.
# Both steps sit between the two timestamps because the benchmarked metric is
# total processing time up to downloading the results dataframe.
start_time = datetime.now()
pipeline()
results_df = pipeline.get_results()
finish_time = datetime.now()

In [None]:
# Report when the benchmark run began and ended, and how long it took
elapsed = finish_time - start_time
for label, value in (('start', start_time), ('finish', finish_time), ('duration', elapsed)):
    print(label, value)

In [None]:
# Display PyWren statistics file
# The dataframe is the cell's displayed result.
# NOTE(review): 'stats.csv' is presumably written during the pipeline run after
# init_pywren_stats() — confirm.
pd.read_csv('stats.csv')

In [None]:
from annotation_pipeline.utils import remove_pywren_stats
# Remove the PyWren statistics now that they have been displayed
remove_pywren_stats()

In [None]:
# Resident set size (RSS) of this kernel process after the benchmark, in MiB.
# NOTE(review): this is the RSS at this moment, not the peak during the run, while
# the stated metric is peak client memory — consider also recording a peak value.
memory_usage_mb = psutil.Process(os.getpid()).memory_info().rss / 2**20
print(f'Memory usage after: {memory_usage_mb:.0f}MB')

# Benchmark Part 2
Compare the time required to retrieve all images with this pipeline and with the METASPACE Python client.

In [None]:
# Install metaspace2020 if needed (May require kernel restart)
# `sys` is not imported in this cell; it must already be in the notebook namespace.
# The `!` shell escape runs pip through the kernel's own interpreter (sys.executable).
try:
    from metaspace.sm_annotation_utils import SMInstance
except ModuleNotFoundError:    
    !{sys.executable} -m pip install -U metaspace2020
    from metaspace.sm_annotation_utils import SMInstance

In [None]:
# Time how long this pipeline takes to retrieve the images (%time reports the timing)
%time images_dict = pipeline.get_images()

In [None]:
# Create a METASPACE client and look up the dataset by the `metaspace_id`
# given in the dataset config
sm = SMInstance()
ds = sm.dataset(id=input_data['metaspace_id'])

In [None]:
# Time how long the METASPACE Python client takes to retrieve all annotation
# images of the same dataset (called with fdr=0.5)
%time metaspace_client_images = ds.all_annotation_images(fdr=0.5)

# Check results are correct

In [None]:
# Run the pipeline's own result check and keep whatever it returns.
# NOTE(review): what the results are compared against is defined in
# Pipeline.check_results — confirm before relying on it.
checked_results = pipeline.check_results()

# Clean Temp Data

In [None]:
from annotation_pipeline.utils import clean_from_cos

In [None]:
# Clean formulas chunks: temporary data in the DB bucket, located by
# input_data["formulas_chunks"]
clean_from_cos(config, config["storage"]["db_bucket"], input_data["formulas_chunks"])

In [None]:
# Clean dataset chunks: temporary data in the dataset bucket, located by
# input_data["ds_chunks"]
clean_from_cos(config, config["storage"]["ds_bucket"], input_data["ds_chunks"])

In [None]:
# Clean dataset segments: temporary data in the dataset bucket, located by
# input_data["ds_segments"]
clean_from_cos(config, config["storage"]["ds_bucket"], input_data["ds_segments"])

In [None]:
# Clean centroids database segments: temporary data in the DB bucket, located by
# input_db["centroids_segments"]
clean_from_cos(config, config["storage"]["db_bucket"], input_db["centroids_segments"])

In [None]:
# Clean formula output images
# The name `output` used here previously is never defined in this notebook, so the
# cell raised NameError. The value is now read from the input config directly.
# NOTE(review): assumes the input config has a top-level "output" section (next to
# "dataset" and "molecular_db") with a "formula_images" entry — confirm against
# the input_config_*.json files.
clean_from_cos(config, config["storage"]["output_bucket"], input_config["output"]["formula_images"])

In [None]:
# Clean FDR rankings: temporary data in the dataset bucket, located by
# input_data["fdr_rankings"]
clean_from_cos(config, config["storage"]["ds_bucket"], input_data["fdr_rankings"])