# Experiment 1: Typical use case
As a test to ensure that the limits of this pipeline are similar to those of METASPACE, this experiment uses one of the larger datasets that has been processed.

### METRICS TO BENCHMARK
* Performance:
    * **Metric:** Total processing time

        **Goal:** similar to or faster than METASPACE (including cluster start time)

# Notebook setup

In [None]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and local code edits take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Turn off Jedi-based tab completion and render matplotlib figures inline.
%config Completer.use_jedi = False
%matplotlib inline

In [None]:
# Show the Python interpreter and installation prefix this kernel is running on;
# the interpreter path is reused below when installing packages with pip.
import sys
sys.executable, sys.prefix

In [None]:
# Install PyWren-IBM if needed: try the import first and, if the module is
# missing, pip-install the pinned 1.0.10 release with this kernel's own
# interpreter before importing again.
try:
    import pywren_ibm_cloud as pywren
except ModuleNotFoundError:    
    !{sys.executable} -m pip install -U pywren-ibm-cloud==1.0.10
    import pywren_ibm_cloud as pywren

# Display the version that was actually imported.
pywren.__version__

In [None]:
# Notebooks can run into "too many open files"; lift the per-process
# open-file soft limit up to the hard limit to avoid that.
import resource

nofile = resource.RLIMIT_NOFILE
soft, hard = resource.getrlimit(nofile)
print('Before:', soft, hard)

# Only the soft limit is changed here: raising the hard limit itself
# requires elevated (sudo) privileges.
resource.setrlimit(nofile, (hard, hard))
soft, hard = resource.getrlimit(nofile)
print('After:', soft, hard)

In [None]:
import io
import json
import os
import pickle
import sys
from collections import defaultdict
from datetime import datetime
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import coo_matrix

from cpyImagingMSpec import measure_of_chaos
from pyImagingMSpec.image_measures import isotope_image_correlation, isotope_pattern_match

In [None]:
import logging
# Configure the root logger so records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

## Configuration

In [None]:
# Load the service configuration. Later cells read config['ibm_cos']
# (api_key, endpoint) and config['storage'] (ds_bucket, db_bucket) from it.
# The context manager closes the file handle as soon as parsing is done.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)

In [None]:
# Choose which benchmark input to run; uncomment one of the alternatives
# below to switch to a larger dataset definition.
input_config_path = 'metabolomics/input_config_small.json'
# input_config_path = 'metabolomics/input_config_big.json'
# input_config_path = 'metabolomics/input_config_huge.json'

# Parse the input definition, closing the file handle right after reading.
with open(input_config_path, encoding='utf-8') as input_config_file:
    input_config = json.load(input_config_file)

# Split the definition into its dataset and molecular-database sections.
input_data = input_config['dataset']
input_db = input_config['molecular_db']

In [None]:
import ibm_boto3
from ibm_botocore.client import Config
from ibm_botocore.client import ClientError

# Create the IBM Cloud Object Storage client (S3-compatible API) using the
# API key and endpoint from the loaded config, with OAuth request signing.
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=config['ibm_cos']['api_key'],
    config=Config(signature_version='oauth'),
    endpoint_url=config['ibm_cos']['endpoint'],
)

# Benchmark

In [None]:
# Record when the benchmarked section begins (naive local wall-clock time);
# the final cell subtracts this from the finish time to get the duration.
start_time = datetime.now()
print('start', start_time)

### Upload dataset

In [None]:
import os
from annotation_pipeline_v2.utils import upload_to_cos

In [None]:
# Walk the dataset directory recursively and upload every file to the dataset
# bucket. The local path is also passed as the last argument
# (presumably the object key — confirm against upload_to_cos).
local_paths = (
    f'{folder}/{name}'
    for folder, _subfolders, names in os.walk(input_data['path'])
    for name in names
)
for local_path in local_paths:
    print(local_path)
    upload_to_cos(cos_client, local_path, config['storage']['ds_bucket'], local_path)

### Build molecular database

In [None]:
# NOTE(review): this imports from `annotation_pipeline`, while the other cells
# in this notebook import from `annotation_pipeline_v2` — confirm the package
# name is intended.
from annotation_pipeline.molecular_db import dump_mol_db, build_database, \
    calculate_centroids, get_formula_id_dfs, clean_formula_chunks

In [None]:
# Download commonly used mol DBs from METASPACE (add force=True to redownload if needed).
# Each entry pairs the target pickle key with the numeric argument passed to
# dump_mol_db (presumably the METASPACE database id — confirm).
mol_db_dumps = (
    ('metabolomics/db/mol_db1.pickle', 22),  # HMDB-v4
    ('metabolomics/db/mol_db2.pickle', 19),  # ChEBI-2018-01
    ('metabolomics/db/mol_db3.pickle', 24),  # LipidMaps-2017-12-12
    ('metabolomics/db/mol_db4.pickle', 26),  # SwissLipids-2018-02-02
)
for pickle_key, db_number in mol_db_dumps:
    dump_mol_db(config, config['storage']['db_bucket'], pickle_key, db_number)

In [None]:
# Build the molecular database described by input_db.
# NOTE(review): judging by the names, the two results are the number of
# formulas and the keys of the generated formula chunks — confirm against
# build_database.
num_formulas, formula_chunk_keys = build_database(config, input_db)

In [None]:
# Read the dataset's polarity and isocalc sigma, then compute centroids for
# the formula chunks produced by the database build.
polarity, isocalc_sigma = input_data['polarity'], input_data['isocalc_sigma']
centroids_shape, centroids_head = calculate_centroids(
    config,
    input_db,
    formula_chunk_keys,
    polarity,
    isocalc_sigma,
)

In [None]:
# Clean up the intermediate formula chunks after centroid calculation
# (presumably removes them from storage — confirm against clean_formula_chunks).
clean_formula_chunks(config, input_db, formula_chunk_keys)

### Run Annotation Pipeline

In [None]:
# Fetch the centroids object from the database bucket and store a local copy
# under the same relative path as its object key.
centroids_object = cos_client.get_object(
    Bucket=config['storage']['db_bucket'],
    Key=input_db['centroids_pandas'],
)
with open(input_db['centroids_pandas'], 'wb') as local_copy:
    body_stream = centroids_object['Body']
    local_copy.write(body_stream.read())

In [None]:
from annotation_pipeline_v2.pipeline import Pipeline

In [None]:
# Create the annotation pipeline from the service config and the
# dataset/molecular-DB input config loaded earlier.
pipeline = Pipeline(config, input_config)

In [None]:
# Run the pipeline one stage per cell; %time reports how long each call takes.
%time pipeline.load_ds()

In [None]:
%time pipeline.segment_ds()

In [None]:
%time pipeline.segment_centroids()

In [None]:
%time pipeline.annotate()

In [None]:
%time pipeline.run_fdr()

In [None]:
# Keep the pipeline results in results_df for later inspection.
%time results_df = pipeline.get_results()

In [None]:
# Keep the pipeline images in images_dict for later inspection.
%time images_dict = pipeline.get_images()

In [None]:
# Close the benchmark window and report start, finish and elapsed wall-clock time.
finish_time = datetime.now()
timing_report = (
    ('start', start_time),
    ('finish', finish_time),
    ('duration', finish_time - start_time),
)
for label, value in timing_report:
    print(label, value)