# Experiment 2: Interactive reprocessing
This experiment is representative of a new type of functionality that METASPACE does not currently
support, because it is uneconomical with the serverful approach. When looking for specific compounds,
scientists tend to have relatively short lists of molecules of interest, and they iteratively try
different adducts or modifiers until they find the data they’re interested in.

### METRICS TO BENCHMARK
* Performance:
    * **Metric:** Total processing time
    
        **Goal:** Fast enough to use interactively in a notebook - less than ~60 seconds

* Cost:
    * **Metric:** Total cost
    
        **Goal:** Significantly less than a full annotation - determined by experiment 1

# Notebook setup
Run `python3 setup.py install` to install all requirements for the annotation pipeline project.

In [None]:
# We need this to overcome Python notebooks limitations of too many open files
import resource
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('Before:', soft, hard)

# Raising the soft limit. Hard limits can be raised only by sudo users, so the
# requested value is capped at the hard limit (asking for more raises ValueError).
# RLIM_INFINITY is compared explicitly because its numeric value is platform-specific.
target_soft = 10000 if hard == resource.RLIM_INFINITY else min(10000, hard)
# Only ever raise the limit: leave an unlimited or already-higher soft limit untouched
if soft != resource.RLIM_INFINITY and soft < target_soft:
    resource.setrlimit(resource.RLIMIT_NOFILE, (target_soft, hard))
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('After:', soft, hard)

In [None]:
# Turn off Jedi-based tab completion in IPython
%config Completer.use_jedi = False
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load the autoreload extension and use mode 2, which reloads imported modules before
# each cell is executed, so edits to the project source are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [None]:
# If pywren_ibm_cloud isn't installed, please run `pip install -e .` in this directory to install all dependencies
import pywren_ibm_cloud as pywren

# Display the installed pywren_ibm_cloud version as the cell output
pywren.__version__

In [None]:
import logging
# Configure the root logger so that messages at INFO level and above are emitted
logging.basicConfig(level=logging.INFO)

In [None]:
# Apply a process-wide default socket timeout (in seconds), so that a CF request
# which never receives a response fails instead of blocking forever
import socket

previous_timeout = socket.getdefaulttimeout()
print('Previous timeout:', previous_timeout)
socket.setdefaulttimeout(60)

## Configuration

In [None]:
import json

# Load the project configuration (e.g. the `storage` bucket names used later in this notebook).
# The context manager closes the file handle as soon as the JSON has been parsed.
with open('config.json') as config_file:
    config = json.load(config_file)

In [None]:
# Select the input configuration by leaving exactly one of the paths below uncommented
#input_config_path = 'metabolomics/input_config_small.json'
input_config_path = 'metabolomics/input_config_big.json'
#input_config_path = 'metabolomics/input_config_huge.json'

# The context manager closes the file handle as soon as the JSON has been parsed
with open(input_config_path) as input_config_file:
    input_config = json.load(input_config_file)
input_data = input_config['dataset']
input_db = input_config['molecular_db']

# Override databases, because this experiment expects a small database
exp_db_path = 'metabolomics/db/mol_db5.pickle'
input_db['databases'] = [exp_db_path]

In [None]:
# Check input dataset is present
from annotation_pipeline.utils import ds_imzml_path
import sys

dataset_path = input_config['dataset']['path']
try:
    # Best-effort check: an exception or a falsy result both count as "not found".
    # An explicit truthiness test is used instead of `assert`, which is stripped under `python -O`.
    imzml_found = bool(ds_imzml_path(dataset_path))
except Exception:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt/SystemExit still propagate
    imzml_found = False

if not imzml_found:
    print(f"No imzML file was found in {dataset_path}. "
           "Please follow the instructions in README.md to download and extract the dataset required by this input_config.json file.",
          file=sys.stderr)

# Initial setup (not included in benchmark timings)

In [None]:
from annotation_pipeline.molecular_db import build_database, calculate_centroids, get_formula_id_dfs
from annotation_pipeline.pipeline import Pipeline
from datetime import datetime
import pandas as pd
import pickle

### Load & segment dataset

In [None]:
# Build the pipeline from the project config and the selected input config, then run the
# dataset stages in order: load, split, segment. This cell runs before `start_time` is taken,
# so it is part of the initial setup and not of the benchmark timings.
pipeline = Pipeline(config, input_config)
pipeline.load_ds()
pipeline.split_ds()
pipeline.segment_ds()

# Benchmark

In [None]:
from annotation_pipeline.utils import get_ibm_cos_client
# Storage client built from the project config; the benchmark cell uses its `put_object`
# method to upload the molecule list
cos_client = get_ibm_cos_client(config)

In [None]:
from annotation_pipeline.utils import init_pywren_stats
# NOTE(review): presumably starts/resets the PyWren statistics that `get_pywren_stats()` displays
# after the benchmark - confirm in annotation_pipeline.utils
init_pywren_stats()

### Process new molecules and Run Annotation

In [None]:
# Benchmark timer starts here: everything up to `finish_time` below counts towards the reported duration
start_time = datetime.now()

# Process new molecules:
## Upload list of molecules (in a real scenario this list would change every iteration, so this isn't part of setup)
mols = pd.read_csv('metabolomics/db/mol_db5.csv')
# De-duplicate and sort the values of the CSV's `sf` column
# (presumably sum formulas - confirm against the CSV schema)
mols_list = sorted(set(mols.sf.values.tolist()))
# Pickle the list and upload it to the database bucket under `exp_db_path`,
# the key that `input_db['databases']` was overridden to point at
cos_client.put_object(Bucket=config['storage']['db_bucket'], Key=exp_db_path, Body=pickle.dumps(mols_list))
# NOTE(review): presumably builds the formula database from the uploaded list - confirm in annotation_pipeline.molecular_db
build_database(config, input_db)
# Dataset parameters read from the input config and passed on to the centroid calculation
polarity = input_data['polarity']
isocalc_sigma = input_data['isocalc_sigma']
calculate_centroids(config, input_db, polarity, isocalc_sigma)
pipeline.segment_centroids()

# Run Annotation:
pipeline.annotate()
# Keep a reference to the annotation metrics so a later cell can display them
results_df = pipeline.formula_metrics_df

# Benchmark timer stops here
finish_time = datetime.now()

In [None]:
# Report when the benchmark started and finished, and the elapsed wall-clock time between the two
timing_report = (
    ('start', start_time),
    ('finish', finish_time),
    ('duration', finish_time - start_time),
)
for label, value in timing_report:
    print(label, value)

In [None]:
# Display PyWren statistics file
# The call is the last expression in the cell, so whatever it returns is rendered as the cell output
from annotation_pipeline.utils import get_pywren_stats
get_pywren_stats()

In [None]:
from annotation_pipeline.utils import remove_pywren_stats
# NOTE(review): presumably deletes the PyWren statistics file displayed in the previous cell,
# so a later run starts clean - confirm in annotation_pipeline.utils
remove_pywren_stats()

In [None]:
# Display results
# Print the shape of the annotation results, then show the first rows as the cell output
print(results_df.shape)
results_df.head()

# Clean Temp Data

In [None]:
from annotation_pipeline.utils import clean_from_cos

# Clean the "metabolomics/tmp" prefix in the dataset bucket first, then in the database bucket
for bucket_key in ("ds_bucket", "db_bucket"):
    clean_from_cos(config, config["storage"][bucket_key], "metabolomics/tmp")