# Experiment 2: Interactive reprocessing
This is representative of a new type of functionality that we currently don’t support in METASPACE
because it’s uneconomical with the serverful approach. While looking for specific compounds,
scientists tend to have relatively short lists of molecules of interest, and iteratively try
different adducts or modifiers until they find the data they’re interested in.

### METRICS TO BENCHMARK
* Performance:
    * **Metric:** Total processing time
    
        **Goal:** Fast enough to use interactively in a notebook - less than ~60 seconds

* Cost:
    * **Metric:** Total cost
    
        **Goal:** Significantly less than a full annotation - determined by experiment 1

# Notebook setup

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the pipeline source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Turn off Jedi-based tab completion and render matplotlib figures inline
# in the notebook output.
%config Completer.use_jedi = False
%matplotlib inline

In [None]:
# Show the Python interpreter and installation prefix this kernel is using;
# the interpreter path is reused below when installing packages.
import sys
sys.executable, sys.prefix

In [None]:
# Install PyWren-IBM if needed: try the import first and, only when the
# module is missing, pip-install the pinned release with this kernel's own
# interpreter before importing again.
try:
    import pywren_ibm_cloud as pywren
except ModuleNotFoundError:    
    !{sys.executable} -m pip install -U pywren-ibm-cloud==1.0.10
    import pywren_ibm_cloud as pywren

# Display the version that was actually imported.
pywren.__version__

In [None]:
# Work around the notebook running into the per-process limit on open files
# by lifting the soft RLIMIT_NOFILE limit up to the current hard limit.
import resource
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('Before:', soft, hard)

# Only the soft limit is raised. Hard limits can be raised only by sudo users.
try:
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
except (ValueError, OSError) as exc:
    # setrlimit raises when the OS rejects the requested value. Failing to
    # raise the limit should not abort notebook setup, so report the problem
    # and carry on with the limits that are already in place.
    print('Could not raise the open-file limit:', exc)

# Re-read the limits so the printed values reflect what is really in effect.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('After:', soft, hard)

In [None]:
import io
import json
import os
import pickle
import sys
from collections import defaultdict
from datetime import datetime
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
from cpyImagingMSpec import measure_of_chaos
from matplotlib import pyplot as plt
from pyImagingMSpec.image_measures import isotope_image_correlation, isotope_pattern_match
from scipy.sparse import coo_matrix

In [None]:
# Configure the root logger so INFO-level (and higher) records are shown.
import logging
logging.basicConfig(level=logging.INFO)

## Configuration

In [None]:
# Load the shared configuration. Later cells read the IBM COS credentials
# and endpoint (config['ibm_cos']) and the bucket names (config['storage'])
# from it. The file is opened in a `with` block so the handle is closed as
# soon as parsing finishes.
with open('config.json') as config_file:
    config = json.load(config_file)

In [None]:
# Choose which dataset description to benchmark by leaving exactly one of
# the paths below uncommented.
# input_config_path = 'metabolomics/input_config_small.json'
input_config_path = 'metabolomics/input_config_big.json'
# input_config_path = 'metabolomics/input_config_huge.json'

# Use a `with` block so the file handle is closed once the JSON is parsed.
with open(input_config_path) as input_config_file:
    input_config = json.load(input_config_file)
input_data = input_config['dataset']
input_db = input_config['molecular_db']

# Override databases, because this experiment expects a small database
exp_db_path = 'metabolomics/db/mol_db5.pickle'
input_db['databases'] = [exp_db_path]

In [None]:
# Create the IBM Cloud Object Storage client used by the cells below, taking
# the API key and endpoint from the 'ibm_cos' section of the configuration.
import ibm_boto3
from ibm_botocore.client import Config, ClientError

cos_settings = config['ibm_cos']
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=cos_settings['api_key'],
    config=Config(signature_version='oauth'),
    endpoint_url=cos_settings['endpoint'],
)

# Initial setup (not included in benchmark timings)

### Upload test data into COS bucket

In [None]:
from annotation_pipeline_v2.utils import upload_to_cos

# Walk the dataset directory and hand every file to upload_to_cos, targeting
# the dataset bucket. The same path string is passed twice - presumably as
# the local source and as the object key (confirm against upload_to_cos).
for root, _subdirs, filenames in os.walk(input_data['path']):
    local_paths = [f'{root}/{name}' for name in filenames]
    for f_path in local_paths:
        print(f_path)
        upload_to_cos(cos_client, f_path, config['storage']['ds_bucket'], f_path)

### Load & segment dataset

In [None]:
# Build the pipeline from both configuration objects, then load and segment
# the dataset. This is one-off setup, so it runs before the benchmark timer
# is started.
from annotation_pipeline_v2.pipeline import Pipeline
pipeline = Pipeline(config, input_config)
pipeline.load_ds()
pipeline.segment_ds()

# Benchmark

In [None]:
# Record the wall-clock start of the benchmark; the final cell subtracts it
# from finish_time to report the total duration.
start_time = datetime.now()
print('start', start_time)

### Process new molecules

In [None]:
# Helpers for building the formula database and its centroids.
# NOTE(review): these come from `annotation_pipeline`, whereas the other cells
# import from `annotation_pipeline_v2` - confirm that mixing the two packages
# is intentional.
from annotation_pipeline.molecular_db import build_database, calculate_centroids, get_formula_id_dfs, clean_formula_chunks

In [None]:
# The molecule list is uploaded inside the timed section on purpose: in a
# real interactive session it would change on every iteration, so it is not
# part of the one-off setup.
mols = pd.read_csv('metabolomics/db/mol_db5.csv')

# De-duplicate the 'sf' column and sort it before pickling.
unique_formulas = set(mols['sf'].values.tolist())
mols_list = sorted(unique_formulas)

cos_client.put_object(
    Bucket=config['storage']['db_bucket'],
    Key=exp_db_path,
    Body=pickle.dumps(mols_list),
)

In [None]:
# Build the formula database for the configured molecule lists. The call
# returns a formula count and the keys of the generated formula chunks
# (presumably object keys in COS - confirm against build_database).
num_formulas, formula_chunk_keys = build_database(config, input_db)

In [None]:
# Compute centroids for the formula chunks, using the polarity and isocalc
# sigma settings taken from the dataset section of the input configuration.
polarity = input_data['polarity']
isocalc_sigma = input_data['isocalc_sigma']
centroids_shape, centroids_head = calculate_centroids(config, input_db, formula_chunk_keys, polarity, isocalc_sigma)

In [None]:
# Copy centroids.pickle from the database bucket to the same relative path on
# the local machine. (Can be removed when pipeline can run directly from COS)
db_bucket = config['storage']['db_bucket']
centroids_key = input_db['centroids_pandas']
resp = cos_client.get_object(Bucket=db_bucket, Key=centroids_key)
with open(centroids_key, 'wb') as f:
    payload = resp['Body'].read()
    f.write(payload)

In [None]:
# Clean up the formula chunks now that the centroids have been calculated
# (presumably deletes the intermediate chunk objects - confirm against
# clean_formula_chunks).
clean_formula_chunks(config, input_db, formula_chunk_keys)

In [None]:
# Segment the centroids and report how long this step takes.
%time pipeline.segment_centroids()

### Run Annotation

In [None]:
# Run the annotation step and report how long it takes.
%time pipeline.annotate()

In [None]:
# Fetch the formula metrics and time the access (presumably the attribute
# does non-trivial work when read - confirm in Pipeline). results_df is
# displayed in the next cell.
%time results_df = pipeline.formula_metrics_df

In [None]:
# Display results: print the shape of the metrics table, then show its
# first rows as the cell output.
print(results_df.shape)
results_df.head()

In [None]:
# Stop the benchmark clock and report the start time, the finish time, and
# the elapsed wall-clock duration since start_time was recorded.
finish_time = datetime.now()
print('start', start_time)
print('finish', finish_time)
print('duration', finish_time - start_time)