# Initial requirements

This notebook requires IBM Cloud Object Storage and IBM Cloud Functions.
Please create both services from the IBM Cloud dashboard before running the notebook.


In [None]:
# Show the Python interpreter binary and the installation prefix used by this kernel
import sys
(sys.executable, sys.prefix)

In [None]:
# Install PyWren-IBM only when it cannot already be imported in this kernel
try:
    import pywren_ibm_cloud as pywren
except ModuleNotFoundError:    
    # Run pip through the kernel's own interpreter (`sys` is imported in the cell above)
    # and pin the release to 1.0.8, then retry the import.
    !{sys.executable} -m pip install -U pywren-ibm-cloud==1.0.8
    import pywren_ibm_cloud as pywren

# Display the version of the package that was imported
pywren.__version__

In [None]:
# We need this to overcome the notebook limitation of too many open files:
# raise the soft limit on open file descriptors for this process where possible.
import resource

TARGET_SOFT_LIMIT = 10000

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('Before:', soft, hard)

# Only the soft limit is raised. Hard limits can be raised only by sudo users.
# setrlimit() rejects a soft limit above the hard limit, so clamp the target to
# the hard limit unless the hard limit is unlimited (RLIM_INFINITY).
if hard == resource.RLIM_INFINITY:
    new_soft = TARGET_SOFT_LIMIT
else:
    new_soft = min(TARGET_SOFT_LIMIT, hard)

# Never lower a soft limit that is already unlimited or at/above the target.
if soft != resource.RLIM_INFINITY and soft < new_soft:
    resource.setrlimit(resource.RLIMIT_NOFILE, (new_soft, hard))

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print('After:', soft, hard)

In [None]:
# Turn off Jedi-based tab completion in the IPython completer
%config Completer.use_jedi = False
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from matplotlib import pyplot as plt
from scipy.sparse import coo_matrix
from collections import defaultdict
from pyImagingMSpec.image_measures import isotope_image_correlation, isotope_pattern_match
from cpyImagingMSpec import measure_of_chaos
from itertools import chain
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
import sys
import io

In [None]:
import logging
# Request DEBUG-level (most verbose) output from the root logger
logging.basicConfig(level=logging.DEBUG)

### IBM COS Setup

Copy the file `config.json.template` to `config.json` and fill in the missing values for API keys, buckets and endpoints per these instructions:

Set up a bucket in IBM Cloud Object Storage

You need an IBM COS bucket which you will use to store the input data. If you don't know the names of your existing buckets or would like to create a new one, please navigate to your cloud resource list, then find and select your storage instance. From there, you will be able to view all your buckets and can create a new bucket in the region you prefer. Make sure you copy the correct endpoint for the bucket from the Endpoint tab of this COS service dashboard. Note: bucket names must be unique.

Obtain the API key and endpoint for the IBM Cloud Functions service. Navigate to Getting Started > API Key from the side menu and copy the values for "Current Namespace", "Host" and "Key" into the config below. Make sure to add "https://" to the host when adding it as the endpoint.

In [None]:
import json

# Load the settings filled in from config.json.template.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving an open descriptor behind in the kernel.
with open('config.json') as config_file:
    config = json.load(config_file)

### Input Files Setup

Copy the file `input_config.json.template` to `input_config.json` and fill in the missing values for buckets.

In [None]:
# Load the input locations filled in from input_config.json.template.
# The context manager closes the file handle as soon as parsing finishes.
with open('input_config.json') as input_config_file:
    input_config = json.load(input_config_file)

# Sections used by the rest of the notebook: the dataset and the molecular database
input_data = input_config['dataset']
input_db = input_config['molecular_db']

# Upload test data into COS bucket

In [None]:
import ibm_boto3
from ibm_botocore.client import Config
from ibm_botocore.client import ClientError

In [None]:
# Build an S3-API client for IBM COS from the 'ibm_cos' section of config.json,
# authenticating with the API key and the 'oauth' signature version.
# The ibm_auth_endpoint argument below is intentionally left disabled.
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config['ibm_cos']['api_key'],
#                               ibm_auth_endpoint=config['ibm_cos']['auth_endpoint'],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config['ibm_cos']['endpoint'])

In [None]:
def copy(src, target_bucket, target_key):
    """Upload the local file `src` to `target_bucket` under the key `target_key`.

    Relies on the notebook-level `cos_client` and prints one message before
    and one after the upload.
    """
    destination = (target_bucket, target_key)
    print('Copying from {} to {}/{}'.format(src, *destination))

    # Stream the file in binary mode straight into the object body
    with open(src, "rb") as src_file:
        cos_client.put_object(Body=src_file, Bucket=target_bucket, Key=target_key)

    print('Copy completed for {}/{}'.format(*destination))

In [None]:
import os

# Upload every file found under ./metabolomics to the dataset bucket,
# using the local relative path as the object key.
for dirpath, _subdirs, filenames in os.walk('./metabolomics'):
    for filename in filenames:
        local_path = f'{dirpath}/{filename}'
        copy(local_path, input_data['bucket'], local_path)

# Read Dataset Spectra

In [None]:
from annotation_pipeline.dataset import read_dataset_spectra, read_dataset_coords, real_pixel_indices

In [None]:
spectra = read_dataset_spectra(config, input_data)

In [None]:
len(spectra)

In [None]:
# Unpack the first spectrum record into its three fields and display the second one.
# NOTE(review): names suggest (spectrum index, m/z values, intensities) — confirm in read_dataset_spectra.
sp_i, mzs, ints = spectra[0]
mzs

In [None]:
ints

In [None]:
spectra_coords = read_dataset_coords(config, input_data)

In [None]:
len(spectra_coords)

In [None]:
spectra_coords[:5]

In [None]:
pixel_indices, nrows, ncols = real_pixel_indices(spectra_coords)

In [None]:
pixel_indices

In [None]:
nrows, ncols

# Read Molecular Database and Store

In [None]:
from annotation_pipeline.molecular_db import process_formulas_database, store_centroids_database

In [None]:
formulas_shape, formulas_head = process_formulas_database(config, input_db)

In [None]:
formulas_shape

In [None]:
formulas_head

In [None]:
%%time
centroids_shape, centroids_head = store_centroids_database(config, input_db)

In [None]:
centroids_shape

In [None]:
centroids_head

# Split Dataset into Segments

In [None]:
from annotation_pipeline.dataset_segmentation import generate_segm_intervals, split_spectra_into_segments

In [None]:
# Presumably the number of segments the dataset is split into (TODO confirm); this value is
# passed to generate_segm_intervals, split_spectra_into_segments and annotate_spectra below.
segm_n = 256

In [None]:
segm_intervals = generate_segm_intervals(config, input_db, segm_n)

In [None]:
segm_intervals[:5]

In [None]:
split_spectra_into_segments(config, input_data, segm_n, segm_intervals)

In [None]:
# Show up to three objects stored under the segments prefix of the dataset bucket;
# falls back to an empty list when the response has no 'Contents' key.
cos_client.list_objects_v2(Bucket=input_data['bucket'], Prefix=input_data['segments']).get('Contents', [])[:3]

# Annotation Pipeline Applied to each Segment in Parallel

In [None]:
from annotation_pipeline.annotation import annotate_spectra

In [None]:
%%time
# Run the annotation step (per the section heading, applied to each segment in parallel).
# %%time must stay on the first line of the cell; it reports how long the cell takes.
results = annotate_spectra(config, input_data, input_db, segm_n, pixel_indices, nrows, ncols)

In [None]:
len(results)

# Get Results

In [None]:
from annotation_pipeline.annotation import merge_annotation_results
# Merge the annotation results into a scores object and a collection of images.
# NOTE(review): the `_df` suffix suggests a pandas DataFrame — confirm in merge_annotation_results.
formula_scores_df, formula_images = merge_annotation_results(results)

In [None]:
# Pick one image by a hard-coded key and plot it.
# NOTE(review): 896952 is presumably a formula id present in this dataset's results, and the
# [0][1] indexing depends on the structure returned by merge_annotation_results — confirm both;
# other inputs may not contain this key.
img = formula_images[896952][0][1]
# `.toarray()` presumably converts a scipy sparse matrix into a dense array for imshow — TODO confirm
plt.imshow(img.toarray())

## Clean Segments Datasets

In [None]:
from annotation_pipeline.dataset_segmentation import clean_segments

In [None]:
%%time
clean_segments(config, input_data)