# Extract data from MassBank to MongoDB

For more information about MassBank, see http://www.massbank.jp

In [29]:
from __future__ import print_function
from __future__ import division
import os
import json
import codecs
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from rdkit import Chem
from IPython.html.widgets import FloatText
from IPython.display import display

In [30]:
# MongoDB server to connect to
MONGODB_ADDRESS = 'localhost'
MONGODB_PORT = 27017

# Database and collection that receive the parsed records
DB_NAME = 'mass-bank'
COLLECTION_NAME = 'spectrums'

# Root directory that is walked for MassBank record (.txt) files
REPOSITORY_PATH = '../../data/dl/record/'

# When true, the target collection is dropped before it is repopulated
RECREATE = True

## Setup MongoDB Client

In [31]:
# Connect to the MongoDB server configured above.
client = MongoClient(host=MONGODB_ADDRESS, port=MONGODB_PORT)

db = client[DB_NAME]

# Start from an empty collection when RECREATE is set; otherwise new
# records are added to whatever the collection already contains.
if RECREATE:
    db.drop_collection(COLLECTION_NAME)

collection = db[COLLECTION_NAME]

# ACCESSION is indexed as unique, so inserting a record whose accession is
# already present raises DuplicateKeyError.
collection.create_index('ACCESSION', unique=True)
# Secondary (non-unique) index on the InChIKey field added by parse_file.
collection.create_index('CH$LINK_INCHIKEY')

u'CH$LINK_INCHIKEY_1'

In [32]:
def mass_spectrometry_parser(info, value):
    """Store selected ``AC$MASS_SPECTROMETRY`` subtags as separate keys.

    ``value`` is the text following ``AC$MASS_SPECTROMETRY: `` and has the
    form ``"<SUBTAG> <subtag value>"``. Only the subtags ``MS_TYPE``,
    ``ION_MODE`` and ``COLLISION_ENERGY`` are kept; each is written to
    ``info`` under ``'AC$MASS_SPECTROMETRY_' + <SUBTAG>``. Any other subtag
    is ignored.

    Args:
        info: dict being filled for the current record; modified in place.
        value: raw field value (surrounding whitespace is ignored).

    Returns:
        None.
    """
    subtag, _, subtag_value = value.strip().partition(' ')
    # Drop padding between the subtag and its value.
    subtag_value = subtag_value.strip()

    # A subtag with no value carries no information; skip it instead of
    # failing on the missing second token.
    if subtag in ('MS_TYPE', 'ION_MODE', 'COLLISION_ENERGY') and subtag_value:
        info['AC$MASS_SPECTROMETRY_' + subtag] = subtag_value

# Parsing rules, keyed by MassBank record tag. Tags that are not listed here
# are ignored by parse_file.
#
# Each rule is a 4-tuple read by position:
#   [0]  not read by the code in this notebook
#        (NOTE(review): presumably 'M' = mandatory -- confirm)
#   [1]  'I' -> the tag may repeat and its values are collected in a list
#        (see execute_rule); any other value stores a single value
#        (NOTE(review): 'U' presumably stands for unique -- confirm)
#   [2]  'M' -> the value continues on following lines that start with two
#        spaces (see parse_file); any other value is a single-line field
#        (NOTE(review): 'S' presumably stands for single line -- confirm)
#   [3]  optional callable(info, value) that stores the value itself instead
#        of the default info[key] = value.strip(); not used for 'I' rules
rules = {
    'ACCESSION': ('M', 'U', 'S', None),
    'RECORD_TITLE': ('M', 'U', 'S', None),
    'DATE': ('M', 'U', 'S', None),
    'AUTHORS': ('M', 'U', 'S', None),
    'LICENSE': ('M', 'U', 'S', None),
    'CH$NAME': ('M', 'I', 'S', None),
    'CH$COMPOUND_CLASS': ('M', 'U', 'S', None),
    'CH$FORMULA': ('M', 'U', 'S', None),
    'CH$EXACT_MASS': ('M', 'U', 'S', None),
    'CH$SMILES': ('M', 'U', 'S', None),
    'CH$IUPAC': ('M', 'U', 'S', None),
    'AC$INSTRUMENT': ('M', 'U', 'S', None),
    'AC$INSTRUMENT_TYPE': ('M', 'U', 'S', None),
    'AC$MASS_SPECTROMETRY': ('M', 'U', 'S', mass_spectrometry_parser),
    'PK$NUM_PEAK': ('M', 'U', 'S', None),
    'PK$PEAK': ('M', 'U', 'M', None),
}

In [33]:
def execute_rule(rule, info, key, value):
    """Apply one parsing rule to a raw field value and record it in ``info``.

    Args:
        rule: 4-tuple rule; ``rule[1] == 'I'`` marks a repeatable field and
            ``rule[3]`` is an optional custom parser ``callable(info, value)``.
        info: dict being filled for the current record; modified in place.
        key: record tag the value belongs to.
        value: raw, unstripped field value.

    Returns:
        None.
    """
    if rule[1] == 'I':
        # Repeatable field: collect every occurrence in a list.
        # Custom parsers are not supported for iterative values.
        # setdefault works on both Python 2 and 3 (dict.has_key is Python 2 only).
        info.setdefault(key, []).append(value.strip())
        return

    custom_parser = rule[3]
    if custom_parser:
        # The parser receives the raw value and decides what to store.
        custom_parser(info, value)
    else:
        info[key] = value.strip()

In [34]:
def parse_file(file_path):
    """Parse one MassBank record file into a dict of the fields in ``rules``.

    The file is read line by line. A line of the form ``TAG: value`` whose
    tag has an entry in ``rules`` is stored through ``execute_rule``. For a
    multi-line rule (``rule[2] == 'M'``) the value is extended with every
    following line that starts with two spaces, and is stored once the next
    non-indented line (or the end of the file) is reached. Parsing stops at
    the first line starting with ``//``.

    After parsing, ``CH$LINK_INCHIKEY`` is added, derived from ``CH$IUPAC``,
    unless that field is ``'N/A'``.

    Args:
        file_path: path of the record file to read.

    Returns:
        dict mapping field names to stripped strings (or to a list of
        strings for repeatable fields).

    Raises:
        KeyError: if the record has no ``CH$IUPAC`` field.
    """
    info = {}

    # State of the multi-line field currently being accumulated, if any.
    pending_key = None
    pending_rule = None
    pending_parts = []

    with codecs.open(file_path, 'r') as fh:
        # Iterate lazily instead of loading every line up front.
        for line in fh:
            is_continuation = line.startswith('  ')

            if pending_rule is None:
                # Indented line outside a multi-line field: not a tag line.
                if is_continuation:
                    continue
            elif is_continuation:
                pending_parts.append(line.lstrip())
                continue
            else:
                # First non-indented line ends the multi-line field: commit
                # it, then handle the current line normally.
                execute_rule(pending_rule, info, pending_key,
                             ''.join(pending_parts))
                pending_key = None
                pending_rule = None
                pending_parts = []

            if line.startswith('//'):
                break

            pieces = line.split(': ', 1)
            if len(pieces) != 2:
                continue

            key, value = pieces
            rule = rules.get(key)
            if not rule:
                continue

            if rule[2] == 'M':
                # Multi-line field: start accumulating with the remainder
                # of the tag line.
                pending_key = key
                pending_rule = rule
                pending_parts.append(value)
            else:
                execute_rule(rule, info, key, value)

    # A record without a closing '//' can end while a multi-line field is
    # still open; commit it so the value is not lost.
    if pending_rule is not None:
        execute_rule(pending_rule, info, pending_key, ''.join(pending_parts))

    # Add InChIKey
    # NOTE(review): InchiToInchiKey may return None for an InChI it cannot
    # process, which would make the concatenation fail -- confirm against RDKit.
    if info['CH$IUPAC'] != 'N/A':
        info['CH$LINK_INCHIKEY'] = 'InChIKey=' + Chem.InchiToInchiKey(info['CH$IUPAC'])

    return info

## Get all txt files from the MassBank repository

In [35]:
# Full paths of every .txt file found below REPOSITORY_PATH
all_files = []

for dirpath, subdirs, filenames in os.walk(REPOSITORY_PATH):
    # Prune the hidden SVN directory in place so os.walk does not descend into it
    subdirs[:] = [d for d in subdirs if d != '.svn']

    all_files.extend(
        os.path.join(dirpath, name)
        for name in filenames
        if name.endswith('.txt')
    )

In [36]:
# Preview the first parsed record, if any file was found
if all_files:
    sample = parse_file(all_files[0])
    print(json.dumps(sample, sort_keys=True, indent=4))

{
    "AC$INSTRUMENT": "Bruker maXis Impact", 
    "AC$INSTRUMENT_TYPE": "LC-ESI-QTOF", 
    "AC$MASS_SPECTROMETRY_COLLISION_ENERGY": "Ramp 21.1-31.6 eV", 
    "AC$MASS_SPECTROMETRY_ION_MODE": "POSITIVE", 
    "AC$MASS_SPECTROMETRY_MS_TYPE": "MS2", 
    "ACCESSION": "AU100601", 
    "AUTHORS": "Nikiforos Alygizakis, Anna Bletsou, Nikolaos Thomaidis, University of Athens", 
    "CH$COMPOUND_CLASS": "N/A; Environmental Standard", 
    "CH$EXACT_MASS": "284.0135", 
    "CH$FORMULA": "C10H9ClN4O2S", 
    "CH$IUPAC": "InChI=1S/C10H9ClN4O2S/c11-9-5-13-6-10(14-9)15-18(16,17)8-3-1-7(12)2-4-8/h1-6H,12H2,(H,14,15)", 
    "CH$LINK_INCHIKEY": "InChIKey=QKLPUVXBJHRFQZ-UHFFFAOYSA-N", 
    "CH$NAME": [
        "Sulfaclozine", 
        "4-amino-N-(6-chloropyrazin-2-yl)benzenesulfonamide"
    ], 
    "CH$SMILES": "c1cc(ccc1N)S(=O)(=O)Nc2cncc(n2)Cl", 
    "DATE": "2015.07.05", 
    "LICENSE": "CC BY-SA", 
    "PK$NUM_PEAK": "27", 
    "PK$PEAK": "m/z int. rel.int.\r\n53.0389 592 5\r\n54.0333 564 5\r\n55

## Parse all files and add MS2 spectra to the collection

In [37]:
# Progress indicator: shows the percentage of files processed so far
progress = FloatText()
display(progress)

num_files = len(all_files)

for i, f in enumerate(all_files):
    try:
        info = parse_file(f)

        # Report progress
        progress.value = 100.0 * ((i+1) / num_files)

        # Only MS2 spectra are stored
        if info['AC$MASS_SPECTROMETRY_MS_TYPE'] != 'MS2':
            continue
    except Exception:
        # Name the offending file, then abort the import.
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # are not reported as parse failures.
        print("Could not parse", f)
        raise

    try:
        collection.insert_one(info)
    except DuplicateKeyError:
        # Deliberately ignored: a record with this ACCESSION is already stored.
        pass
    except Exception:
        # Name the file whose insert failed before propagating the error.
        print(f)
        raise