# Enriching datasets
Author: Michal Stary

This notebook provides a semi-automated way to enrich existing GC-MS datasets, annotated with InChIKeys, with chemical classes from the ClassyFire tool.

## Obtain table of InChIKeys

In [1]:
# Directory on the shared storage that holds the spectral libraries
DATA_DIR = "/storage/brno1-cerit/projects/msml/data"

# path to datasets
mona_path = f"{DATA_DIR}/MoNA-export-GC-MS_Spectra.msp"
# NIST is read from a relative path; the shared-storage location is kept for reference
#nist_path = "/storage/brno1-cerit/projects/msml/data/NIST_EI_MS.msp"
nist_path = "data/20210925_NIST_EI_MS_cleaned.msp"
# path to extra datasets
rcx1_path = f"{DATA_DIR}/recetox_gc-ei_ms_20201028.msp"
rcx2_path = f"{DATA_DIR}/rcx_gc-orbitrap_metabolites_20210817.msp"


In [2]:
# %pip install --upgrade matchms
#from rdkit.Chem.inchi import InchiToInchiKey

import numpy as np

from matchms.importing import load_from_msp
from matchms.exporting import save_as_msp as save_as_msp_orig

# REDEFINE save_as_msp function to avoid problems with nested comments
def save_as_msp(ds, path):
    """Write spectra to an MSP file after renaming 'comments' to 'comment'.

    Every record in ``ds`` that carries a ``comments`` metadata entry gets its
    value copied to ``comment`` and the ``comments`` entry removed; the records
    are modified in place. The (modified) collection is then handed to the
    original matchms exporter.

    Args:
        ds: iterable of spectrum records exposing ``metadata``, ``get`` and ``set``.
        path: destination file passed through to the original exporter.
    """
    for spectrum in ds:
        if "comments" not in spectrum.metadata:
            continue
        spectrum.set("comment", spectrum.get("comments"))
        # removed from the private store directly, as the original code did
        spectrum._metadata.pop("comments")
    save_as_msp_orig(ds, path)

In [3]:
# Materialise each library as a list so it can be iterated more than once
nist = list(load_from_msp(nist_path))
mona = list(load_from_msp(mona_path))
rcx1 = list(load_from_msp(rcx1_path))
rcx2 = list(load_from_msp(rcx2_path))


In [4]:
# Display the metadata of the first NIST record to see which fields are available
nist[0].metadata

{'name': 'Hydrogen',
 'formula': 'H2',
 'mw': '2',
 'casno': '1333-74-0',
 'id': '1',
 'comment': 'NIST MS# 245692, Seq# M1',
 'inchi': 'InChI=1S/H2/h1H',
 'inchikey': 'UFHFLCQGNIYNRP-UHFFFAOYSA-N',
 'iupac_name': 'molecular hydrogen',
 'smiles': '[HH]',
 'num peaks': '2'}

In [5]:
# get keys of unique records

def get_unique_keys(ds, unkey):
    """Collect the distinct values of one metadata field across a dataset.

    Records that lack the field, or whose value is the string "nan", are
    skipped; the number of skipped records is printed.

    Args:
        ds: sized collection of records exposing a ``metadata`` mapping.
        unkey: name of the metadata field used as the unique key.

    Returns:
        Set of the distinct key values found.
    """
    total = len(ds)
    kept = 0
    keys = set()
    for record in ds:
        meta = record.metadata
        if unkey in meta and meta[unkey] != "nan":
            kept += 1
            keys.add(meta[unkey])
    print(f"Dropped {total - kept} out of {total} records due to missing unique key")
    return keys
    
# Main libraries: also report how many distinct InChIKeys each one contains
mona_keys = get_unique_keys(mona, unkey="inchikey")
print(f"Unique keys {len(mona_keys)}")

nist_keys = get_unique_keys(nist, unkey="inchikey")
print(f"Unique keys {len(nist_keys)}")

# Extra libraries: only the dropped-record report is printed
rcx1_keys = get_unique_keys(rcx1, unkey="inchikey")
rcx2_keys = get_unique_keys(rcx2, unkey="inchikey")

Dropped 6 out of 18898 records due to missing unique key
Unique keys 9858
Dropped 60163 out of 350618 records due to missing unique key
Unique keys 246910
Dropped 0 out of 386 records due to missing unique key
Dropped 0 out of 265 records due to missing unique key


## Manual approach
Copy the keys below into https://cfb.fiehnlab.ucdavis.edu/ and download the resulting CSV.

Note that the API behind it can be heavily overloaded during US working days. It is recommended to use it during the US weekend (Monday morning CET is fine).

Displayed keys are those which are not already present in the inchi2class table. 

In [6]:
import pandas as pd
# CSV holding the key -> ClassyFire classes table that this notebook extends
MAIN_FILE = "/storage/brno1-cerit/projects/msml/inchi2class.csv"

# The first CSV column becomes the index; later cells look keys up in main_df.index
main_df = pd.read_csv(MAIN_FILE, index_col=0)


In [7]:
# NIST keys that are not yet in main_df, capped at 10000 per manual batch.
# The slice keeps the set's iteration order, as the previous enumerate loop did.
arr = list(set(nist_keys).difference(main_df.index))[:10000]

# Join once after collecting. `out` is now always defined, so an empty batch
# writes an empty file instead of raising NameError.
out = "\n".join(arr)
with open("keys_out.txt", "w") as foo:
    foo.write(out)

## Download & parse the CSV
Manually download the CSV and upload it to the server as TMP_FILE.

merge it with the main file for inchi -> classes mapping

# Fully automated solution
Adapted from https://gitlab.unige.ch/Pierre-Marie.Allard/pybatchclassyfire/-/tree/master/notebook

In [8]:
#%pip install requests_cache
from pybatchclassyfire import *
import pandas as pd
import os
import csv 
import time
import json
from pandas import json_normalize

DEBUG:requests_cache.backends:Initializing backend: None demo_cache_pybatch
DEBUG:requests_cache.backends.base:Initializing SQLitePickleDict with serializer: <requests_cache.serializers.pipeline.SerializerPipeline object at 0x14a09ef6de80>
DEBUG:requests_cache.backends.sqlite:Opening connection to /auto/brno6/home/xstary1/raims/demo_cache_pybatch.sqlite:responses
DEBUG:requests_cache.backends.base:Initializing SQLiteDict with serializer: <requests_cache.serializers.pipeline.SerializerPipeline object at 0x14a09ef6de80>
DEBUG:requests_cache.backends.sqlite:Opening connection to /auto/brno6/home/xstary1/raims/demo_cache_pybatch.sqlite:redirects


In [9]:
# NOTE(review): gnps_proxy, url and proxy_url are not referenced by name in the visible cells;
# presumably the pybatchclassyfire helpers read them — confirm
gnps_proxy = True 

url = "http://classyfire.wishartlab.com"
proxy_url =  "https://gnps-classyfire.ucsd.edu"
# number of InChIKeys handed to get_classifications_cf_mod per iteration
chunk_size = 400
# seconds passed to time.sleep after each chunk
sleep_interval = 0

In [11]:
%%time

total_df = pd.read_csv("total_df.csv", index_col=0)
total_df = total_df[pd.isnull(total_df.inchikey) == False]
ink_list = list(reversed(list(set(nist_keys).difference(main_df.index).difference(set([x.split("=")[1] for x in total_df.inchikey])))))

for i in range(len(ink_list)//chunk_size):
    print(f"chunk: {i}/{len(ink_list)//chunk_size}")
    get_classifications_cf_mod(ink_list[i*chunk_size: (i+1)*chunk_size], 8)
    
    cleanse('all_json.json', 'all_json.json')
    
    with open("all_json.json") as tweetfile:
        jsondic = json.loads(tweetfile.read())

    df = json_normalize(jsondic)
    total_df = total_df.append(df, ignore_index=True)
    time.sleep(sleep_interval)
    total_df.to_csv("total_df_.csv")
    total_df.to_csv("total_df.csv")
    print(f"done: {len(total_df)}")
    print()



chunk: 0/251
done: 164497

chunk: 1/251
done: 164511

chunk: 2/251
done: 164529

chunk: 3/251
done: 164547

chunk: 4/251
done: 164568

chunk: 5/251
done: 164591

chunk: 6/251
done: 164607

chunk: 7/251
done: 164631

chunk: 8/251
done: 164653

chunk: 9/251
done: 164669

chunk: 10/251
done: 164684

chunk: 11/251
done: 164704

chunk: 12/251
done: 164722

chunk: 13/251
done: 164740

chunk: 14/251
done: 164759

chunk: 15/251
done: 164776

chunk: 16/251
done: 164796

chunk: 17/251
done: 164812

chunk: 18/251
done: 164827

chunk: 19/251
done: 164847

chunk: 20/251
done: 164861

chunk: 21/251
done: 164880

chunk: 22/251
done: 164895

chunk: 23/251
done: 164914

chunk: 24/251
done: 164932

chunk: 25/251
done: 164947

chunk: 26/251
done: 164963

chunk: 27/251
done: 164970

chunk: 28/251
done: 164988

chunk: 29/251
done: 165009

chunk: 30/251
done: 165030

chunk: 31/251
done: 165048

chunk: 32/251
done: 165066

chunk: 33/251
done: 165083

chunk: 34/251
done: 165104

chunk: 35/251
done: 165125

ch

## Merge automatically retrieved classes to main_df

TODO

## Merge manually retrieved classes to main_df

In [12]:


# CSV downloaded manually from the batch web tool and uploaded to the server
TMP_FILE = "/storage/brno6/home/xstary1/raims/inchi2class_.csv"
tmp_df = pd.read_csv(TMP_FILE, index_col=0)

# drop unsuccessful from tmp; the Status column is not part of main_df
# (chained instead of an inplace drop on a filtered slice)
tmp_df = tmp_df[tmp_df["Status"] == "Completed"].drop(columns=["Status"])

# a key submitted twice would otherwise be merged twice and make
# main_df.loc[key] return several rows instead of one
tmp_df = tmp_df[~tmp_df.index.duplicated(keep="first")]

# get missing compounds
missing = tmp_df.index.difference(main_df.index)

# merge tmp into main (DataFrame.append was removed in pandas 2.0)
main_df = pd.concat([main_df, tmp_df.loc[missing]])

# save enriched main file
main_df.to_csv(MAIN_FILE)

In [13]:
# Re-read the saved file (without index_col) and display it to check the write
pd.read_csv(MAIN_FILE)

Unnamed: 0,InChIKey,Kingdom,Superclass,Class,Subclass,Parent Level 1,Parent Level 2,Parent Level 3,Parent Level 4,Parent Level 5,Parent Level 6
0,CFKMVGJGLGKFKI-UHFFFAOYSA-N,Organic compounds,Benzenoids,Phenols,Cresols,Meta cresols,,,,,
1,CJMJIWJARUFVLF-UHFFFAOYSA-N,Organic compounds,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,Ketones,Aryl ketones,Phenylketones,Alkyl-phenylketones,,
2,AVIRMQMUBGNCKS-RWCYGVJQSA-N,Organic compounds,Phenylpropanoids and polyketides,Macrolides and analogues,,Macrolides and analogues,,,,,
3,AGBQKNBQESQNJD-SSDOTTSWSA-N,Organic compounds,Organoheterocyclic compounds,Dithiolanes,Lipoic acids and derivatives,Lipoic acids and derivatives,,,,,
4,XJTUNBKAWATELL-UHFFFAOYSA-N,Organic compounds,Organometallic compounds,Organometalloid compounds,Organosilicon compounds,Alkylsilanes,Trialkylsilanes,,,,
...,...,...,...,...,...,...,...,...,...,...,...
14042,ULNSOTRESSNLPZ-UHFFFAOYSA-N,Organic compounds,Organometallic compounds,Organometalloid compounds,Organosilicon compounds,Alkylarylsilanes,,,,,
14043,VZIJDIWCWDYVFM-UHFFFAOYSA-N,Organic compounds,Benzenoids,Benzene and substituted derivatives,Benzylethers,Benzylethers,,,,,
14044,WJWQUGVMBMNRQU-UHFFFAOYSA-N,Organic compounds,Organoheterocyclic compounds,Diazines,Pyrimidines and pyrimidine derivatives,Pyrimidones,,,,,
14045,WYORLUAUAFCFES-UHFFFAOYSA-N,Organic compounds,Benzenoids,Phenol esters,,Phenol esters,,,,,


## Enhance datasets with classes obtained

In [14]:
# Copy every ClassyFire column of main_df into the metadata of each record
# whose InChIKey is present in the table.
for ds in [mona, nist, rcx1, rcx2]:
    # TODO update nist with inchikeys and remove this
    # Identity check: `==` compared the two spectrum lists element by element,
    # which is wasteful and would also skip any dataset equal in content to nist
    if ds is nist:
        continue
    for rec in ds:
        if "inchikey" not in rec.metadata:
            continue
        key = rec.metadata["inchikey"]

        if key not in main_df.index:
            continue

        # one metadata entry per column, prefixed to mark its origin
        for name, val in main_df.loc[key].items():
            rec.set("ClassyFire_" + name, val)

In [15]:
# Display the first MoNA record to check that the ClassyFire_* fields were added
mona[0].metadata

{'name': '1-NITROPYRENE',
 'synon': '$:00in-source',
 'db#': 'JP000001',
 'inchikey': 'ALRLPDGCPYIVHP-UHFFFAOYSA-N',
 'spectrum_type': 'MS1',
 'instrument_type': 'EI-B',
 'instrument': 'VARIAN MAT-44',
 'ion_mode': 'P',
 'formula': 'C16H9NO2',
 'mw': '247',
 'exactmass': '247.063328528',
 'smiles': '[O-1][N+1](=O)c(c4)c(c1)c(c3c4)c(c2cc3)c(ccc2)c1',
 'inchi': 'InChI=1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H',
 'computed smiles': 'C1=CC2=C3C(=C1)C=CC4=CC=C(C(=C43)C=C2)N(=O)=O',
 'accession': 'JP000001',
 'date': '2016.01.19 (Created 2008.10.21, modified 2011.05.06)',
 'author': 'KOGA M, UNIV. OF OCCUPATIONAL AND ENVIRONMENTAL HEALTH',
 'license': 'CC BY-NC-SA',
 'exact mass': '247.06333',
 'ionization energy': '70 eV',
 'ion type': '[M]+*',
 'splash': 'splash10-0udj-7790000000-5839a7971f0d731a8df6',
 'submitter': 'University of Tokyo Team (Faculty of Engineering, University of Tokyo)',
 'mona rating': '3.75',
 'num peaks': '75',
 'ClassyFire_Kingdom'

In [16]:
# Imported here so this cell does not depend on the pybatchclassyfire import cell having run
import os

# Save each dataset next to its source file with a "_with_classes" suffix
for path, ds in zip([mona_path, nist_path, rcx1_path, rcx2_path], [mona, nist, rcx1, rcx2]):
    # splitext removes only the final extension; split(".")[0] cut the path at its
    # first dot, which breaks for "./relative" paths or directory names containing dots
    newpath = os.path.splitext(path)[0] + "_with_classes.msp"
    save_as_msp(ds, newpath)