# Summary
In this notebook I explore how to gather chemical and BGC classes for MIBiG compounds. This is inspired by ClassifyNPDB.

I will try the GNPS ClassyFire API and see how many chemical classifications it yields; if that is enough, pyclassyfire is no longer needed.

In [63]:
import glob
import json
import os
import urllib
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, List, Optional

In [7]:
# Location of the MIBiG json folder (presumably the MIBiG 2.0 json dump, going by
# the path name); it is not read by any of the cells shown in this notebook.
# NOTE(review): absolute, user-specific paths -- adjust before running elsewhere.
mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

# Working directory of this experiment; the tab-separated SMILES table lives in
# its "files" subfolder.
base_path = "/mnt/scratch/louwe015/NPLinker/classifying/mibig_classifications/"
mibig_smiles_path = os.path.join(base_path, "files/All_MIBiG_compounds_with_SMILES_and_PMID_MAS.txt")
# Sanity check: report whether the SMILES table is present before it gets loaded.
print("mibig_smiles_path exists is", os.path.isfile(mibig_smiles_path))

mibig_smiles_path exists is True


## Read in mibig smiles

In [16]:
def load_mibig_smiles(in_file: str) -> Dict[str, str]:
    """Load mibig smiles from file (tsv) and return {BGCid_Compound: smiles}

    The first line of the file is treated as a header and skipped. For every
    other line the first two tab-separated columns are joined with "_" to form
    the compound id, and the third column is taken as the structure. Rows
    without a structure are left out of the result.

    Parameters
    ----------
    in_file:
        Path to the tab-separated file with at least the columns
        BGC id, compound name and (optionally empty) smiles.

    Returns
    -------
    Dict mapping "<BGCid>_<Compound>" to its smiles string.

    Raises
    ------
    ValueError
        If a compound id is encountered that was already stored, or if a
        non-blank line has fewer than two columns.
    """
    compound_dict = {}
    with open(in_file, 'r') as inf:
        inf.readline()  # ignore header
        for line_num, line in enumerate(inf, start=2):
            if not line.strip():
                # blank lines (e.g. a trailing newline at the end of the file)
                # carry no record; indexing into them used to raise IndexError
                continue
            # drop the line terminator so it cannot end up inside the smiles
            # when the structure is the last column of the row
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"Malformed line {line_num} in mibig smiles file, "
                    "expected at least two tab-separated columns")
            compound_id = fields[0] + "_" + fields[1]
            # a missing third column is the same as an empty structure
            structure = fields[2] if len(fields) > 2 else ""
            if compound_id in compound_dict:
                raise ValueError("Duplication in mibig smiles file, please check file")
            if len(structure) > 0:
                compound_dict[compound_id] = structure
    return compound_dict

In [17]:
# Parse the smiles table into {BGCid_Compound: smiles}; rows without a structure are left out.
comp_dict = load_mibig_smiles(mibig_smiles_path)

In [25]:
# Peek at the first five (compound id, smiles) pairs.
list(comp_dict.items())[:5]

[('BGC0000001_abyssomicin C',
  'CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O'),
 ('BGC0000001_atrop-abyssomicin C',
  'CC1CC23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)C(C)CC(C)C4=O'),
 ('BGC0000002_aculeximycin',
  'CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)C(O)C(CC)\\C=C\\C(O)C(C)C1C\\C=C(C)\\C(O)C(C)C(CC(O)C(C)C(O)CC2CC(O)C(O)C(O)(CC(O[C@@H]3O[C@H](C)[C@@H](O)[C@H](O[C@H]4C[C@@H](N)[C@H](O)[C@@H](C)O4)[C@H]3O[C@@H]3O[C@H](C)[C@@H](O)[C@H](O)[C@H]3O)C(C)CCC(O)CC(O)C\\C=C(CC)\\C(=O)O1)O2)O[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@@H]1O'),
 ('BGC0000003_AF-toxin',
  'CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)OC(=O)C(C(C)(C)O)O'),
 ('BGC0000004_aflatoxin G1',
  '[H][C@@]12OC=C[C@]1([H])C1=C(O2)C=C(OC)C2=C1OC(=O)C1=C2CCOC1=O')]

## Use GNPS api to get ClassyFire results from smiles
- Try to get the ClassyFire result directly from the smiles
- If that result is empty, convert the smiles to an inchikey (also through GNPS)
- Get the ClassyFire result from the inchikey


### First try just using smiles

In [51]:
# Smiles-only attempt: query the GNPS ClassyFire endpoint once per compound and
# keep just the superclass name (None whenever no usable answer comes back).
classyfire_smiles_url = "https://gnps-structure.ucsd.edu/classyfire?smiles="

result_list = []
for raw_smiles in comp_dict.values():
    # url-encode the smiles after trimming surrounding spaces
    query_url = classyfire_smiles_url + urllib.parse.quote(raw_smiles.strip(' '))

    superclass = None  # remove later when directing to inchikey
    try:
        with urllib.request.urlopen(query_url) as response:
            payload = response.read()
    except urllib.error.HTTPError:
        # apparently the smiles request failed
        payload = None

    if payload is not None:
        # make a function that gathers result from json, no matter the source
        superclass_entry = json.loads(payload).get("superclass", None)
        if superclass_entry:
            superclass = superclass_entry.get("name", None)

    result_list.append(superclass)

missing_res_num = sum(1 for elem in result_list if elem is None)
print(f"Gathered {len(result_list)-missing_res_num} results, missing {missing_res_num} results")

Gathered 1766 results, missing 346 results


### First lookup smiles, then try via lookup of inchikey
Apparently this doesn't work: the inchikey fallback recovers no additional classifications (still 1766 found, 346 missing).

In [64]:
# Two-stage lookup per compound:
#   1. ClassyFire via the smiles endpoint
#   2. if that gives no superclass: smiles -> inchikey (GNPS), then ClassyFire via inchikey
classyfire_smiles_url = "https://gnps-structure.ucsd.edu/classyfire?smiles="
inchikey_smiles_url = "https://gnps-structure.ucsd.edu/inchikey?smiles="

result_list = []
for raw_smiles in comp_dict.values():
    encoded_smiles = urllib.parse.quote(raw_smiles.strip(' '))  # url encoding
    superclass = None

    # stage 1: ClassyFire straight from the smiles
    try:
        with urllib.request.urlopen(classyfire_smiles_url + encoded_smiles) as response:
            smiles_payload = response.read()
    except urllib.error.HTTPError:
        # apparently the smiles request failed
        smiles_payload = None

    if smiles_payload is not None:
        # make a function that gathers result from json, no matter the source
        superclass_entry = json.loads(smiles_payload).get("superclass", None)
        if superclass_entry:
            superclass = superclass_entry.get("name", None)

    # stage 2: only when stage 1 did not yield a superclass
    if superclass is None:
        try:
            with urllib.request.urlopen(inchikey_smiles_url + encoded_smiles) as response:
                inchikey = str(response.read(), 'utf-8')
        except urllib.error.HTTPError:
            # apparently the inchi request failed
            inchikey = None

        if inchikey is not None:
            entity_url = f"https://gnps-classyfire.ucsd.edu/entities/{inchikey}.json"
            try:
                with urllib.request.urlopen(entity_url) as response:
                    inchikey_payload = response.read()
            except urllib.error.HTTPError:
                # apparently the inchikey ClassyFire request failed
                inchikey_payload = None

            if inchikey_payload is not None:
                superclass_entry = json.loads(inchikey_payload).get("superclass", None)
                if superclass_entry:
                    superclass = superclass_entry.get("name", None)

    result_list.append(superclass)

missing_res_num = sum(1 for elem in result_list if elem is None)
print(f"Gathered {len(result_list)-missing_res_num} results, missing {missing_res_num} results")

Gathered 1766 results, missing 346 results


### Try also getting inchikey from rdkit

In [None]:
from rdkit import Chem

# Work-in-progress cell: same two-stage GNPS lookup as before, followed by an
# exploratory rdkit step that prints a non-isomeric smiles and its inchikey.
# The rdkit inchikey is only printed for inspection; it is not yet used for a
# ClassyFire lookup.
result_list = []
for smiles in comp_dict.values():
    superclass = None

    # lookup CF with smiles
    url_base = "https://gnps-structure.ucsd.edu/classyfire?smiles="
    smiles = smiles.strip(' ')
    # The smiles must be url-encoded: raw smiles contain characters such as
    # '#', '+', '\\' and spaces that are either interpreted by the url syntax
    # (fragment, space) or rejected outright when left unescaped.
    safe_smiles = urllib.parse.quote(smiles)
    url = url_base + safe_smiles
    try:
        with urllib.request.urlopen(url) as inf:
            smiles_result = inf.read()
    except urllib.error.HTTPError:
        # apparently the smiles request failed
        smiles_result = None

    # read CF result
    if smiles_result is not None:
        cf_json = json.loads(smiles_result)  # make a function that gathers result from json, no matter the source
        superclass_dict = cf_json.get("superclass", None)
        if superclass_dict:
            superclass = superclass_dict.get("name", None)

    if superclass is None:
        # lookup inchikey from smiles
        url_base = "https://gnps-structure.ucsd.edu/inchikey?smiles="
        url = url_base + safe_smiles
        try:
            with urllib.request.urlopen(url) as inf:
                inchi_result = str(inf.read(), 'utf-8')
        except urllib.error.HTTPError:
            # apparently the inchi request failed
            inchi_result = None

        # do CF lookup with inchikey
        if inchi_result is not None:
            url = f"https://gnps-classyfire.ucsd.edu/entities/{inchi_result}.json"
            try:
                with urllib.request.urlopen(url) as inf:
                    inchi_cf_result = inf.read()
            except urllib.error.HTTPError:
                # apparently the inchikey ClassyFire request failed
                inchi_cf_result = None

            # read CF result from inchikey lookup
            if inchi_cf_result is not None:
                cf_json = json.loads(inchi_cf_result)  # make a function that gathers result from json, no matter the source
                superclass_dict = cf_json.get("superclass", None)
                if superclass_dict:
                    superclass = superclass_dict.get("name", None)

    # do a last try with inchikey from rdkit
    # MolFromSmiles returns None for smiles it cannot parse, so guard both
    # conversions instead of crashing inside MolToSmiles / MolToInchiKey.
    m = Chem.MolFromSmiles(smiles)
    if m is None:
        print(f"rdkit could not parse smiles: {smiles}")
    else:
        rdkit_smiles = Chem.MolToSmiles(m, kekuleSmiles=False, isomericSmiles=False)
        m = Chem.MolFromSmiles(rdkit_smiles)
        if m is None:
            print(f"rdkit could not re-parse its own smiles: {rdkit_smiles}")
        else:
            rdkit_inchi = Chem.inchi.MolToInchiKey(m)
            print(rdkit_smiles, rdkit_inchi)

    result_list.append(superclass)
    # NOTE: debugging aid -- stop after the first compound while the rdkit
    # step is being explored; remove this break to process every compound.
    break

missing_res_num = len([elem for elem in result_list if elem is None])
print(f"Gathered {len(result_list)-missing_res_num} results, missing {missing_res_num} results")

## Make some functions
- do_url_request(url: str) -> [bytes, None]
- get_json_cf_results(raw_json: bytes, wanted_info_list: List[str]) -> List[str]
- master func

In [None]:
def do_url_request(url: str) -> Optional[bytes]:
    """Do url request and return the raw response body, or None on HTTPError

    Only urllib.error.HTTPError is translated into None; any other exception
    raised while opening or reading the url propagates to the caller.

    Parameters
    ----------
    url:
        Url to access

    Returns
    -------
    The bytes returned by .read() on the opened url, or None if the request
    raised an HTTPError.
    """
    try:
        with urllib.request.urlopen(url) as inf:
            return inf.read()
    except urllib.error.HTTPError:
        # apparently the request failed
        return None

def get_json_cf_results(raw_json: bytes, wanted_keys_list_name: List[str] = ["superclass"]) -> List[str]:
    """Read bytes version of json str, extract the keys in wanted_keys_list_name in order

    For every requested key the 'name' of the corresponding json entry is
    collected. A key that is absent, empty, not a json object, or without a
    'name' yields an empty string, so the result always has exactly one
    element per requested key, in the same order.

    Parameters
    ----------
    raw_json:
        Json str as a bytes object containing ClassyFire information
    wanted_keys_list_name:
        Keys to extract from the json, they all have a 'name' value in the json.
        The default list is only read, never modified.

    Returns
    -------
    List with one name (or "") per key in wanted_keys_list_name.
    """
    cf_json = json.loads(raw_json)
    if not isinstance(cf_json, dict):
        # e.g. a json 'null' or list: nothing can be looked up by key
        cf_json = {}

    wanted_info = []
    for key in wanted_keys_list_name:
        info_dict = cf_json.get(key)
        # start from "" for every key so a missing entry can neither leave
        # `info` unbound nor repeat the value of the previous key
        info = ""
        if isinstance(info_dict, dict):
            info = info_dict.get('name') or ""
        # append inside the loop: one result per requested key
        wanted_info.append(info)

    return wanted_info