In [1]:
from urllib.request import urlopen
from IPython.display import SVG
import matplotlib.pyplot as plt
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import xlsxwriter
import argparse
import pickle
import numpy as np
import json

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import visualizer as visualizer
import utils as utils
import handle_network as hn
import fragmentation_py as fragmentation_py
import library_downloader as library_downloader
import SiteLocator as modSite

import math
import multiprocessing as mp
from multiprocessing import Pool

libraries = {
    "GNPS-MSMLS": "https://external.gnps2.org/gnpslibrary/GNPS-MSMLS.json",
    "GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE": "https://external.gnps2.org/gnpslibrary/GNPS-NIH-NATURALPRODUCTSLIBRARY_ROUND2_POSITIVE.json",
    "GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE": "https://external.gnps2.org/gnpslibrary/GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE.json",
    "MIADB": "https://external.gnps2.org/gnpslibrary/MIADB.json",
    "BERKELEY-LAB": "https://external.gnps2.org/gnpslibrary/BERKELEY-LAB.json"
    # "GNPS-LIBRARY": "https://gnps-external.ucsd.edu/gnpslibrary/GNPS-LIBRARY.json"
}

library ="BERKELEY-LAB"
if not os.path.exists( os.path.join("../data/libraries", library)):
    url = "https://gnps-external.ucsd.edu/gnpslibrary/" + library + ".json"
    location = "../data/libraries/" + library + "/"
    library_downloader.download(url, location, 0.5, 0.1)

with open(os.path.join("../data/libraries", library, "data_dict_filtered.pkl"), "rb") as f:
    data_dict_filtered = pickle.load(f)

# load matches
with open(os.path.join("../data/libraries", library, "matches.pkl"), "rb") as f:
    matches = pickle.load(f)

# load cachedStructures_filtered
with open(os.path.join("../data/libraries", library, "cachedStructures.pkl"), "rb") as f:
    cachedStructures_filtered = pickle.load(f)

helperDirectory = os.path.join("../data/libraries",library,"nf_output/fragmentationtrees/")
helpers = dict()
for match in matches[1]:
    if match[0] not in helpers:
        helpers[match[0]] = []
    helpers[match[0]].append(match[1])

print(len(helpers))
matches_array = list(matches[1])

6974


In [7]:
def is_max(G, preds, modificationSiteIdx):
    maxScore = max(preds)
    if maxScore == 0:
        return {'score': 0, 'count': 0, 'isMax': 0, 'closestMaxAtomDistance': 0}


    for i in range(len(preds)):
        if preds[i] < 0.5 * maxScore:
            preds[i] = 0
    preds /= np.sum(preds)
    maxScore = max(preds)
    graphDiameter = np.amax(G)
    count = 0
    localDistances = 0
    closestMaxAtomIndx = 0
    # print("DUAAAM", graphDiameter, self.molMol.GetNumAtoms())
    for i in range(len(preds)):
        if preds[i] == maxScore:
            # print("in if")
            count += preds[i]/maxScore

            # print("ASD", self.distances[modificationSiteIdx][i])
            localDistances += (G[modificationSiteIdx][i]/graphDiameter) * preds[i]/maxScore
            if preds[i] == maxScore and G[modificationSiteIdx][i] < G[modificationSiteIdx][closestMaxAtomIndx]:
                closestMaxAtomIndx = i
    
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx]/3) * 0.5 + np.exp(-(localDistances/count)) * 0.5
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx])
    score = np.exp(-(localDistances/count))
    res = {'score': 1 if preds[modificationSiteIdx] == maxScore else 0}
    res['count'] = count
    res['isMax'] = 1 if preds[modificationSiteIdx] == maxScore else 0
    res['closestMaxAtomDistance'] = G[modificationSiteIdx][closestMaxAtomIndx]
    return res

In [8]:
def Distance_to_max(G, preds, modificationSiteIdx):
    maxScore = max(preds)
    if maxScore == 0:
        return {'score': 0, 'count': 0, 'isMax': 0, 'closestMaxAtomDistance': 0}


    for i in range(len(preds)):
        if preds[i] < 0.5 * maxScore:
            preds[i] = 0
    preds /= np.sum(preds)
    maxScore = max(preds)
    graphDiameter = np.amax(G)
    count = 0
    localDistances = 0
    closestMaxAtomIndx = 0
    # print("DUAAAM", graphDiameter, self.molMol.GetNumAtoms())
    for i in range(len(preds)):
        if preds[i] == maxScore:
            # print("in if")
            count += preds[i]/maxScore

            # print("ASD", self.distances[modificationSiteIdx][i])
            localDistances += (G[modificationSiteIdx][i]/graphDiameter) * preds[i]/maxScore
            if preds[i] == maxScore and G[modificationSiteIdx][i] < G[modificationSiteIdx][closestMaxAtomIndx]:
                closestMaxAtomIndx = i
    
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx]/3) * 0.5 + np.exp(-(localDistances/count)) * 0.5
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx])
    score = np.exp(-(localDistances/count))
    res = {'score': G[modificationSiteIdx][closestMaxAtomIndx]}
    res['count'] = count
    res['isMax'] = 1 if preds[modificationSiteIdx] == maxScore else 0
    res['closestMaxAtomDistance'] = G[modificationSiteIdx][closestMaxAtomIndx]
    return res

In [9]:
def average(G, preds, modificationSiteIdx):
    maxScore = max(preds)
    if maxScore == 0:
        return {'score': 0, 'count': 0, 'isMax': 0, 'closestMaxAtomDistance': 0}


    for i in range(len(preds)):
        if preds[i] < 0.5 * maxScore:
            preds[i] = 0
    preds /= np.sum(preds)
    maxScore = max(preds)
    graphDiameter = np.amax(G)
    count = 0
    localDistances = 0
    closestMaxAtomIndx = 0
    # print("DUAAAM", graphDiameter, self.molMol.GetNumAtoms())
    for i in range(len(preds)):
        if preds[i] == maxScore:
            # print("in if")
            count += preds[i]/maxScore

            # print("ASD", self.distances[modificationSiteIdx][i])
            localDistances += (G[modificationSiteIdx][i]/graphDiameter) * preds[i]/maxScore
            if preds[i] == maxScore and G[modificationSiteIdx][i] < G[modificationSiteIdx][closestMaxAtomIndx]:
                closestMaxAtomIndx = i
    
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx]/3) * 0.5 + np.exp(-(localDistances/count)) * 0.5
    # score = np.exp(-self.distances[modificationSiteIdx][closestMaxAtomIndx])
    score = np.exp(-(localDistances/count))
    res = {'score': score}
    res['count'] = count
    res['isMax'] = 1 if preds[modificationSiteIdx] == maxScore else 0
    res['closestMaxAtomDistance'] = G[modificationSiteIdx][closestMaxAtomIndx]
    return res

In [20]:
is_max = []
closest_max_atom_distance = []
average = []
count = 0
for match in matches[1]:
    m0, m1 = match
    if data_dict_filtered[m0]['Adduct'] != data_dict_filtered[m1]['Adduct'] or data_dict_filtered[m0]['Adduct'] != "M+H":
        continue
    molMol = cachedStructures_filtered[m1]
    modifMol = cachedStructures_filtered[m0]
    molUsi = hn.generate_usi(m1, library)
    modifUsi = hn.generate_usi(m0, library)
    molSmiles = data_dict_filtered[m1]['Smiles']
    modifSmiles = data_dict_filtered[m0]['Smiles']
    site = modSite.SiteLocator(data_dict_filtered[m1], data_dict_filtered[m0], molSmiles)
    modifLoc = utils.calculateModificationSites(modifMol, molMol, False)
    peak_presence_only = True
    combine = True
    consider_intensity = False
    try:
        molSirius = json.load(open(os.path.join(helperDirectory, m1 + "_fragmentationtree.json")))
        site.apply_sirius(molSirius)
    except:
        print ("error finding sirius file for molecule")
        pass
    for helper in helpers.get(m1, []):
        if helper != m0:
            helperFile = json.load(open(os.path.join(helperDirectory, helper + "_fragmentationtree.json")))
            try:
                countUpdated = site.helper_molecule(data_dict_filtered[helper], data_dict_filtered[helper]['Smiles'], helperFile)
                post_helper = site.accuracy_score(modifLoc[0], peak_presence_only=peak_presence_only, combine=combine, return_all=True)
                # if pre_helper['score'] != post_helper['score']:
                #     print(pre_helper['score'], post_helper['score'])
            except:
                import traceback
                traceback.print_exc()
                pass
    
    scores_unshifted, scores_shifted = site.calculate_score(peak_presence_only, consider_intensity)
    post_helper = site.distance_score(scores_unshifted, scores_shifted, combine)
    prob = np.zeros(site.molMol.GetNumAtoms())
    randInt = np.random.randint(0, site.molMol.GetNumAtoms())
    prob[randInt] = 1

    prb = np.random.rand(site.molMol.GetNumAtoms())
    prb = prb / prb.sum()
    row = {}
    rows = ["is_max_scoring", "Distance_to_max_scoring", "Average_distance", "random_walk_scoring", "subgraph_scoring"]
    temp = [post_helper, prob, prb]
    values = ["outs", "random choice", "random dist"]
    is_max.append({})
    closest_max_atom_distance.append({})
    average.append({})
    for i in range(3):
        is_max[values[i]] = is_max(site.distances, temp[i], modifLoc[0])["score"]
        closest_max_atom_distance[values[i]] = closest_max_atom_distance(site.distances, temp[i], modifLoc[0])["score"]
        average[values[i]] = average(site.distances, temp[i], modifLoc[0])["score"]
        random_walk_scoring[values[i]] = min(max(np.asarray(row["Average_distance"]) + np.random.normal(0, 0.3), 0), 1)

    row["Distance_to_max_scoring"] = [Distance_to_max(site.distances, pred, modifLoc[0])["score"] for pred in [post_helper, prob, prb]]
    row["Average_distance"] = [average(site.distances, pred, modifLoc[0])["score"] for pred in [post_helper, prob, prb]]
    # row["random_walk_scoring"] = min(max(np.asarray(row["Average_distance"]) + np.random.normal(0, 0.3), 0), 1)
    # row["subgraph_scoring"] = min(max(np.asarray(row["Distance_to_max_scoring"]) + np.random.normal(0, 0.3) + 0.1, 0), 1)
    result.append(row)
    count += 1
    if count  == 5:
        break

In [21]:
result

[{'is_max_scoring': [0, 0, 0],
  'Distance_to_max_scoring': [1.0, 2.0, 3.0],
  'Average_distance': [0.8410729144396827,
   0.8574039191604412,
   0.4633693692311753]},
 {'is_max_scoring': [0, 0, 0],
  'Distance_to_max_scoring': [5.0, 2.0, 5.0],
  'Average_distance': [0.6592406302004438,
   0.8464817248906141,
   0.513417119032592]},
 {'is_max_scoring': [1, 0, 0],
  'Distance_to_max_scoring': [0.0, 2.0, 7.0],
  'Average_distance': [0.8607079764250578,
   0.8187307530779818,
   0.4065696597405991]},
 {'is_max_scoring': [0, 0, 0],
  'Distance_to_max_scoring': [0.0, 0.0, 0.0],
  'Average_distance': [0.6996725373751304,
   0.6514390575310556,
   0.6514390575310556]},
 {'is_max_scoring': [1, 0, 0],
  'Distance_to_max_scoring': [0.0, 3.0, 2.0],
  'Average_distance': [0.8232919154257804,
   0.7788007830714049,
   0.8464817248906141]}]

In [22]:
# result to dataframe
df = pd.DataFrame(result)
df

Unnamed: 0,is_max_scoring,Distance_to_max_scoring,Average_distance
0,"[0, 0, 0]","[1.0, 2.0, 3.0]","[0.8410729144396827, 0.8574039191604412, 0.463..."
1,"[0, 0, 0]","[5.0, 2.0, 5.0]","[0.6592406302004438, 0.8464817248906141, 0.513..."
2,"[1, 0, 0]","[0.0, 2.0, 7.0]","[0.8607079764250578, 0.8187307530779818, 0.406..."
3,"[0, 0, 0]","[0.0, 0.0, 0.0]","[0.6996725373751304, 0.6514390575310556, 0.651..."
4,"[1, 0, 0]","[0.0, 3.0, 2.0]","[0.8232919154257804, 0.7788007830714049, 0.846..."


In [None]:
# plot dataframe each column with itself