In [51]:
import os
import sys
import numpy as np
from tqdm.notebook import tqdm
from matchms.importing import load_from_json
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

ROOT = os.path.dirname(os.getcwd())
sys.path.insert(0, ROOT)
path_data = 'C:\\OneDrive - Netherlands eScience Center\\Project_Wageningen_iOMEGA\\Data\\'

## Load spectra

In [12]:
import pickle

outfile = os.path.join(path_data, 'GNPS_all', 'ALL_GNPS_210125_positive_cleaned_by_matchms_and_lookups.pickle')
with open(outfile, 'rb') as file:
    spectrums = pickle.load(file)

print("number of spectra:", len(spectrums))

number of spectra: 144691


In [5]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

In [13]:
def post_process(s):
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=10.0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=5)
    return s

# apply post processing steps to the data
spectrums = [post_process(s) for s in spectrums]

# omit spectrums that didn't qualify for analysis
spectrums = [s for s in spectrums if s is not None]

print("Number of remaining spectra:", len(spectrums))

Number of remaining spectra: 129411


In [7]:
import pickle
pickle.dump(spectrums, 
            open(os.path.join(path_data,'ALL_GNPS_210125_positive_processed.pickle'), "wb"))

In [6]:
import pickle
outfile = os.path.join(path_data,'ALL_GNPS_210125_positive_processed.pickle')
with open(outfile, 'rb') as file:
    spectrums = pickle.load(file)

### Minimum filtering

In [8]:
number_of_peaks = [len(spec.peaks) for spec in spectrums]

print("Maximum number of peaks in one spectrum:", np.max(number_of_peaks))
print("Number of spectra with > 1000 peaks:", np.sum(np.array(number_of_peaks)>1000))
print("Number of spectra with > 2000 peaks:", np.sum(np.array(number_of_peaks)>2000))
print("Number of spectra with > 5000 peaks:", np.sum(np.array(number_of_peaks)>5000))
print("Careful: Number of spectra with < 10 peaks:", np.sum(np.array(number_of_peaks)<10))

Maximum number of peaks in one spectrum: 37922
Number of spectra with > 1000 peaks: 5474
Number of spectra with > 2000 peaks: 2244
Number of spectra with > 5000 peaks: 691
Careful: Number of spectra with < 10 peaks: 0


In [9]:
ID = 102
if spectrums[ID].get("inchi") + spectrums[ID].get("smiles"):
    print(spectrums[ID].get("inchi") + "\n\n" + spectrums[ID].get("smiles"))

InChI=1S/C28H26N4O3/c1-28-26(34-3)17(29-2)12-20(35-28)31-18-10-6-4-8-14(18)22-23-16(13-30-27(23)33)21-15-9-5-7-11-19(15)32(28)25(21)24(22)31/h4-11,17,20,26,29H,12-13H2,1-3H3,(H,30,33)

CNC1CC2OC(C)(C1OC)N1C3=CC=CC=C3C3=C4CNC(=O)C4=C4C5=C(C=CC=C5)N2C4=C13


In [18]:
def count_annotations(spectra):
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    for i, spec in enumerate(spectra):
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        inchikey = spec.get("inchikey")
        if inchikey is None:
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)

    inchi_count = sum([1 for x in inchi_lst if x])
    smiles_count = sum([1 for x in smiles_lst if x])
    inchikey_count = sum([1 for x in inchikey_lst if x])
    print(f"Inchis: {inchi_count} -- {len(set(inchi_lst))} unique")
    print("Smiles: {smiles_count} -- {len(set(smiles_lst))} unique")
    print("Inchikeys:", inchikey_count, "--",
          len(set(inchikey_lst)), "unique")
    print("Inchikeys:", inchikey_count, "--",
          len(set([x[:14] for x in inchikey_lst if x])), "unique (first 14 characters)")

In [19]:
count_annotations(spectrums)

Inchis: 109775 -- 18686 unique
Smiles: {smiles_count} -- {len(set(smiles_lst))} unique
Inchikeys: 109739 -- 17303 unique
Inchikeys: 109739 -- 15062 unique (first 14 characters)


In [31]:
def annotated(s):
    return (s.get("inchi") or s.get("smiles")) and s.get("inchikey")

In [20]:
annotation_list = []
for i, s in enumerate(spectrums):
    if annotated(s):
        annotation_list.append((i, s.get("inchi"), s.get("smiles"), s.get("inchikey")))

In [21]:
len(annotation_list)

109734

In [29]:
print(f"Unique inchikeys (14char) in annotated dat: {len({x[3][:14] for x in annotation_list})}")

Unique inchikeys (14char) in annotated dat: 15062


In [32]:
spectrums_annotated = [s for s in spectrums if annotated(s)]

In [33]:
len(spectrums_annotated), len(spectrums)

(109734, 129411)

In [34]:
import pickle
pickle.dump(spectrums_annotated, 
            open(os.path.join(path_data,'GNPS_all', 'ALL_GNPS_210125_positive_processed_annotated.pickle'), "wb"))

## Create reference scores (Tanimoto)
- Check better alternatives?

In [35]:
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(1)[0][0] 

In [41]:
inchikeys_list = []
for s in spectrums_annotated:
    inchikeys_list.append(s.get("inchikey"))

inchikeys14_array = np.array([x[:14] for x in inchikeys_list])

In [39]:
inchikeys14_unique = list({x[:14] for x in inchikeys_list})
len(inchikeys14_unique)

15062

In [43]:
inchikey14 = inchikeys14_unique[1000]
print(inchikey14)

idx = np.where(inchikeys14_array == inchikey14)[0]
for i in idx:
    print(spectrums_annotated[i].get("smiles") + "\n")

print("most frequent:", most_frequent([spectrums_annotated[i].get("smiles") for i in idx]))

PKTVMNKLPFVXBH
COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1

most frequent: COc1cc(OC)c(-c2c(OC)cc(OC)cc2OC)c(OC)c1


In [44]:
inchi_list = []
for s in spectrums_annotated:
    inchi_list.append(s.get("inchi"))

inchi_array = np.array(inchi_list)

In [45]:
inchi_mapping = []
ID_mapping = []

for inchikey14 in inchikeys14_unique:
    idx = np.where(inchikeys14_array == inchikey14)[0]
    
    inchi = most_frequent([spectrums_annotated[i].get("inchi") for i in idx])
    inchi_mapping.append(inchi)
    ID = idx[np.where(inchi_array[idx] == inchi)[0][0]]
    ID_mapping.append(ID)

In [47]:
import pandas as pd
metadata = pd.DataFrame(list(zip(inchikeys_unique, inchi_mapping, ID_mapping)), columns=["inchikey", "inchi", "ID"])
metadata.head()

Unnamed: 0,inchikey,inchi,ID
0,TXZUPPVCNIMVHW,InChI=1S/C40H52O24/c1-11-20(42)26(48)30(52)37(...,8435
1,DGYASNDHNSXGSL,InChI=1S/C20H24O6/c1-11-18(12-5-7-14(21)16(9-1...,8790
2,JWYUFVNJZUSCSM,InChI=1S/C7H7N3/c8-7-9-5-3-1-2-4-6(5)10-7/h1-4...,43994
3,ZQXBVPNSSGEUCM,InChI=1S/C43H62O24/c1-5-19-22(25(38(56)57)15-6...,80787
4,WIOKWEJDRXNVSH,InChI=1S/C16H18O5/c1-9-5-13(20-4)15(18)14(6-9)...,1187


In [48]:
spectrums_annotated[8435].get("inchikey")

'TXZUPPVCNIMVHW-AJMZACEDSA-N'

In [60]:
metadata.to_csv("metadata_AllInchikeys.csv")

metadata.head()

Unnamed: 0,inchikey,inchi,ID
0,MYHSVHWQEVDFQT-RLIDIOMENA-N,InChI=1/C11H19NO10S2/c1-2-5(14)3-7(12-22-24(18...,75971
1,BKAWJIRCKVUVED-UHFFFAOYSA-N,"InChI=1S/C6H9NOS/c1-5-6(2-3-8)9-4-7-5/h4,8H,2-...",17330
2,CXVGEDCSTKKODG-UHFFFAOYSA-N,InChI=1S/C14H12O6S/c1-20-12-8-11(15)10(7-13(12...,422
3,JAMSDVDUWQNQFZ-QNQJCTKXSA-N,InChI=1S/C52H102NO8P/c1-6-8-10-12-14-16-18-20-...,38937
4,ODHCTXKNWHHXJC-GSVOUGTGSA-N,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H...",46378


In [63]:
metadata = pd.read_csv("metadata_AllInchikeys_safe.csv")
metadata.head()

Unnamed: 0.1,Unnamed: 0,inchikey,inchi,ID
0,0,MYHSVHWQEVDFQT-RLIDIOMENA-N,InChI=1/C11H19NO10S2/c1-2-5(14)3-7(12-22-24(18...,75971
1,1,BKAWJIRCKVUVED-UHFFFAOYSA-N,"InChI=1S/C6H9NOS/c1-5-6(2-3-8)9-4-7-5/h4,8H,2-...",17330
2,2,CXVGEDCSTKKODG-UHFFFAOYSA-N,InChI=1S/C14H12O6S/c1-20-12-8-11(15)10(7-13(12...,422
3,3,JAMSDVDUWQNQFZ-QNQJCTKXSA-N,InChI=1S/C52H102NO8P/c1-6-8-10-12-14-16-18-20-...,38937
4,4,ODHCTXKNWHHXJC-GSVOUGTGSA-N,"InChI=1S/C5H7NO3/c7-4-2-1-3(6-4)5(8)9/h3H,1-2H...",46378


In [50]:
metadata.ID.values.shape

(15062,)

## Add fingerprints (where necessary)

In [53]:
from matchms.filtering.add_fingerprint import add_fingerprint

for i in tqdm(metadata.ID.values):
    spectrums_annotated[i] = add_fingerprint(spectrums_annotated[i],
                                             fingerprint_type="daylight", nbits=2048)

  0%|          | 0/15062 [00:00<?, ?it/s]

In [64]:
for i in tqdm(metadata.ID.values):
    if np.any(np.isnan(spectrums_annotated[i].get("fingerprint"))):
        print(i)

  0%|          | 0/15062 [00:00<?, ?it/s]

In [None]:
from matchms.similarity import FingerprintSimilarity

spectrums_represent = [spectrums_annotated[i] for i in metadata.ID.values]

similarity_measure = FingerprintSimilarity(similarity_measure="jaccard")
scores_mol_similarity = similarity_measure.matrix(spectrums_represent, spectrums_represent)

In [65]:
filename = os.path.join(path_data, "similarities_ALL_GNPS_210125_positive_daylight2048_jaccard.npy")
np.save(filename, scores_mol_similarity)

In [66]:
scores_mol_similarity.shape

(15062, 15062)

In [70]:
tanimoto_df = pd.DataFrame(scores_mol_similarity, columns=metadata.inchikey.values, index=metadata.inchikey.values)
tanimoto_df.head()

Unnamed: 0,TXZUPPVCNIMVHW,DGYASNDHNSXGSL,JWYUFVNJZUSCSM,ZQXBVPNSSGEUCM,WIOKWEJDRXNVSH,NJMQSVWMCODQIP,VBFKEZGCUWHGSK,KJAYXCCGPDNITQ,NTAHMPNXQOYXSX,YALMHTJLWDGANA,...,DCPZWPYLSMMJKM,BFZHCUBIASXHPK,IYUIDAWSRJAFII,YMGXBGVMAOTRFZ,ZOVBJSDLILDXCH,ZBAVIUQLFUYWMT,LBZHBTVSBILXAE,FQXXSQDCDRQNQE,PKKTXAMCHLIVDS,HDDNZVWBRRAOGK
TXZUPPVCNIMVHW,1.0,0.351169,0.097595,0.45449,0.232465,0.350913,0.220788,0.369819,0.553414,0.488411,...,0.245836,0.197547,0.31924,0.170619,0.419412,0.111339,0.502242,0.609171,0.456767,0.364198
DGYASNDHNSXGSL,0.351169,1.0,0.095344,0.311426,0.266393,0.306679,0.187037,0.24155,0.352905,0.346228,...,0.239921,0.196926,0.248181,0.224319,0.328424,0.134894,0.371523,0.395833,0.325097,0.267617
JWYUFVNJZUSCSM,0.097595,0.095344,1.0,0.087488,0.096552,0.089779,0.07571,0.078571,0.101093,0.111277,...,0.087171,0.076667,0.090069,0.086042,0.097539,0.082902,0.100224,0.097547,0.091977,0.091603
ZQXBVPNSSGEUCM,0.45449,0.311426,0.087488,1.0,0.158416,0.403968,0.223629,0.428328,0.476427,0.356584,...,0.190833,0.303226,0.30987,0.151724,0.382889,0.200405,0.419784,0.455195,0.301105,0.505532
WIOKWEJDRXNVSH,0.232465,0.266393,0.096552,0.158416,1.0,0.170429,0.115108,0.148259,0.213307,0.193948,...,0.235054,0.100985,0.156159,0.146067,0.189189,0.0656,0.208011,0.223117,0.272388,0.147978


In [71]:
filename = os.path.join(path_data, "ALL_GNPS_210125_positive_tanimoto_scores.pickle")
tanimoto_df.to_pickle(filename)