In [1]:
# as MatchMS can't parse the Passatuto files, I created my own parser which matches spectra to spectrum objects from MatchMS
# it is available under my FDR-Metabolomics repo: src/passatutto_parser.py
import sys
sys.path.append(r'C:\Users\Gosia\Desktop\FDR-Metabolomics\src\passatutto_parser.py')
import passatutto_parser as pp

In [None]:
# Taking all the files from MassBankOrbi (queries) and Gnps_Noise_Filtered (library), parsing them to json objects
pre_spectrums_query = pp.PassatuttoParser(r'C:\Users\Gosia\Desktop\MassbankOrbi').parse_folder()
pre_spectrums_lib = pp.PassatuttoParser(r'C:\Users\Gosia\Desktop\Gnps_Noise_Filtered').parse_folder()

In [None]:
# Using MatchMS to create spectra for both
from matchms.importing.load_from_json import as_spectrum
spectrums_query = []
for i, s in enumerate( pre_spectrums_query ):
    spectrums_query.append(as_spectrum(s))
    if i and i % 100 == 0:
        print('processed %d', i)
spectrums_lib = []
for i, s in enumerate( pre_spectrums_lib ):
    spectrums_lib.append(as_spectrum(s))
    if i and i % 100 == 0:
        print('processed %d', i)

In [4]:
# Calculating cosine similarity
from cosine_calc import get_hits
hits, misses = get_hits(spectrums_query, spectrums_lib, 2, cosine_tol=0.1)

In [5]:
# Calculating q-value scores
from q_value_calc import calculate_q_value
q_list = calculate_q_value(hits)

In [6]:
# this cell is just loading their cosine similarity and saving them by the query compound name
cosine_scores = open(r'C:\Users\Gosia\Desktop\q_values\MassbankOrbi-Gnps.txt', 'r').readlines() 
query_scores = {}
for line in cosine_scores[1:]:
    q,t,ti,s = line.split("\t") # q for query, t for target, ti for target inchi, s for score
    query_scores[q] = (float(s),t,ti)
    

In [7]:
# useful for visualisation of cosine differences
same = 0
different = 0
for hit in hits:
    if hit.target.get('compound_name') == query_scores[hit.query.get('compound_name')][1]:
        same += 1
    else:
        different += 1
    print('\t'.join((hit.query.get('compound_name'),hit.target.get('compound_name'), query_scores[hit.query.get('compound_name')][1], str(hit.score), str(query_scores[hit.query.get('compound_name')][0]))))
print(same, different)  

DErySphingosine	C18_Sphingosine	C18_Sphingosine	0.9277338131401497	0.8232295929697477
Prednisolone	Prednisolone	Prednisolone	0.9178586467476109	0.927530390141308
Hydrochlorothiazide	58-93-5	58-93-5	0.8968967400445442	0.8491576140089205
Clotrimazole	CLOTRIMAZOLE	CLOTRIMAZOLE	0.8753596956697863	0.7911624344085376
Methylprednisolone	Methylprednisolone	Methylprednisolone	0.844044967572292	0.9175689084373738
Quercetin	Ellagic acid	MLS002153851-01!2-(3,4-dihydroxyphenyl)-3,5,7-trihydroxy-4H-chromen-4-one dihydrate117-39-5	0.8132432846286978	0.6399952123966156
Atomoxetine	Atomoxetine hydrochloride	Atomoxetine hydrochloride	0.8067385925897322	0.8730037538142732
Exemestane	EXEMESTANE	EXEMESTANE	0.7999253262058288	0.9289551036930237
Fluconazole	Fluconazole	Fluconazole	0.7964150907148216	0.9552302415884909
Rutin	RUTIN	RUTIN	0.7883416739030162	0.5125823983500516
Fenofibrate	Fenofibrate (Tricor, Trilipix)	Fenofibrate (Tricor, Trilipix)	0.7683539644958313	0.9204728345768457
Telmisartan	Telmisartan	2

# Below is my attempt at Grid-Searching the cosine-tolerance parameter
# For each of the chosen tolerance-candidates, I see:
# 1) how many queries result in the same target by choosing the highest score
# 2) what is the average cosine-similarity difference between our scores and their, given the same query & target

In [8]:
from matchms.similarity import CosineGreedy
from rdkit.Chem.inchi import InchiToInchiKey, MolToInchiKey
import bisect
from collections import namedtuple

Hit = namedtuple('Hit', ['query', 'target', 'score', 'hit'])


def inchis_equal(s1, s2):
    return s1.metadata['inchi'].split("/")[:4] == s2.metadata['inchi'].split("/")[:4]
    # return InchiToInchiKey(s1.metadata['inchi']).split('-')[0] == InchiToInchiKey(s2.metadata['inchi']).split('-')[0]


def get_hits_2(query_spec, library_spec, precursor_tol=1, metaKey='parent_mass', cosine_tol=0.1):
    cosine_greedy = CosineGreedy(tolerance=cosine_tol)
    library_spec.sort(key=lambda x: x.metadata[metaKey])
    hits = []
    misses = []
    library_prec_list = [x.metadata[metaKey] for x in library_spec]
    for q in query_spec:
        if metaKey not in q.metadata:
            continue
        min_mz = q.metadata[metaKey] - precursor_tol
        max_mz = q.metadata[metaKey] + precursor_tol
        pos = bisect.bisect_right(library_prec_list, min_mz)
        pos2 = pos
        while pos2 < len(library_prec_list) and library_prec_list[pos2] < max_mz:
            pos2 += 1
        if pos == pos2:
            # nothing in precursor range
            misses.append(q)
        else:
            scores = []
            for l in library_spec[pos:pos2]:
                s, _ = cosine_greedy.pair(q, l)
                scores.append((s, l))
            # if all( s[0] == 0.0 for s in scores ):
            #    print(q.get('compound_name'))
            for score in scores:
                hits.append(Hit(q, score[1], score[0], inchis_equal(q, score[1])))
    return hits, misses


In [9]:
tolerances = [ 0.0, 0.001, 0.005, 0.01, 0.02, 0.08, 0.1, 0.12, 0.2, 0.4, 0.75, 0.9, 1.0, 1.1, 1.3, 1.5, 2.0 ]
for tolerance in tolerances:
    hit_count = 0
    hits, _ = get_hits(spectrums_query, spectrums_lib, cosine_tol=tolerance)    
    for hit in hits:
        if hit.target.get('compound_name') == query_scores.get(hit.query.get('compound_name'), ['',''])[1]:
            hit_count += 1

    score_diff = 0.0
    hits_2, _ = get_hits_2(spectrums_query, spectrums_lib, cosine_tol=tolerance)    
    for hit in hits_2:
        if hit.target.get('compound_name') == query_scores.get(hit.query.get('compound_name'), ['',''])[1]:
            score_diff += abs(hit.score - query_scores.get(hit.query.get('compound_name'))[0])
    
    print( tolerance, hit_count, score_diff/192 )

0.0 46 0.5756801887799549
0.001 73 0.36505103163697705
0.005 81 0.25503798541973555
0.01 80 0.2541797954808381
0.02 79 0.2539194279040465
0.08 76 0.25698665954923905
0.1 75 0.2572404030117687
0.12 75 0.25724035510685905
0.2 75 0.2584404508640584
0.4 74 0.2587497252527731
0.75 74 0.258776676170086
0.9 74 0.2587944857415175
1.0 70 0.25863776477416717
1.1 69 0.25589094506072046
1.3 68 0.2558926123983358
1.5 68 0.255893388279548
2.0 64 0.2552602984337558
