# Step One: _match file to USI extraction

In [1]:
path='D:/Robin/git/microbe_masst/output/nina_drugs/' 
# fastMASST file total size 300 GB and cannot be uploaded to GitHub- request access from haz072@health.ucsd.edu

In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import os
import statistics
from matchms.importing import load_from_usi
from matchms import Spectrum
from datetime import datetime
from typing import List
import pyteomics.mgf as py_mgf
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
import warnings
warnings.filterwarnings("ignore")

### read in GNPS Drug Library metadata for endogenous/food labels

In [4]:
dfexonlyGNPS =pd.read_excel( 'E:/Nina/analogMASST/20230915 analogMASST exogenous only/20230913_GNPS_drug_full_metadata_combined_with_manual_curation.xlsx')
print(dfexonlyGNPS.shape)
dfexonlyGNPS.head()

(76415, 143)


Unnamed: 0,gnps_libid,cleanup_name,compound_name,nina_source,chemical_source,broad_pert_iname,broad_disease_area,broad_indication,broad_moa,broad_target,...,Nina_chembl_clinical_phase,Nina_chembl_indication,Nina_drugbank_id,Nina_Drugbank_name,Nina_Drugbank_approved,Nina_Drugbank_indication,Nina_DrugCentral_id,Nina_DrugCentral_name,Nina_DrugCentral_administration,Nina_DrugCentral_pharma_class
0,CCMSLIB00000564922,canrenone,CANRENONE,Corinna,Drug metabolite,canrenone,,,mineralocorticoid receptor antagonist,NR3C2,...,,,,,,,,,,
1,CCMSLIB00000564945,canrenone,CANRENONE,Corinna,Drug metabolite,canrenone,,,mineralocorticoid receptor antagonist,NR3C2,...,,,,,,,,,,
2,CCMSLIB00000564977,canrenone,CANRENONE,Corinna,Drug metabolite,canrenone,,,mineralocorticoid receptor antagonist,NR3C2,...,,,,,,,,,,
3,CCMSLIB00000573761,canrenone,CANRENONE,Corinna,Drug metabolite,canrenone,,,mineralocorticoid receptor antagonist,NR3C2,...,,,,,,,,,,
4,CCMSLIB00000579769,canrenone,CANRENONE,Drug_API,Drug metabolite,canrenone,,,mineralocorticoid receptor antagonist,NR3C2,...,4.0,Aldosterone Antagonist,DB12221,canrenone,investigational,,478.0,canrenone,,"Diuretics,Hormone Antagonists,Mineralocorticoi..."


### exclude compounds with endogenous and food sources

In [5]:
gnps_exclude =dfexonlyGNPS[dfexonlyGNPS.chemical_source.isin(['Endogenous, Food, Medical',
       'Endogenous, Food, Medical, Personal Care',
       'Endogenous, Food, Medical, Personal care', 'Endogenous, Medical',
       'Endogenous, Medical, Personal Care', 'Food, Medical',
       'Food, Medical, Personal Care',
       'Food, Medical, Personal Care, Industrial'])].gnps_libid.unique()

### read in endogenous/exdogenous labels for MCE

In [7]:
dfexonlyMCE  =pd.read_csv('E:/Nina/analogMASST/20230915 analogMASST exogenous only/20230913_MCE_all_phase_with_gnps_id_full_metadata_small.csv')
print(dfexonlyMCE.shape)
dfexonlyMCE.head()

(22921, 2)


Unnamed: 0,SPECTRUMID,chemical_source
0,CCMSLIB00010146646,Medical
1,CCMSLIB00010146645,Medical
2,CCMSLIB00010146644,Medical
3,CCMSLIB00010146642,Medical
4,CCMSLIB00010146643,Medical


### exclude compounds with endogenous and food sources

In [9]:
mce_exclude =dfexonlyMCE[dfexonlyMCE.chemical_source.isin([ 'Endogenous, Food, Medical', 'Food, Medical',
       'Endogenous, Medical', 'Food, Medical, Personal Care',
       'Endogenous, Food, Medical, Personal Care'])].SPECTRUMID.unique()

### combine exclusions from two resources 

In [10]:
all_exclude = list(mce_exclude)+list(gnps_exclude)
all_exclude = set(all_exclude)
len(all_exclude)

37066

### read in delta mass inclusion list

In [11]:
explained_mass = pd.read_excel('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231123_delta_mass_cleanup_with_direction.xlsx')
print(explained_mass.shape)
explained_mass.head()

(156, 4)


Unnamed: 0,mz delta (=drug - analog),atomic difference,rationale,source
0,0.984016,"1N,1H,-1O",amination,Wout
1,1.979265,"1C,2H,-1O",dehydroxylation + methylation,Wout
2,2.01565,-2H,dehydrogenation,Wout
3,3.986848,"2O,-1H,-1Cl","OXA, dechlorination + carboxylation",Wout
4,3.994915,"1C,-1O",dehydration + methylation,Wout


In [14]:
explained_mass_list = list(explained_mass['mz delta (=drug - analog)'])
len(explained_mass_list)

156

In [15]:
###conert to set for faster search operation
explained_mass_set = set(explained_mass_list)
len(explained_mass_set)

156

### Analogs for drug MS/MS from MSn Library

In [19]:
curdir = 'mce'

In [20]:
match_files = [f for f in os.listdir(path+curdir) if f.endswith('_matches.tsv')]
match_files[:5]

['20230312_mce_library__10000_CCMSLIB00010139724_matches.tsv',
 '20230312_mce_library__10001_CCMSLIB00010139725_matches.tsv',
 '20230312_mce_library__10002_CCMSLIB00010139726_matches.tsv',
 '20230312_mce_library__10003_CCMSLIB00010139727_matches.tsv',
 '20230312_mce_library__10004_CCMSLIB00010139728_matches.tsv']

In [21]:
drugNumber = len(match_files)
drugNumber

21306

In [22]:
match_files[0].split('_')[5]

'CCMSLIB00010139724'

In [23]:
usiMass = pd.read_csv('E:/Nina/analogMASST/20231010_filter_USI_with_wrong_pepmass/20231010_USI_pepmass.csv')
print(usiMass.shape)
usiMass.head()

(1563838, 3)


Unnamed: 0.1,Unnamed: 0,USI,PEPMASS
0,1,mzspec:MSV000087006:20200223_DOM_IB_SZ_1:scan:...,285.241211
1,2,mzspec:MSV000080558:SS343_RF4_01_7702:scan:2206,391.28274
2,3,mzspec:MSV000083475:4A2_2_6_pinto-63-s023-a04:...,257.211046
3,4,mzspec:MSV000083475:2C2_4_2_rohwer-85-s023-a04...,278.190227
4,5,mzspec:MSV000083559:11713.Std.Mix.6.P5.WRA1:sc...,311.080445


In [24]:
usiMass2 = dict(zip(usiMass.USI,usiMass.PEPMASS))
len(usiMass2.keys())

1563838

In [25]:
ccmsMass=pd.read_csv('E:/Nina/analogMASST/20231010_filter_USI_with_wrong_pepmass/20231010_GNPS_and_MCE_drug_pepmass.csv')
print(ccmsMass.shape)
ccmsMass.head()

(99279, 2)


Unnamed: 0,gnpsid,pepmass
0,CCMSLIB00000001635,656.306
1,CCMSLIB00000001637,678.29
2,CCMSLIB00000001644,785.23
3,CCMSLIB00000001651,718.286
4,CCMSLIB00000001653,696.304


In [26]:
ccmsMass2 = dict(zip(ccmsMass.gnpsid,ccmsMass.pepmass))
len(ccmsMass2.keys())

99279

In [27]:
def getMass(usi,dic):
    if usi not in dic:
        return float('inf')
    else:
        return dic[usi]
getMass(usiMass.USI.values[0],usiMass2)

285.2412109375

In [36]:
df = pd.read_csv(path+curdir+'/'+match_files[0],sep='\t')
print(df.shape)
df.head()

(520, 5)


Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status
0,23.99,mzspec:MSV000084314:MSV000087728:scan:2046,0.89,4,NoID
1,47.01,mzspec:MSV000083612:rime5941_180710_4_Col_Amb_...,0.85,4,NoID
2,-40.98,mzspec:MSV000084314:MSV000083004:scan:8875,0.84,5,NoID
3,-0.45,mzspec:MSV000085922:200725_LQ_478908Z01_pos:sc...,0.83,4,NoID
4,-0.45,mzspec:MSV000085918:200728_LQ_PF0017796_476409...,0.83,4,NoID


### Filtering conidtions:
### abs(delta mass)>0.9 & cosine>=0.8 & deltamass explanable & ccmsid exdogenous & matching peaks>=6

In [48]:
def withinExplainedRange(ccmsMass,usiMass,explainedMass):
    dmass = ccmsMass-usiMass
    for m in explainedMass:
        if abs(dmass-m)<=0.005:
            return True
    return False

In [50]:
curdir

'mce'

In [51]:
%%time
cosine =[]
matchingPeaks = []
delta_masses = []
usis = []
dfmatchfile_filtered_all = pd.DataFrame(None)
for i,file in enumerate(match_files):
    filename = file.split('_')[5]
    if filename not in all_exclude:
        df = pd.read_csv(path+curdir+'/'+file,sep='\t')
        if df.shape[0]>0:
            df['Delta Mass']=df['Delta Mass'].apply(lambda x: round(x,2))
            df_filter = df[((df['Delta Mass']>0.9) | (df['Delta Mass']<-0.9)) & (df['Matching Peaks']>=6) \
                    & (df['Cosine']>=0.8)] #& (df['Delta Mass'].isin(explained_mass_set))]
            
            df_filter['ccmsid'] = filename
            df_filter['ccmsMass'] = ccmsMass2[filename]
            df_filter['usiMass']= df_filter['USI'].apply(lambda x: getMass(x,usiMass2))
            df_filter['keep'] = df_filter.apply((lambda x: False if abs(x.ccmsMass-x.usiMass-x['Delta Mass'])>0.05 else True),\
                                                axis=1)
            df_filter['keepMass'] = df_filter.apply((lambda x: withinExplainedRange(x.ccmsMass,x.usiMass,explained_mass_set)),\
                                                axis = 1)
            df_filter = df_filter[(df_filter.keep==True) & (df_filter.keepMass==True)]
            delta_masses += list(df_filter['Delta Mass'].values)
            usis += list(df_filter['USI'].values)
            
            if dfmatchfile_filtered_all.shape[0]!=0:
                dfmatchfile_filtered_all = pd.concat([dfmatchfile_filtered_all, df_filter.copy()], ignore_index=True)
            else:
                dfmatchfile_filtered_all = df_filter.copy()
            
            if i%2500==0:
                print(i,len(delta_masses),len(usis))
#dfmatchfile_filtered_all.to_csv('E:/Nina/20240705_mce_match_files_cosine_sixpeak_exogenous.csv', index=False)
print('final:', len(delta_masses),len(usis),dfmatchfile_filtered_all.shape)

2500 6762 6762
12500 70388 70388
17500 87385 87385
final: 154993 154993 (154993, 10)
CPU times: total: 3min 49s
Wall time: 3min 49s


In [53]:
massCounts = Counter(delta_masses)
len(massCounts.items())

201

In [55]:
realMasses = set()
for key in massCounts:
    if massCounts[key]>9:
        realMasses.add(key)
print(len(realMasses))

152


In [56]:
mceUSI = usis
mceDMs = delta_masses

In [57]:
len(usis)

154993

### Analogs for drug MS/MS from GNPS library

In [60]:
curdir = 'gnps'

In [65]:
match_files_GNPS = [f for f in os.listdir(path+curdir) if f.endswith('_matches.tsv')]
match_files_GNPS[:5]

['20230403_GNPS_drug__100006_CCMSLIB00006454306_matches.tsv',
 '20230403_GNPS_drug__100007_CCMSLIB00006454307_matches.tsv',
 '20230403_GNPS_drug__10000_CCMSLIB00000572749_matches.tsv',
 '20230403_GNPS_drug__10001_CCMSLIB00010112435_matches.tsv',
 '20230403_GNPS_drug__100026_CCMSLIB00006454326_matches.tsv']

In [66]:
drugNumber = len(match_files_GNPS)
drugNumber

66107

In [67]:
len(explained_mass_set)

156

In [68]:
dfmatchfile_filtered_all.head()

Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
0,14.02,mzspec:MSV000084314:MSV000087449:scan:2698,0.97,7.0,NoID,CCMSLIB00010139779,243.10156,229.086,1.0,1.0
1,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.8,11.0,NoID,CCMSLIB00010139798,385.20096,443.20599,1.0,1.0
2,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
3,-58.01,mzspec:MSV000085120:BAX500_6x_BF5_01_23808:sca...,0.88,12.0,NoID,CCMSLIB00010139799,385.20096,443.206032,1.0,1.0
4,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.86,11.0,NoID,CCMSLIB00010139799,385.20096,443.20599,1.0,1.0


In [69]:
%%time
cosine =[]
matchingPeaks = []
delta_masses_gnps = []
usis_gnps = []

for i,file in enumerate(match_files_GNPS):
    filename = file.split('_')[5]
    if filename not in all_exclude:
        df = pd.read_csv(path+curdir+'/'+file,sep='\t')
        if df.shape[0]>0:
            df['Delta Mass']=df['Delta Mass'].apply(lambda x: round(x,2))
            df_filter = df[((df['Delta Mass']>0.9) | (df['Delta Mass']<-0.9)) & (df['Matching Peaks']>=6) \
                    & (df['Cosine']>=0.8)] #& (df['Delta Mass'].isin(explained_mass_set))]
            #cosine += list(df['Cosine'].values)
            #matchingPeaks += list(df['Matching Peaks'].values)
            df_filter['ccmsid'] = filename
            df_filter['ccmsMass'] = ccmsMass2[filename]
            df_filter['usiMass']= df_filter['USI'].apply(lambda x: getMass(x,usiMass2))
            df_filter['keep'] = df_filter.apply((lambda x: False if abs(x.ccmsMass-x.usiMass-x['Delta Mass'])>0.05 else True),\
                                                axis=1)
            df_filter['keepMass'] = df_filter.apply((lambda x:withinExplainedRange(x.ccmsMass,x.usiMass,explained_mass_set)),\
                                                axis=1)
            df_filter = df_filter[(df_filter.keep==True) & (df_filter.keepMass==True)]
            delta_masses_gnps += list(df_filter['Delta Mass'].values)
            usis_gnps += list(df_filter['USI'].values)
            
            if dfmatchfile_filtered_all.shape[0]!=0:
                dfmatchfile_filtered_all = pd.concat([dfmatchfile_filtered_all, df_filter], ignore_index=True)
            else:
                dfmatchfile_filtered_all = df_filter.copy()
            
            if i%2500==0:
                print(i,len(delta_masses_gnps),len(usis_gnps))

print('final:', len(delta_masses_gnps),len(usis_gnps))

5000 108106 108106
7500 116053 116053
30000 298513 298513
32500 316978 316978
50000 565372 565372
65000 685767 685767
final: 710672 710672
CPU times: total: 24min 27s
Wall time: 29min 41s


In [70]:
dfmatchfile_filtered_all.to_csv('E:/Nina/20240913_gnps_match_files_cosine_sixpeak_exogenous.csv', index=False)

In [71]:
dfmatchfile_filtered_all.head()

Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
0,14.02,mzspec:MSV000084314:MSV000087449:scan:2698,0.97,7.0,NoID,CCMSLIB00010139779,243.10156,229.086,1.0,1.0
1,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.8,11.0,NoID,CCMSLIB00010139798,385.20096,443.20599,1.0,1.0
2,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
3,-58.01,mzspec:MSV000085120:BAX500_6x_BF5_01_23808:sca...,0.88,12.0,NoID,CCMSLIB00010139799,385.20096,443.206032,1.0,1.0
4,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.86,11.0,NoID,CCMSLIB00010139799,385.20096,443.20599,1.0,1.0


In [72]:

massCounts_gnps = Counter(delta_masses_gnps)
len(massCounts_gnps.items())

238

In [74]:
realMasses_gnps = set()
for key in massCounts_gnps:
    if massCounts_gnps[key]>9:
        realMasses_gnps.add(key)
print(len(realMasses_gnps))

187


In [75]:
dfmatchfile_filtered_all.shape

(865665, 10)

In [76]:
len(dfmatchfile_filtered_all.USI.unique())

250709

In [77]:
freqMass = Counter(dfmatchfile_filtered_all['Delta Mass'].values)
len(freqMass.keys())

248

In [79]:
retainedMass = set()
for key in freqMass:
    if freqMass[key]>9:
        retainedMass.add(key)
len(retainedMass)

204

In [80]:
dff = dfmatchfile_filtered_all[dfmatchfile_filtered_all['Delta Mass'].isin(retainedMass)]
print(dff.shape)
dff.head()

(865516, 10)


Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
0,14.02,mzspec:MSV000084314:MSV000087449:scan:2698,0.97,7.0,NoID,CCMSLIB00010139779,243.10156,229.086,1.0,1.0
1,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.8,11.0,NoID,CCMSLIB00010139798,385.20096,443.20599,1.0,1.0
2,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
3,-58.01,mzspec:MSV000085120:BAX500_6x_BF5_01_23808:sca...,0.88,12.0,NoID,CCMSLIB00010139799,385.20096,443.206032,1.0,1.0
4,-58.0,mzspec:MSV000084314:MSV000085120:scan:520981,0.86,11.0,NoID,CCMSLIB00010139799,385.20096,443.20599,1.0,1.0


In [81]:
dff.to_csv('E:/Nina/20240913_gnps_mce_match_files_cosine_sixpeak_exogenous.csv', index=False)

### Exclude analogs that matches to GPNS library 

In [83]:
curdir = 'D:/Robin/git/microbe_masst/output/nina_gnps_lib_analog_off'

In [84]:
match_files_exactMatch = [f for f in os.listdir(curdir) if f.endswith('_matches.tsv')]
len(match_files_exactMatch),match_files_exactMatch[:5]

(402065,
 ['20231112_gnpslib_analogoff__100000_CCMSLIB00006454300_matches.tsv',
  '20231112_gnpslib_analogoff__100001_CCMSLIB00006454301_matches.tsv',
  '20231112_gnpslib_analogoff__100002_CCMSLIB00006454302_matches.tsv',
  '20231112_gnpslib_analogoff__100003_CCMSLIB00006454303_matches.tsv',
  '20231112_gnpslib_analogoff__100004_CCMSLIB00006454304_matches.tsv'])

In [85]:
%%time
exactMatchExclusion = []

for i,file in enumerate(match_files_exactMatch):
    
    df = pd.read_csv(curdir+'/'+file,sep='\t')
    if df.shape[0]>0:
        
        exactMatchExclusion += list(df['USI'].values)

        
        if i%100000==0:
            print(i,len(exactMatchExclusion))
exactMatchExclusion = set(exactMatchExclusion)
print('final:', len(exactMatchExclusion))

final: 13712772
CPU times: total: 26min 58s
Wall time: 1h 4min 22s


In [86]:
dff2 = dff[~dff.USI.isin(exactMatchExclusion)]
print(dff2.shape)
dff2.head()

(248951, 10)


Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
2,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
5,42.01,mzspec:MSV000081176:Blank_170620013826:scan:1188,0.85,11.0,NoID,CCMSLIB00010139799,385.20096,343.190593,1.0,1.0
6,42.01,mzspec:MSV000081176:B9:scan:1179,0.84,11.0,NoID,CCMSLIB00010139799,385.20096,343.190453,1.0,1.0
8,42.01,mzspec:MSV000081176:C1:scan:1128,0.82,10.0,NoID,CCMSLIB00010139799,385.20096,343.190316,1.0,1.0
9,42.01,mzspec:MSV000081176:A1:scan:1332,0.8,10.0,NoID,CCMSLIB00010139799,385.20096,343.190389,1.0,1.0


In [87]:
len(dff2.USI.unique())

75298

In [88]:
dff2.to_csv('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20240913_USI_CCMSID_match_file_summary_after_filter.csv')

### write the mgf file (download the finalUSI sprectra from metabolomics-usi)

In [89]:
finalUSI =set(dff2.USI.values)
len(finalUSI)

75298

In [105]:
# retrieve spectra from USI as MGF file
def save_as_mgf(spectrums: List[Spectrum], filename: str,j:int,k:int):
    """Save spectrum(s) as mgf file.
        customized from save_as_mgf from pyteomics.mgf
    """
    if not isinstance(spectrums, list):
        # Assume that input was single Spectrum
        spectrums = [spectrums]

    # Convert matchms.Spectrum() into dictionaries for pyteomics
    
    for spectrum in spectrums:
        spectrum_dict = {"m/z array": spectrum.peaks.mz,
                         "intensity array": spectrum.peaks.intensities,
                         "params": {'usi': spectrum.metadata['usi'],
 'server': spectrum.metadata['server'],'precursor_mz': spectrum.metadata['precursor_mz'],
                                    'PEPMASS':spectrum.metadata['precursor_mz'],
 #'COSINE':df['Cosine'].iloc[k],'matching_peaks':df['Matching Peaks'].iloc[k],
                                   'SCANS':j}}
                        #            'NAME':match.split('_')[3]+'_analog','SCANS':j}}
                        # "params": spectrum.metadata}
        # Append spectrum to file
        with open(filename, 'a', encoding="utf-8") as out:
            py_mgf.write([spectrum_dict], out)
        #py_mgf.write([spectrum_dict], filename)
        j+=1
def pull_mgf_from_usi(usi,save_path,scanStart=0,identifierString='current job'):
    t1 = datetime.now()
    try:
        spectrums=[]
        links = usi
        for i,match in enumerate(links):

            spectrum = load_from_usi(match,server='https://metabolomics-usi.gnps2.org')

            if spectrum:
                spectrums.append(spectrum)
            if i%500==0 and i>0:
                print(f"{i} USI for job {identifierString} loaded")
        print(f'job {identifierString} loaded successfully!')
        if spectrums:

            save_as_mgf(spectrums, save_path,scanStart,0)
        print(f'spectrums saved to {save_path} successfully!')
        t2 = datetime.now()
        #print(f"time consumption: {(t2-t1)/3600} h")
        return True
    except Exception as e:
    # logger.exception(e)
        return False

    
def pull_mgf_from_usi_batches(big_usi,save_directory,fileStarter='USIsamples',batch_size=10000,scanStart=0,parallel_queries=100):
    
    rangeN =len(big_usi)//batch_size if (len(big_usi)%batch_size == 0)else len(big_usi)//batch_size+1
    
    
    with ThreadPoolExecutor(parallel_queries) as executor:
        futures = [
            executor.submit(
                pull_mgf_from_usi,
                big_usi[i*batch_size:min((i+1)*batch_size,len(big_usi))],
                f"{save_directory}/{fileStarter}_{i*batch_size+scanStart}_to_{min((i+1)*batch_size-1,len(big_usi)-1)+scanStart}.mgf",
                scanStart+i*batch_size+1,
                f'batch {i} with size {batchsize}'
            )
            for i in range(rangeN)
        ]

    wait(futures)
    rates = sum([f.result() for f in futures])/rangeN

    # return success rate
    print(f'job finished! batch sucessful rate: {rates}')
    return 1 if len(big_usi) == 0 else rates
    

In [106]:
%%time
save_directory = 'E:/Nina/20231125/spectrums'
fileStarter='USIsamples'
batchsize = 100
scanStart = 0
parallel_queries=10
pull_mgf_from_usi_batches(list(finalUSI),save_directory,fileStarter,batchsize,scanStart,parallel_queries)

job batch 5 with size 100 loaded successfully!
job batch 9 with size 100 loaded successfully!
job batch 4 with size 100 loaded successfully!
job batch 8 with size 100 loaded successfully!
job batch 3 with size 100 loaded successfully!
job batch 2 with size 100 loaded successfully!
job batch 1 with size 100 loaded successfully!
job batch 6 with size 100 loaded successfully!
job batch 0 with size 100 loaded successfully!
job batch 7 with size 100 loaded successfully!
job batch 12 with size 100 loaded successfully!
job batch 14 with size 100 loaded successfully!
job batch 10 with size 100 loaded successfully!
job batch 17 with size 100 loaded successfully!
job batch 13 with size 100 loaded successfully!
job batch 11 with size 100 loaded successfully!
job batch 18 with size 100 loaded successfully!
job batch 15 with size 100 loaded successfully!
job batch 19 with size 100 loaded successfully!
job batch 16 with size 100 loaded successfully!
job batch 20 with size 100 loaded successfully!
jo

job batch 40 with size 100 loaded successfully!
job batch 44 with size 100 loaded successfully!
job batch 41 with size 100 loaded successfully!
job batch 42 with size 100 loaded successfully!
job batch 43 with size 100 loaded successfully!
job batch 45 with size 100 loaded successfully!
job batch 47 with size 100 loaded successfully!
job batch 46 with size 100 loaded successfully!
job batch 48 with size 100 loaded successfully!
job batch 49 with size 100 loaded successfully!
job batch 50 with size 100 loaded successfully!
job batch 53 with size 100 loaded successfully!
job batch 51 with size 100 loaded successfully!
job batch 52 with size 100 loaded successfully!
job batch 54 with size 100 loaded successfully!
job batch 57 with size 100 loaded successfully!
job batch 56 with size 100 loaded successfully!
job batch 58 with size 100 loaded successfully!
job batch 55 with size 100 loaded successfully!
job batch 59 with size 100 loaded successfully!
job batch 60 with size 100 loaded succes

job batch 90 with size 100 loaded successfully!
job batch 91 with size 100 loaded successfully!
job batch 93 with size 100 loaded successfully!
job batch 92 with size 100 loaded successfully!
job batch 94 with size 100 loaded successfully!
job batch 95 with size 100 loaded successfully!
job batch 97 with size 100 loaded successfully!
job batch 96 with size 100 loaded successfully!
job batch 98 with size 100 loaded successfully!
job batch 99 with size 100 loaded successfully!
job batch 100 with size 100 loaded successfully!
job batch 101 with size 100 loaded successfully!
job batch 103 with size 100 loaded successfully!
job batch 102 with size 100 loaded successfully!
job batch 104 with size 100 loaded successfully!
job batch 105 with size 100 loaded successfully!
job batch 106 with size 100 loaded successfully!
job batch 107 with size 100 loaded successfully!
job batch 109 with size 100 loaded successfully!
job batch 108 with size 100 loaded successfully!
job batch 110 with size 100 lo

job batch 140 with size 100 loaded successfully!
job batch 141 with size 100 loaded successfully!
job batch 142 with size 100 loaded successfully!
job batch 143 with size 100 loaded successfully!
job batch 144 with size 100 loaded successfully!
job batch 145 with size 100 loaded successfully!
job batch 146 with size 100 loaded successfully!
job batch 147 with size 100 loaded successfully!
job batch 148 with size 100 loaded successfully!
job batch 149 with size 100 loaded successfully!
job batch 150 with size 100 loaded successfully!
job batch 151 with size 100 loaded successfully!
job batch 153 with size 100 loaded successfully!
job batch 152 with size 100 loaded successfully!
job batch 155 with size 100 loaded successfully!
job batch 157 with size 100 loaded successfully!
job batch 156 with size 100 loaded successfully!
job batch 154 with size 100 loaded successfully!
job batch 158 with size 100 loaded successfully!
job batch 159 with size 100 loaded successfully!
job batch 160 with s

job batch 182 with size 100 loaded successfully!
job batch 184 with size 100 loaded successfully!
job batch 185 with size 100 loaded successfully!
job batch 183 with size 100 loaded successfully!
job batch 188 with size 100 loaded successfully!
job batch 186 with size 100 loaded successfully!
job batch 187 with size 100 loaded successfully!
job batch 189 with size 100 loaded successfully!
job batch 190 with size 100 loaded successfully!
job batch 192 with size 100 loaded successfully!
job batch 193 with size 100 loaded successfully!
job batch 191 with size 100 loaded successfully!
job batch 194 with size 100 loaded successfully!
job batch 195 with size 100 loaded successfully!
job batch 198 with size 100 loaded successfully!
job batch 196 with size 100 loaded successfully!
job batch 197 with size 100 loaded successfully!
job batch 199 with size 100 loaded successfully!
job batch 200 with size 100 loaded successfully!
job batch 203 with size 100 loaded successfully!
job batch 201 with s

job batch 235 with size 100 loaded successfully!
job batch 236 with size 100 loaded successfully!
job batch 237 with size 100 loaded successfully!
job batch 238 with size 100 loaded successfully!
job batch 239 with size 100 loaded successfully!
job batch 240 with size 100 loaded successfully!
job batch 241 with size 100 loaded successfully!
job batch 242 with size 100 loaded successfully!
job batch 243 with size 100 loaded successfully!
job batch 244 with size 100 loaded successfully!
job batch 245 with size 100 loaded successfully!
job batch 248 with size 100 loaded successfully!
job batch 247 with size 100 loaded successfully!
job batch 249 with size 100 loaded successfully!
job batch 246 with size 100 loaded successfully!
job batch 250 with size 100 loaded successfully!
job batch 251 with size 100 loaded successfully!
job batch 253 with size 100 loaded successfully!
job batch 252 with size 100 loaded successfully!
job batch 255 with size 100 loaded successfully!
job batch 256 with s

job batch 281 with size 100 loaded successfully!
job batch 283 with size 100 loaded successfully!
job batch 282 with size 100 loaded successfully!
job batch 284 with size 100 loaded successfully!
job batch 285 with size 100 loaded successfully!
job batch 286 with size 100 loaded successfully!
job batch 288 with size 100 loaded successfully!
job batch 287 with size 100 loaded successfully!
job batch 289 with size 100 loaded successfully!
job batch 290 with size 100 loaded successfully!
job batch 291 with size 100 loaded successfully!
job batch 293 with size 100 loaded successfully!
job batch 292 with size 100 loaded successfully!
job batch 294 with size 100 loaded successfully!
job batch 295 with size 100 loaded successfully!
job batch 296 with size 100 loaded successfully!
job batch 298 with size 100 loaded successfully!
job batch 299 with size 100 loaded successfully!
job batch 297 with size 100 loaded successfully!
job batch 300 with size 100 loaded successfully!
job batch 301 with s

job batch 327 with size 100 loaded successfully!
job batch 329 with size 100 loaded successfully!
job batch 330 with size 100 loaded successfully!
job batch 331 with size 100 loaded successfully!
job batch 332 with size 100 loaded successfully!
job batch 334 with size 100 loaded successfully!
job batch 333 with size 100 loaded successfully!
job batch 335 with size 100 loaded successfully!
job batch 338 with size 100 loaded successfully!
job batch 337 with size 100 loaded successfully!
job batch 336 with size 100 loaded successfully!
job batch 339 with size 100 loaded successfully!
job batch 340 with size 100 loaded successfully!
job batch 341 with size 100 loaded successfully!
job batch 342 with size 100 loaded successfully!
job batch 343 with size 100 loaded successfully!
job batch 345 with size 100 loaded successfully!
job batch 344 with size 100 loaded successfully!
job batch 346 with size 100 loaded successfully!
job batch 347 with size 100 loaded successfully!
job batch 348 with s

job batch 366 with size 100 loaded successfully!
job batch 368 with size 100 loaded successfully!
job batch 367 with size 100 loaded successfully!
job batch 369 with size 100 loaded successfully!
job batch 370 with size 100 loaded successfully!
job batch 371 with size 100 loaded successfully!
job batch 372 with size 100 loaded successfully!
job batch 373 with size 100 loaded successfully!
job batch 374 with size 100 loaded successfully!
job batch 375 with size 100 loaded successfully!
job batch 376 with size 100 loaded successfully!
job batch 378 with size 100 loaded successfully!
job batch 377 with size 100 loaded successfully!
job batch 379 with size 100 loaded successfully!
job batch 380 with size 100 loaded successfully!
job batch 381 with size 100 loaded successfully!
job batch 382 with size 100 loaded successfully!
job batch 383 with size 100 loaded successfully!
job batch 384 with size 100 loaded successfully!
job batch 385 with size 100 loaded successfully!
job batch 388 with s

job batch 403 with size 100 loaded successfully!
job batch 407 with size 100 loaded successfully!
job batch 406 with size 100 loaded successfully!
job batch 408 with size 100 loaded successfully!
job batch 409 with size 100 loaded successfully!
job batch 410 with size 100 loaded successfully!
job batch 411 with size 100 loaded successfully!
job batch 412 with size 100 loaded successfully!
job batch 413 with size 100 loaded successfully!
job batch 414 with size 100 loaded successfully!
job batch 417 with size 100 loaded successfully!
job batch 416 with size 100 loaded successfully!
job batch 415 with size 100 loaded successfully!
job batch 418 with size 100 loaded successfully!
job batch 419 with size 100 loaded successfully!
job batch 420 with size 100 loaded successfully!
job batch 422 with size 100 loaded successfully!
job batch 421 with size 100 loaded successfully!
job batch 426 with size 100 loaded successfully!
job batch 428 with size 100 loaded successfully!
job batch 423 with s

job batch 450 with size 100 loaded successfully!
job batch 453 with size 100 loaded successfully!
job batch 451 with size 100 loaded successfully!
job batch 454 with size 100 loaded successfully!
job batch 452 with size 100 loaded successfully!
job batch 455 with size 100 loaded successfully!
job batch 456 with size 100 loaded successfully!
job batch 457 with size 100 loaded successfully!
job batch 458 with size 100 loaded successfully!
job batch 459 with size 100 loaded successfully!
job batch 460 with size 100 loaded successfully!
job batch 462 with size 100 loaded successfully!
job batch 461 with size 100 loaded successfully!
job batch 464 with size 100 loaded successfully!
job batch 463 with size 100 loaded successfully!
job batch 466 with size 100 loaded successfully!
job batch 465 with size 100 loaded successfully!
job batch 467 with size 100 loaded successfully!
job batch 469 with size 100 loaded successfully!
job batch 468 with size 100 loaded successfully!
job batch 470 with s

job batch 491 with size 100 loaded successfully!
job batch 494 with size 100 loaded successfully!
job batch 496 with size 100 loaded successfully!
job batch 493 with size 100 loaded successfully!
job batch 497 with size 100 loaded successfully!
job batch 498 with size 100 loaded successfully!
job batch 495 with size 100 loaded successfully!
job batch 499 with size 100 loaded successfully!
job batch 500 with size 100 loaded successfully!
job batch 502 with size 100 loaded successfully!
job batch 501 with size 100 loaded successfully!
job batch 503 with size 100 loaded successfully!
job batch 505 with size 100 loaded successfully!
job batch 506 with size 100 loaded successfully!
job batch 508 with size 100 loaded successfully!
job batch 509 with size 100 loaded successfully!
job batch 504 with size 100 loaded successfully!
job batch 507 with size 100 loaded successfully!
job batch 510 with size 100 loaded successfully!
job batch 512 with size 100 loaded successfully!
job batch 511 with s

job batch 537 with size 100 loaded successfully!
job batch 539 with size 100 loaded successfully!
job batch 540 with size 100 loaded successfully!
job batch 541 with size 100 loaded successfully!
job batch 542 with size 100 loaded successfully!
job batch 543 with size 100 loaded successfully!
job batch 544 with size 100 loaded successfully!
job batch 545 with size 100 loaded successfully!
job batch 547 with size 100 loaded successfully!
job batch 546 with size 100 loaded successfully!
job batch 548 with size 100 loaded successfully!
job batch 550 with size 100 loaded successfully!
job batch 549 with size 100 loaded successfully!
job batch 551 with size 100 loaded successfully!
job batch 552 with size 100 loaded successfully!
job batch 553 with size 100 loaded successfully!
job batch 554 with size 100 loaded successfully!
job batch 556 with size 100 loaded successfully!
job batch 555 with size 100 loaded successfully!

job batch 559 with size 100 loaded successfully!
job batch 557 with 

job batch 582 with size 100 loaded successfully!
job batch 583 with size 100 loaded successfully!
job batch 585 with size 100 loaded successfully!
job batch 586 with size 100 loaded successfully!
job batch 584 with size 100 loaded successfully!
job batch 587 with size 100 loaded successfully!
job batch 588 with size 100 loaded successfully!
job batch 590 with size 100 loaded successfully!
job batch 591 with size 100 loaded successfully!
job batch 589 with size 100 loaded successfully!
job batch 592 with size 100 loaded successfully!
job batch 594 with size 100 loaded successfully!
job batch 593 with size 100 loaded successfully!
job batch 595 with size 100 loaded successfully!
job batch 597 with size 100 loaded successfully!
job batch 596 with size 100 loaded successfully!
job batch 598 with size 100 loaded successfully!
job batch 599 with size 100 loaded successfully!
job batch 600 with size 100 loaded successfully!
job batch 601 with size 100 loaded successfully!
job batch 602 with s

job batch 628 with size 100 loaded successfully!
job batch 629 with size 100 loaded successfully!
job batch 630 with size 100 loaded successfully!
job batch 631 with size 100 loaded successfully!
job batch 632 with size 100 loaded successfully!
job batch 635 with size 100 loaded successfully!
job batch 633 with size 100 loaded successfully!
job batch 634 with size 100 loaded successfully!
job batch 636 with size 100 loaded successfully!
job batch 637 with size 100 loaded successfully!
job batch 638 with size 100 loaded successfully!
job batch 639 with size 100 loaded successfully!
job batch 641 with size 100 loaded successfully!
job batch 640 with size 100 loaded successfully!
job batch 643 with size 100 loaded successfully!
job batch 644 with size 100 loaded successfully!
job batch 642 with size 100 loaded successfully!
job batch 646 with size 100 loaded successfully!
job batch 645 with size 100 loaded successfully!
job batch 647 with size 100 loaded successfully!
job batch 648 with s

job batch 671 with size 100 loaded successfully!
job batch 674 with size 100 loaded successfully!
job batch 675 with size 100 loaded successfully!
job batch 673 with size 100 loaded successfully!
job batch 672 with size 100 loaded successfully!
job batch 676 with size 100 loaded successfully!
job batch 677 with size 100 loaded successfully!
job batch 678 with size 100 loaded successfully!
job batch 679 with size 100 loaded successfully!
job batch 680 with size 100 loaded successfully!
job batch 681 with size 100 loaded successfully!
job batch 684 with size 100 loaded successfully!
job batch 683 with size 100 loaded successfully!
job batch 682 with size 100 loaded successfully!
job batch 685 with size 100 loaded successfully!
job batch 686 with size 100 loaded successfully!
job batch 687 with size 100 loaded successfully!
job batch 688 with size 100 loaded successfully!
job batch 689 with size 100 loaded successfully!
job batch 690 with size 100 loaded successfully!
job batch 691 with s

job batch 714 with size 100 loaded successfully!
job batch 716 with size 100 loaded successfully!
job batch 717 with size 100 loaded successfully!
job batch 718 with size 100 loaded successfully!
job batch 719 with size 100 loaded successfully!
job batch 720 with size 100 loaded successfully!
job batch 721 with size 100 loaded successfully!
job batch 724 with size 100 loaded successfully!
job batch 722 with size 100 loaded successfully!
job batch 723 with size 100 loaded successfully!
job batch 726 with size 100 loaded successfully!
job batch 725 with size 100 loaded successfully!
job batch 727 with size 100 loaded successfully!
job batch 728 with size 100 loaded successfully!
job batch 730 with size 100 loaded successfully!
job batch 729 with size 100 loaded successfully!
job batch 731 with size 100 loaded successfully!
job batch 732 with size 100 loaded successfully!
job batch 733 with size 100 loaded successfully!
job batch 735 with size 100 loaded successfully!
job batch 734 with s

KeyboardInterrupt: 

In [80]:
# merge all retrieved spectra into one 
fullfile = []

for k in os.listdir(save_directory):
    f= open(os.path.join(save_directory,k))
    w = f.readlines()
    fullfile += w
    f.close()
len(fullfile)

32835953

In [81]:
%%time
w2 = []

for l in fullfile:
    if l[:5]=='BEGIN':
        w2.append(l)
        w2.append('CHARGE=1+\n')
        w2.append('msLevel=2\n')
        w2.append('centroided=TRUE\n')
    else:
        w2.append(l)

        
len(w2),w2[:100]
        
        
    

CPU times: total: 14 s
Wall time: 14 s


(32572947,
 ['BEGIN IONS\n',
  'CHARGE=1+\n',
  'msLevel=2\n',
  'SCANS=49\n',
  'centroided=TRUE\n',
  'PEPMASS=279.192952549774\n',
  'USI=mzspec:MSV000081482:23-F-08:scan:1658\n',
  'SERVER=https://metabolomics-usi.ucsd.edu\n',
  'PRECURSOR_MZ=279.192952549774\n',
  '51.1412773132324 3717.54443359375\n',
  '52.7439765930176 3786.01904296875\n',
  '55.0547142028809 17481.58984375\n',
  '58.0655517578125 483579.28125\n',
  '59.0689086914062 30038.134765625\n',
  '61.8175277709961 5138.28955078125\n',
  '63.5320701599121 3538.70043945312\n',
  '63.5425567626953 5413.82568359375\n',
  '65.0389785766602 10767.5595703125\n',
  '67.0547256469727 16780.58984375\n',
  '67.5497817993164 4124.7197265625\n',
  '69.6728973388672 5563.7998046875\n',
  '70.0654754638672 29486.9765625\n',
  '71.0689239501953 4988.63330078125\n',
  '71.0856781005859 4513.79248046875\n',
  '72.0810165405273 23078.50390625\n',
  '77.0385818481445 7880.70703125\n',
  '79.0543060302734 33002.61328125\n',
  '81.070060729

In [82]:
%%time
w3 = []

for l in w2:
    if l[:4]=='USI=':
        w3.append('TITLE'+l[3:])
    else:
        w3.append(l)

        
len(w3),w3[:100]

CPU times: total: 9.11 s
Wall time: 9.11 s


(32572947,
 ['BEGIN IONS\n',
  'CHARGE=1+\n',
  'msLevel=2\n',
  'SCANS=49\n',
  'centroided=TRUE\n',
  'PEPMASS=279.192952549774\n',
  'TITLE=mzspec:MSV000081482:23-F-08:scan:1658\n',
  'SERVER=https://metabolomics-usi.ucsd.edu\n',
  'PRECURSOR_MZ=279.192952549774\n',
  '51.1412773132324 3717.54443359375\n',
  '52.7439765930176 3786.01904296875\n',
  '55.0547142028809 17481.58984375\n',
  '58.0655517578125 483579.28125\n',
  '59.0689086914062 30038.134765625\n',
  '61.8175277709961 5138.28955078125\n',
  '63.5320701599121 3538.70043945312\n',
  '63.5425567626953 5413.82568359375\n',
  '65.0389785766602 10767.5595703125\n',
  '67.0547256469727 16780.58984375\n',
  '67.5497817993164 4124.7197265625\n',
  '69.6728973388672 5563.7998046875\n',
  '70.0654754638672 29486.9765625\n',
  '71.0689239501953 4988.63330078125\n',
  '71.0856781005859 4513.79248046875\n',
  '72.0810165405273 23078.50390625\n',
  '77.0385818481445 7880.70703125\n',
  '79.0543060302734 33002.61328125\n',
  '81.0700607

In [83]:
%%time
with open('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_USI_spectra_after_all_filters3.mgf', 'w') as f:
    for line in w3:
        f.write(line)
f.close()

CPU times: total: 37.2 s
Wall time: 37.4 s


### Process the MGF with Falcon to remove duplicate spectra

### Read in Falcon results

In [84]:
dfFalcon = pd.read_csv('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_Falcon_cluster_result.csv')
print(dfFalcon.shape)
dfFalcon.head()

(75295, 5)


Unnamed: 0,identifier,precursor_charge,precursor_mz,retention_time,cluster
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1,279.192952,-1,1260
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1,314.08,-1,2027
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1,312.082345,-1,-1
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1,316.129032,-1,2054
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1,325.096182,-1,2261


In [85]:
dfFalcon['scans'] = dfFalcon.identifier.apply(lambda x: int(x.split(':')[-1]))
print(dfFalcon.shape)
dfFalcon.head()

(75295, 6)


Unnamed: 0,identifier,precursor_charge,precursor_mz,retention_time,cluster,scans
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1,279.192952,-1,1260,49
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1,314.08,-1,2027,53
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1,312.082345,-1,-1,85
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1,316.129032,-1,2054,103
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1,325.096182,-1,2261,157


In [86]:
dfUSIscan = pd.read_csv('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_USI_spectra_after_all_filters_mgf_scans_to_USI.csv')
print(dfUSIscan.shape)
dfUSIscan.head()

(75298, 3)


Unnamed: 0.1,Unnamed: 0,finalUSI,scans
0,1,mzspec:MSV000081482:23-F-08:scan:1658,49
1,2,mzspec:MSV000082582:84_Control_NLD_HC_3:scan:1408,53
2,3,mzspec:MSV000082667:MIX_RA1_01_3049:scan:2027,85
3,4,mzspec:MSV000082582:80_Prebiotic_NLD_HC_2:scan...,1000010
4,5,mzspec:MSV000079034:150116_EF_PMA_PSN475_NC-11...,1000047


In [87]:
dfclusterUSI = dfFalcon.merge(dfUSIscan, on = 'scans',how='left')
print(dfclusterUSI.shape)
dfclusterUSI.head()

(75295, 8)


Unnamed: 0.1,identifier,precursor_charge,precursor_mz,retention_time,cluster,scans,Unnamed: 0,finalUSI
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1,279.192952,-1,1260,49,1,mzspec:MSV000081482:23-F-08:scan:1658
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1,314.08,-1,2027,53,2,mzspec:MSV000082582:84_Control_NLD_HC_3:scan:1408
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1,312.082345,-1,-1,85,3,mzspec:MSV000082667:MIX_RA1_01_3049:scan:2027
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1,316.129032,-1,2054,103,502,mzspec:MSV000084680:BC6_BC6_01_13816:scan:1274
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1,325.096182,-1,2261,157,503,mzspec:MSV000080629:Plate3_Std_Mix_2:scan:1995


In [88]:
%%time
def findCCMSID(usi,df):
    dftmp = df[df.USI==usi]
    return list(dftmp.ccmsid.values)
findCCMSID(dfclusterUSI.iloc[0]['finalUSI'],dff2)

CPU times: total: 31.2 ms
Wall time: 35.1 ms


['CCMSLIB00010131288',
 'CCMSLIB00010131289',
 'CCMSLIB00010131290',
 'CCMSLIB00010131291',
 'CCMSLIB00010131292',
 'CCMSLIB00010131293',
 'CCMSLIB00010131294',
 'CCMSLIB00003134977',
 'CCMSLIB00006681629',
 'CCMSLIB00003134726',
 'CCMSLIB00006115614',
 'CCMSLIB00006115615',
 'CCMSLIB00006115617',
 'CCMSLIB00006115619',
 'CCMSLIB00006115621',
 'CCMSLIB00006115622',
 'CCMSLIB00006115624',
 'CCMSLIB00006115626',
 'CCMSLIB00006115627',
 'CCMSLIB00006115629',
 'CCMSLIB00006115631',
 'CCMSLIB00006115633',
 'CCMSLIB00006115634',
 'CCMSLIB00006115636',
 'CCMSLIB00006115638',
 'CCMSLIB00006115639',
 'CCMSLIB00006115641',
 'CCMSLIB00006115643',
 'CCMSLIB00006115645',
 'CCMSLIB00006115647',
 'CCMSLIB00006682038',
 'CCMSLIB00000579660',
 'CCMSLIB00006682623',
 'CCMSLIB00003136124',
 'CCMSLIB00003138900',
 'CCMSLIB00006678588',
 'CCMSLIB00005760130',
 'CCMSLIB00005761090',
 'CCMSLIB00005761267',
 'CCMSLIB00005761278',
 'CCMSLIB00005763070',
 'CCMSLIB00005763715',
 'CCMSLIB00005764539',
 'CCMSLIB00

In [89]:
dff2.head()

Unnamed: 0,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
2,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
5,42.01,mzspec:MSV000081176:Blank_170620013826:scan:1188,0.85,11.0,NoID,CCMSLIB00010139799,385.20096,343.190593,1.0,1.0
6,42.01,mzspec:MSV000081176:B9:scan:1179,0.84,11.0,NoID,CCMSLIB00010139799,385.20096,343.190453,1.0,1.0
8,42.01,mzspec:MSV000081176:C1:scan:1128,0.82,10.0,NoID,CCMSLIB00010139799,385.20096,343.190316,1.0,1.0
9,42.01,mzspec:MSV000081176:A1:scan:1332,0.8,10.0,NoID,CCMSLIB00010139799,385.20096,343.190389,1.0,1.0


In [90]:
%%time
dfclusterUSI = dfclusterUSI.merge(dff2,left_on = 'finalUSI',right_on = 'USI',how = 'right')
print(dfclusterUSI.shape)
dfclusterUSI.head()

(248951, 18)
CPU times: total: 219 ms
Wall time: 234 ms


Unnamed: 0.1,identifier,precursor_charge,precursor_mz,retention_time,cluster,scans,Unnamed: 0,finalUSI,Delta Mass,USI,Cosine,Matching Peaks,Status,ccmsid,ccmsMass,usiMass,keep,keepMass
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190313,-1.0,2460.0,1382156.0,13489.0,mzspec:MSV000081176:C3:scan:1166,42.01,mzspec:MSV000081176:C3:scan:1166,0.8,10.0,NoID,CCMSLIB00010139798,385.20096,343.190313,1.0,1.0
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190593,-1.0,2460.0,1384071.0,13581.0,mzspec:MSV000081176:Blank_170620013826:scan:1188,42.01,mzspec:MSV000081176:Blank_170620013826:scan:1188,0.85,11.0,NoID,CCMSLIB00010139799,385.20096,343.190593,1.0,1.0
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190453,-1.0,2460.0,86668.0,55870.0,mzspec:MSV000081176:B9:scan:1179,42.01,mzspec:MSV000081176:B9:scan:1179,0.84,11.0,NoID,CCMSLIB00010139799,385.20096,343.190453,1.0,1.0
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190316,-1.0,2460.0,1606663.0,65063.0,mzspec:MSV000081176:C1:scan:1128,42.01,mzspec:MSV000081176:C1:scan:1128,0.82,10.0,NoID,CCMSLIB00010139799,385.20096,343.190316,1.0,1.0
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190389,-1.0,2460.0,574228.0,42625.0,mzspec:MSV000081176:A1:scan:1332,42.01,mzspec:MSV000081176:A1:scan:1332,0.8,10.0,NoID,CCMSLIB00010139799,385.20096,343.190389,1.0,1.0


In [91]:
float('nan') in list(dfclusterUSI.cluster.unique())

False

In [95]:
dfSimilarity = pd.read_csv('E:/Nina/analogMASST/ansEXONLYpoint8.csv')
dfSimilarity.head()

Unnamed: 0,gnps_ID,compound_name,inchi,inchikey,isomeric_smiles,smiles_harmonized,ID,matching_atoms_min_clusterID,matching_atoms_max_clusterID,TanimotoSimilarity_lin_clusterID,TanimotoSimilarity_lay_clusterID,DiceSimilarity_lin_clusterID,DiceSimilarity_lay_clusterID,CosineSimilarity_lin_clusterID,CosineSimilarity_lay_clusterID
0,CCMSLIB00000001635,Rifamycin W,InChI=1S/C35H45NO11/c1-14-9-8-10-15(2)35(47)36...,PHKOJKSYBBXXED-OCENJLCRSA-N,C/C1=C/C=C/[C@H](C)[C@H](O)[C@@H](C)[C@@H](O)[...,C=C1C=C(CO)C(O)C(C)C(O)C(C)C(O)C(C)C(O)C(C)=CC...,0.0,,0.0,1800.0,,0.0,0.0,0.0,0.0
1,CCMSLIB00000001637,Rifamycin W,InChI=1S/C35H45NO11/c1-14-9-8-10-15(2)35(47)36...,PHKOJKSYBBXXED-OCENJLCRSA-N,C/C1=C/C=C/[C@H](C)[C@H](O)[C@@H](C)[C@@H](O)[...,C=C1C=C(CO)C(O)C(C)C(O)C(C)C(O)C(C)C(O)C(C)=CC...,0.0,,0.0,1800.0,,0.0,0.0,0.0,0.0
2,CCMSLIB00000001644,Dolastatin_10,InChI=1S/C42H68N6O6S/c1-13-28(6)37(47(10)42(52...,OFDNQWIFNXBECV-VFSYNPLYSA-N,CC[C@H](C)[C@@H]([C@@H](CC(=O)N1CCC[C@H]1[C@H]...,CCC(C)C(C(CC(=O)N1CCCC1C(OC)C(C)C(=O)NC(Cc1ccc...,1.0,757.0,0.0,923.0,731.0,630.0,0.0,571.0,0.0
3,CCMSLIB00000001651,Rifamycin S,InChI=1S/C37H45NO12/c1-16-11-10-12-17(2)36(46)...,BTVYFIMKUHNOBZ-ODRIEIDWSA-N,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C2=O)c2c(C(=O)C(=C...,C=C1C=CC=C(C)C(O)C(C)C(O)C(C)C(OC(C)=O)C(C)C(O...,2.0,38.0,0.0,42.0,43.0,0.0,0.0,0.0,0.0
4,CCMSLIB00000001653,Rifamycin S,InChI=1S/C37H45NO12/c1-16-11-10-12-17(2)36(46)...,BTVYFIMKUHNOBZ-ODRIEIDWSA-N,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C2=O)c2c(C(=O)C(=C...,C=C1C=CC=C(C)C(O)C(C)C(O)C(C)C(OC(C)=O)C(C)C(O...,2.0,38.0,0.0,42.0,43.0,0.0,0.0,0.0,0.0


In [96]:
dfSimilarity.TanimotoSimilarity_lin_clusterID.max()

3158.0

In [97]:
%%time
dfclusterUSI = dfclusterUSI.merge(dfSimilarity,left_on = 'ccmsid',right_on = 'gnps_ID',how = 'left')
print(dfclusterUSI.shape)
dfclusterUSI.head()

(248951, 33)
CPU times: total: 203 ms
Wall time: 185 ms


Unnamed: 0.1,identifier,precursor_charge,precursor_mz,retention_time,cluster,scans,Unnamed: 0,finalUSI,Delta Mass,USI,...,smiles_harmonized,ID,matching_atoms_min_clusterID,matching_atoms_max_clusterID,TanimotoSimilarity_lin_clusterID,TanimotoSimilarity_lay_clusterID,DiceSimilarity_lin_clusterID,DiceSimilarity_lay_clusterID,CosineSimilarity_lin_clusterID,CosineSimilarity_lay_clusterID
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190313,-1.0,2460.0,1382156.0,13489.0,mzspec:MSV000081176:C3:scan:1166,42.01,mzspec:MSV000081176:C3:scan:1166,...,CC(=O)OCC(=O)C1(O)CCC2C3CC=C4CC(=O)C=CC4(C)C3C...,64.0,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190593,-1.0,2460.0,1384071.0,13581.0,mzspec:MSV000081176:Blank_170620013826:scan:1188,42.01,mzspec:MSV000081176:Blank_170620013826:scan:1188,...,CC(=O)OCC(=O)C1(O)CCC2C3CC=C4CC(=O)C=CC4(C)C3C...,64.0,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190453,-1.0,2460.0,86668.0,55870.0,mzspec:MSV000081176:B9:scan:1179,42.01,mzspec:MSV000081176:B9:scan:1179,...,CC(=O)OCC(=O)C1(O)CCC2C3CC=C4CC(=O)C=CC4(C)C3C...,64.0,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190316,-1.0,2460.0,1606663.0,65063.0,mzspec:MSV000081176:C1:scan:1128,42.01,mzspec:MSV000081176:C1:scan:1128,...,CC(=O)OCC(=O)C1(O)CCC2C3CC=C4CC(=O)C=CC4(C)C3C...,64.0,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190389,-1.0,2460.0,574228.0,42625.0,mzspec:MSV000081176:A1:scan:1332,42.01,mzspec:MSV000081176:A1:scan:1332,...,CC(=O)OCC(=O)C1(O)CCC2C3CC=C4CC(=O)C=CC4(C)C3C...,64.0,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0


### Combine drug metadata into working dataframe

In [98]:
mceNames = pd.read_csv('E:/Nina/analogMASST/RT 20230821 analog MASST library mgf generation/20230718_mce_lib_pos_all_phase_metadata_GNPS_format.csv')
print(mceNames.shape)
mceNames.head()

(22921, 21)


Unnamed: 0.1,Unnamed: 0,PEPMASS,CHARGE,MSLEVEL,SOURCE_INSTRUMENT,FILENAME,RTINSECONDS,SEQ,NOTES,IONMODE,...,ORGANISM,NAME,SMILES,INCHI,INCHIAUX,LIBRARYQUALITY,SPECTRUMID,INSTRUMENT,TITLE,SCANS
0,X1,156.12437,0+,2,ESI-Orbitrap,20230312_mce_library_pos_all_phase_MS2_modifie...,78.269997,*..*,Tomas Pluskal:Corinna Brungs:N/A:N/A:Crude:N/A,Positive,...,blank,Imeglimin,C[C@@H]1N=C(N)NC(N(C)C)=N1,"""InChI=1S/C6H13N5/c1-4-8-5(7)10-6(9-4)11(2)3/h...",GFICWFZTBXUVIG-SCSAIBSYSA-N,3,CCMSLIB00010129725,ion trap,Scan Number: 1,1
1,X10,241.09715,1+,2,ESI-Orbitrap,20230312_mce_library_pos_all_phase_MS2_modifie...,78.269997,*..*,Tomas Pluskal:Corinna Brungs:N/A:N/A:Crude:N/A,Positive,...,blank,1800405-30-4 (Chimeric precursor selection),CC(C)c1nc2c([nH]1)C(=O)C(=O)c1ccccc1-2,"""InChI=1S/C14H12N2O2/c1-7(2)14-15-10-8-5-3-4-6...",AJFWITSBVLLDCC-UHFFFAOYSA-N,3,CCMSLIB00010129734,ion trap,Scan Number: 10,10
2,X100,504.21048,1+,2,ESI-Orbitrap,20230312_mce_library_pos_all_phase_MS2_modifie...,61.860001,*..*,Tomas Pluskal:Corinna Brungs:N/A:N/A:Crude:N/A,Positive,...,blank,glucagon receptor antagonists-4,CCC[C@H](Oc1cc(C)c(-n2cc(C(F)(F)F)cn2)c(C)c1)c...,"""InChI=1S/C26H28F3N3O4/c1-4-5-22(18-6-8-19(9-7...",IBDYYOQKQCCSDP-QFIPXVFZSA-N,3,CCMSLIB00010129824,ion trap,Scan Number: 100,100
3,X1000,810.45538,1+,2,ESI-Orbitrap,20230312_mce_library_pos_all_phase_MS2_modifie...,110.089996,*..*,Tomas Pluskal:Corinna Brungs:N/A:N/A:Crude:N/A,Positive,...,blank,Pimecrolimus,CC[C@@H]1/C=C(\C)C[C@H](C)C[C@H](OC)[C@H]2O[C@...,"""InChI=1S/C43H68ClNO11/c1-10-30-18-24(2)17-25(...",KASDHRXLYQOAKZ-XDSKOBMDSA-N,3,CCMSLIB00010130724,ion trap,Scan Number: 1000,1000
4,X10000,190.04987,1+,2,ESI-Orbitrap,20230312_mce_library_pos_all_phase_MS2_modifie...,48.16,*..*,Tomas Pluskal:Corinna Brungs:N/A:N/A:Crude:N/A,Positive,...,blank,kynurenic acid,O=C(O)c1cc(=O)c2ccccc2[nH]1,"""InChI=1S/C10H7NO3/c12-9-5-8(10(13)14)11-7-4-2...",HCZHHEIFKROPDY-UHFFFAOYSA-N,3,CCMSLIB00010139724,ion trap,Scan Number: 10000,10000


In [99]:
mceNames = mceNames[['NAME','SPECTRUMID']]
mceNames.head()

Unnamed: 0,NAME,SPECTRUMID
0,Imeglimin,CCMSLIB00010129725
1,1800405-30-4 (Chimeric precursor selection),CCMSLIB00010129734
2,glucagon receptor antagonists-4,CCMSLIB00010129824
3,Pimecrolimus,CCMSLIB00010130724
4,kynurenic acid,CCMSLIB00010139724


In [100]:
len(mceNames.SPECTRUMID.unique())

22921

In [101]:
gnpsNames = pd.read_csv('E:/Nina/analogMASST/RT 20230821 analog MASST library mgf generation/20230913_GNPS_drug_cleanup_name_to_gnpsid.csv')
print(gnpsNames.shape)
gnpsNames.head()

(76415, 2)


Unnamed: 0,gnps_libid,cleanup_name
0,CCMSLIB00000564922,canrenone
1,CCMSLIB00000564945,canrenone
2,CCMSLIB00000564977,canrenone
3,CCMSLIB00000573761,canrenone
4,CCMSLIB00000579769,canrenone


In [102]:
gnpsNames = gnpsNames[['cleanup_name','gnps_libid']]
gnpsNames=gnpsNames.rename(columns={"cleanup_name": "NAME", "gnps_libid": "SPECTRUMID"})
gnpsNames.head()

Unnamed: 0,NAME,SPECTRUMID
0,canrenone,CCMSLIB00000564922
1,canrenone,CCMSLIB00000564945
2,canrenone,CCMSLIB00000564977
3,canrenone,CCMSLIB00000573761
4,canrenone,CCMSLIB00000579769


In [103]:
allNames = gnpsNames.append(mceNames)
print(allNames.shape)
allNames.head()

(99336, 2)


Unnamed: 0,NAME,SPECTRUMID
0,canrenone,CCMSLIB00000564922
1,canrenone,CCMSLIB00000564945
2,canrenone,CCMSLIB00000564977
3,canrenone,CCMSLIB00000573761
4,canrenone,CCMSLIB00000579769


In [104]:
allNames['NAME']=allNames['NAME'].apply(lambda x:x.lower())
allNames.head()

Unnamed: 0,NAME,SPECTRUMID
0,canrenone,CCMSLIB00000564922
1,canrenone,CCMSLIB00000564945
2,canrenone,CCMSLIB00000564977
3,canrenone,CCMSLIB00000573761
4,canrenone,CCMSLIB00000579769


In [106]:
%%time
dfclusterUSI = dfclusterUSI.merge(allNames,left_on = 'ccmsid',right_on = 'SPECTRUMID',how = 'left')
print(dfclusterUSI.shape)
dfclusterUSI.head()

(250629, 35)
CPU times: total: 203 ms
Wall time: 203 ms


Unnamed: 0.1,identifier,precursor_charge,precursor_mz,retention_time,cluster,scans,Unnamed: 0,finalUSI,Delta Mass,USI,...,matching_atoms_min_clusterID,matching_atoms_max_clusterID,TanimotoSimilarity_lin_clusterID,TanimotoSimilarity_lay_clusterID,DiceSimilarity_lin_clusterID,DiceSimilarity_lay_clusterID,CosineSimilarity_lin_clusterID,CosineSimilarity_lay_clusterID,NAME,SPECTRUMID
0,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190313,-1.0,2460.0,1382156.0,13489.0,mzspec:MSV000081176:C3:scan:1166,42.01,mzspec:MSV000081176:C3:scan:1166,...,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,prednisolone acetate,CCMSLIB00010139798
1,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190593,-1.0,2460.0,1384071.0,13581.0,mzspec:MSV000081176:Blank_170620013826:scan:1188,42.01,mzspec:MSV000081176:Blank_170620013826:scan:1188,...,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,prednisolone acetate,CCMSLIB00010139799
2,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190453,-1.0,2460.0,86668.0,55870.0,mzspec:MSV000081176:B9:scan:1179,42.01,mzspec:MSV000081176:B9:scan:1179,...,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,prednisolone acetate,CCMSLIB00010139799
3,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190316,-1.0,2460.0,1606663.0,65063.0,mzspec:MSV000081176:C1:scan:1128,42.01,mzspec:MSV000081176:C1:scan:1128,...,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,prednisolone acetate,CCMSLIB00010139799
4,mzspec:USI000000:20231125_USI_spectra_after_al...,1.0,343.190389,-1.0,2460.0,574228.0,42625.0,mzspec:MSV000081176:A1:scan:1332,42.01,mzspec:MSV000081176:A1:scan:1332,...,15.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,prednisolone acetate,CCMSLIB00010139799


In [107]:
dfclusterUSI.columns.values

array(['identifier', 'precursor_charge', 'precursor_mz', 'retention_time',
       'cluster', 'scans', 'Unnamed: 0', 'finalUSI', 'Delta Mass', 'USI',
       'Cosine', 'Matching Peaks', 'Status', 'ccmsid', 'ccmsMass',
       'usiMass', 'keep', 'keepMass', 'gnps_ID', 'compound_name', 'inchi',
       'inchikey', 'isomeric_smiles', 'smiles_harmonized', 'ID',
       'matching_atoms_min_clusterID', 'matching_atoms_max_clusterID',
       'TanimotoSimilarity_lin_clusterID',
       'TanimotoSimilarity_lay_clusterID', 'DiceSimilarity_lin_clusterID',
       'DiceSimilarity_lay_clusterID', 'CosineSimilarity_lin_clusterID',
       'CosineSimilarity_lay_clusterID', 'NAME', 'SPECTRUMID'],
      dtype=object)

In [108]:
dfclusterUSI.drop(['Unnamed: 0'],axis=1,inplace=True)

In [109]:
dfclusterUSI.iloc[0]

identifier                          mzspec:USI000000:20231125_USI_spectra_after_al...
precursor_charge                                                                  1.0
precursor_mz                                                               343.190313
retention_time                                                                   -1.0
cluster                                                                        2460.0
scans                                                                       1382156.0
finalUSI                                             mzspec:MSV000081176:C3:scan:1166
Delta Mass                                                                      42.01
USI                                                  mzspec:MSV000081176:C3:scan:1166
Cosine                                                                            0.8
Matching Peaks                                                                   10.0
Status                                                

In [110]:
dfclusterUSI.columns.values

array(['identifier', 'precursor_charge', 'precursor_mz', 'retention_time',
       'cluster', 'scans', 'finalUSI', 'Delta Mass', 'USI', 'Cosine',
       'Matching Peaks', 'Status', 'ccmsid', 'ccmsMass', 'usiMass',
       'keep', 'keepMass', 'gnps_ID', 'compound_name', 'inchi',
       'inchikey', 'isomeric_smiles', 'smiles_harmonized', 'ID',
       'matching_atoms_min_clusterID', 'matching_atoms_max_clusterID',
       'TanimotoSimilarity_lin_clusterID',
       'TanimotoSimilarity_lay_clusterID', 'DiceSimilarity_lin_clusterID',
       'DiceSimilarity_lay_clusterID', 'CosineSimilarity_lin_clusterID',
       'CosineSimilarity_lay_clusterID', 'NAME', 'SPECTRUMID'],
      dtype=object)

In [113]:
dfclusterUSI=dfclusterUSI.rename(columns={'identifier':'falcon_identifier', 'precursor_charge':'falcon_precursor_charge',\
                                          'precursor_mz':'falcon_precursor_mz', 'retention_time':'falcon_retention_time',\
                                          'cluster':'falcon_cluster', 'scans':'falcon_scans',\
       'compound_name':'dfSimilarity_compound_name', 'inchi':'dfSimilarity_inchi', 'inchikey':'dfSimilarity_inchikey',\
       'isomeric_smiles':'dfSimilarity_isomeric_smiles', 'smiles_harmonized':'dfSimilarity_smiles_harmonized', 'ID':'dfSimilarity_ID',\
       'matching_atoms_min_clusterID':'dfSimilarity_matching_atoms_min_clusterID', \
        'matching_atoms_max_clusterID':'dfSimilarity_matching_atoms_max_clusterID',\
       'dfSimilaritySimilarity_lin_clusterID':'dfSimilarity_dfSimilaritySimilarity_lin_clusterID', \
       'dfSimilaritySimilarity_lay_clusterID':'dfSimilarity_dfSimilaritySimilarity_lay_clusterID', \
       'DiceSimilarity_lin_clusterID':'dfSimilarity_DiceSimilarity_lin_clusterID', \
       'DiceSimilarity_lay_clusterID':'dfSimilarity_DiceSimilarity_lay_clusterID', \
       'CosineSimilarity_lin_clusterID':'dfSimilarity_CosineSimilarity_lin_clusterID', \
       'CosineSimilarity_lay_clusterID':'dfSimilarity_CosineSimilarity_lay_clusterID'})

dfclusterUSI.to_csv('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_all_forms_combined_for_structure_class_filter_and_name_match.csv')

### Structural cluster and delta mass occurence filtering

### filter step 1  falcon_cluster to structure cluster delta mass frequency

In [176]:
dfFalconCluster = dfclusterUSI[dfclusterUSI.falcon_cluster>=0].copy() # remove singleton after spectral clustering

other_columns = dfFalconCluster.columns.difference(['falcon_cluster','dfSimilarity_ID','Delta Mass'])

grouped_df_DeltaMass = dfFalconCluster.groupby(['falcon_cluster','dfSimilarity_ID','Delta Mass'])[other_columns].agg(lambda x: list(set(x))).reset_index()

uniqueMasses = Counter([a for a in list(grouped_df_DeltaMass['Delta Mass'].values)])
len(uniqueMasses.keys())

198

In [177]:
sorted(uniqueMasses.items(),key = lambda x:-x[1])

[(14.02, 274),
 (-14.02, 195),
 (-1.0, 173),
 (-2.02, 160),
 (-15.99, 139),
 (28.03, 128),
 (-13.98, 117),
 (-3.99, 113),
 (-27.99, 111),
 (2.02, 109),
 (-28.03, 107),
 (-2.0, 105),
 (-1.98, 105),
 (12.0, 102),
 (42.01, 102),
 (26.02, 75),
 (18.01, 73),
 (12.04, 69),
 (-57.02, 67),
 (-16.03, 66),
 (-4.0, 65),
 (-18.01, 63),
 (-30.01, 62),
 (-44.03, 61),
 (-0.98, 59),
 (-16.0, 58),
 (13.98, 57),
 (44.03, 55),
 (-28.0, 54),
 (40.03, 52),
 (-4.03, 52),
 (-1.01, 51),
 (42.05, 50),
 (-0.99, 50),
 (30.01, 48),
 (-12.0, 47),
 (15.99, 47),
 (-29.97, 46),
 (1.98, 46),
 (-17.03, 45),
 (-2.01, 44),
 (-42.01, 44),
 (-31.99, 42),
 (29.97, 42),
 (-14.01, 41),
 (16.03, 40),
 (-12.04, 40),
 (58.01, 40),
 (14.01, 36),
 (60.02, 35),
 (27.99, 34),
 (38.02, 33),
 (-1.99, 33),
 (-15.96, 32),
 (17.03, 31),
 (116.06, 30),
 (31.99, 30),
 (-1.94, 30),
 (27.05, 27),
 (-15.0, 27),
 (4.03, 26),
 (56.06, 26),
 (58.0, 26),
 (-14.96, 25),
 (16.0, 24),
 (-3.0, 23),
 (-29.98, 23),
 (2.01, 22),
 (21.98, 21),
 (44.02, 2

In [178]:
massInclude = []
massabs = []
for key in uniqueMasses:
    if uniqueMasses[key]>=5:
        massInclude.append(key)
        #massInclude.append(-key)
        massabs.append(key)
len(massInclude),len(massabs)

(135, 135)

In [179]:
len(set(massInclude))

135

In [183]:
dfFalconCluster_DMassFiltered = dfFalconCluster[dfFalconCluster['Delta Mass'].isin(massInclude)]
dfFalconCluster_DMassFiltered.shape

(230630, 34)

### filter step 2  falcon_cluster has one Tamimoto structure class

In [188]:
other_columns = dfFalconCluster_DMassFiltered.columns.difference(['falcon_cluster'])
grouped_df_FalconCluster =dfFalconCluster_DMassFiltered.groupby('falcon_cluster')[other_columns].agg(lambda x: list(set(x))).reset_index()
grouped_df_FalconCluster['NumberOfStructureClass'] = grouped_df_FalconCluster.dfSimilarity_TanimotoSimilarity_lin_clusterID.apply(len)
grouped_df_FalconCluster['OneStructureClass'] = grouped_df_FalconCluster.NumberOfStructureClass.apply(lambda x: True if x==1 else False)
grouped_df_FalconCluster.OneStructureClass.sum()

3156

In [189]:
clusterInclude = list(grouped_df_FalconCluster[grouped_df_FalconCluster.OneStructureClass==True].falcon_cluster.unique())
len(clusterInclude)

3156

In [191]:
dfFalconCluster_DMassFiltered_ClusterFiltered = dfFalconCluster_DMassFiltered[dfFalconCluster_DMassFiltered['falcon_cluster'].isin(clusterInclude)]
len(dfFalconCluster_DMassFiltered_ClusterFiltered.falcon_cluster.unique())

3156

In [192]:
%%time
other_columns = dfFalconCluster.columns.difference(['falcon_cluster','dfSimilarity_ID','Delta Mass'])

grouped_df_DeltaMass2 = dfFalconCluster_DMassFiltered_ClusterFiltered.groupby(['falcon_cluster','dfSimilarity_ID','Delta Mass'])[other_columns].agg(lambda x: list(set(x))).reset_index()

uniqueMasses2 = Counter([a for a in list(grouped_df_DeltaMass2['Delta Mass'].values)])
print(len(uniqueMasses2.keys()))
sorted(uniqueMasses2.items(),key = lambda x:-x[1])

133
CPU times: total: 3.34 s
Wall time: 3.34 s


[(14.02, 226),
 (-2.02, 140),
 (-14.02, 130),
 (-15.99, 106),
 (-3.99, 103),
 (2.02, 98),
 (-1.0, 95),
 (28.03, 93),
 (-28.03, 91),
 (-1.98, 89),
 (42.01, 79),
 (-13.98, 78),
 (12.0, 73),
 (-27.99, 70),
 (18.01, 65),
 (-2.0, 64),
 (-16.03, 63),
 (26.02, 61),
 (-4.0, 54),
 (-0.98, 52),
 (-18.01, 50),
 (12.04, 50),
 (40.03, 48),
 (13.98, 47),
 (42.05, 46),
 (-57.02, 45),
 (-17.03, 44),
 (-4.03, 42),
 (15.99, 39),
 (29.97, 39),
 (-30.01, 38),
 (44.03, 37),
 (-42.01, 36),
 (1.98, 35),
 (-16.0, 34),
 (30.01, 33),
 (58.01, 32),
 (-12.0, 31),
 (-29.97, 31),
 (116.06, 29),
 (-28.0, 29),
 (38.02, 28),
 (16.03, 28),
 (60.02, 28),
 (17.03, 28),
 (-1.94, 26),
 (-15.96, 25),
 (31.99, 24),
 (-31.99, 24),
 (27.05, 23),
 (-14.96, 22),
 (58.0, 22),
 (-14.01, 22),
 (27.99, 22),
 (-12.04, 20),
 (56.06, 19),
 (-44.03, 19),
 (-0.99, 19),
 (4.03, 18),
 (2.01, 18),
 (56.03, 18),
 (-1.99, 18),
 (-1.01, 18),
 (21.98, 17),
 (14.01, 17),
 (-36.02, 17),
 (0.98, 16),
 (68.06, 16),
 (16.0, 16),
 (90.05, 15),
 (-58.

In [194]:
%%time
dfFalconCluster_DMassFiltered_ClusterFiltered.to_csv('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_all_forms_combined_after_delta_mass_and_structure_class_filter_FINAL.csv')

CPU times: total: 2.11 s
Wall time: 2.83 s


### write results into mgf

In [198]:
other_columns = dfFalconCluster_DMassFiltered_ClusterFiltered.columns.difference(['falcon_cluster','NAME','Delta Mass'])

grouped_df_mgf = dfFalconCluster_DMassFiltered_ClusterFiltered.groupby(['falcon_cluster','NAME','Delta Mass'])[other_columns].agg(lambda x: list(set(x))).reset_index()

len(grouped_df_mgf.falcon_cluster.unique()),grouped_df_mgf.falcon_cluster.unique()

(3156,
 array([0.000e+00, 1.000e+00, 2.000e+00, ..., 3.666e+03, 3.667e+03,
        3.668e+03]))

In [199]:
%%time
def comment(record):
    ans = record.NAME + " (Delta Mass:"+ str(record['Delta Mass'])+")"
    return ans
comment(grouped_df_mgf.iloc[1000])

CPU times: total: 0 ns
Wall time: 977 µs


'perhexiline (Delta Mass:12.0)'

In [201]:
grouped_df_mgf['comment'] = grouped_df_mgf.apply((lambda x: comment(x)),axis=1)

other_columns = grouped_df_mgf.columns.difference(['falcon_cluster'])

grouped_df_mgf_final = grouped_df_mgf.groupby(['falcon_cluster'])[other_columns].agg(lambda x: list(set(x))).reset_index()

print(grouped_df_mgf_final.shape)
grouped_df_mgf_final.head()

(3156, 4)


Unnamed: 0,falcon_cluster,Delta Mass,NAME,comment
0,0.0,[-1.0],[benzimidazole],[benzimidazole (Delta Mass:-1.0)]
1,1.0,[-12.0],[hydroquinone],[hydroquinone (Delta Mass:-12.0)]
2,2.0,[27.05],[thioguanine],[thioguanine (Delta Mass:27.05)]
3,3.0,[-15.99],[hydroquinone],[hydroquinone (Delta Mass:-15.99)]
4,4.0,[-2.02],[cycloserine],[cycloserine (Delta Mass:-2.02)]


In [203]:
rightClusters =  set(grouped_df_mgf_final.falcon_cluster.unique())
len(rightClusters)#,rightClusters

3156

In [204]:
with open('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/Falcon output/falcon.mgf') as f:
    allClusters = f.readlines()
    f.close()
len(allClusters)

495184

In [209]:
%%time
res = []
i=0
j=1
while( i<len(allClusters)):
    #print(i)
    if allClusters[i][:5]=='BEGIN':
        clu = int(allClusters[i+5].split('=')[-1])
        #print('clu:',clu)
        if  clu not  in rightClusters:
            #print('clu:',clu)
            while(allClusters[i][:3]!='END'):
                i+=1
            i+=1
            #print(allClusters[i])
        else:
            tmp = []
            tmp.append(allClusters[i])
            for k in range(1,6):
                i+=1
                tmp.append(allClusters[i])
            names = 'NAME=related spectra of ' + str(grouped_df_mgf_final[grouped_df_mgf_final.falcon_cluster==clu].comment.values[0])[1:-1]+'\n'
            tmp.append(names)
            tmp.append('SCANS='+str(j)+'\n')
            j+=1
            i+=1    
            while(allClusters[i][:3]!='END'):
                tmp.append(allClusters[i])
                i+=1
            tmp.append(allClusters[i])
            i+=1
            tmp.append('\n')
            res+=tmp
    else:
        i+=1
len(res)

CPU times: total: 1.5 s
Wall time: 1.44 s


430723

In [210]:
res[:1000]

['BEGIN IONS\n',
 'TITLE=mzspec:USI000000:20231125_USI_spectra_after_all_filters3:scan:1417019\n',
 'PEPMASS=120.055565643706\n',
 'RTINSECONDS=-1.0\n',
 'CHARGE=1+\n',
 'CLUSTER=0\n',
 "NAME=related spectra of 'benzimidazole (Delta Mass:-1.0)'\n",
 'SCANS=1\n',
 '52.01883 66356.7\n',
 '53.02671 11073.8\n',
 '53.85208 3038.9\n',
 '54.03445 7380.6\n',
 '55.01387 3938.1\n',
 '56.05009 5559.7\n',
 '57.03733 3398.7\n',
 '64.75687 3346.6\n',
 '66.03443 37882.3\n',
 '67.02975 11643.8\n',
 '68.05003 8479.2\n',
 '70.62389 3288.2\n',
 '78.03427 10257.6\n',
 '79.02962 103977.9\n',
 '80.03741 104059.1\n',
 '80.35188 3167.9\n',
 '83.64928 3291.6\n',
 '85.08935 3403.1\n',
 '85.58525 2973.2\n',
 '90.28767 3687.4\n',
 '90.97583 3562.8\n',
 '91.05482 4416.7\n',
 '93.04511 154772.2\n',
 '93.75025 3346.8\n',
 '94.04026 5740.7\n',
 '97.04022 3791.3\n',
 '100.10415 3174.8\n',
 '103.02932 7070.8\n',
 '104.05269 3334.4\n',
 '104.42691 3880.2\n',
 '109.58183 3332.5\n',
 '111.05554 13457.0\n',
 '119.06051 140

In [211]:
with open('E:/Nina/analogMASST/20231123_analog_reprocess_delta_mass_with_direction/20231125_drug_analog_Falcon_filtered_DeltaMass_Directional_FINAL.mgf', 'w') as f:
    for line in res:
        f.write(line)
f.close()