# Add annotations
In this notebook, annotations are added.
MS2Query annotations are created by running MS2Query on the files "./cleaned_pos_spectra.mgf" and "./cleaned_neg_spectra.mgf", which were created in the pre_processing_notebook.

### format ms2query results


#### Load in ms2query annotations

In [58]:
import pandas as pd
import os
# Read the MS2Query result tables for both ion modes from the results folder.
results_dir = "results"
pos_ms2query = pd.read_csv(os.path.join(results_dir, "cleaned_pos_spectra.csv"))
file_ms2query = os.path.join(results_dir, "cleaned_neg_spectra.csv")
neg_ms2query = pd.read_csv(file_ms2query)

#### fix ms2query bug 
A bug resulted in weird formatting of the compound classes; the lines below fix this. MS2Query version >1.5.3 does not have this bug, but these results were created with 1.5.2.

In [59]:
# Rows whose "cf_kingdom" starts with "[" carry malformed class strings;
# reset all ClassyFire columns of those rows to "unknown".
classyfire_columns = ["cf_superclass", "cf_class", "cf_subclass", "cf_direct_parent", "cf_kingdom"]
for ms2query_table in (neg_ms2query, pos_ms2query):
    malformed_rows = ms2query_table["cf_kingdom"].str.startswith("[")
    ms2query_table.loc[malformed_rows, classyfire_columns] = "unknown"


#### add ionmode prefix to query_spectrum_nr
This makes it possible to link the MS2Query results to the molecular network graphml file that was created.

In [60]:
# Prepend the ion mode to every spectrum number so IDs are unique across modes.
for id_prefix, ms2query_table in (("neg_", neg_ms2query), ("pos_", pos_ms2query)):
    spectrum_numbers = ms2query_table["query_spectrum_nr"].astype(str)
    ms2query_table["query_spectrum_nr"] = id_prefix + spectrum_numbers

#### add ionmode as column

In [61]:
# Tag every row of each table with the ion mode it was measured in.
for ionmode_label, ms2query_table in {"negative": neg_ms2query, "positive": pos_ms2query}.items():
    ms2query_table["ionmode"] = ionmode_label

#### Combine pos and neg in one df

In [62]:
# Stack positive rows first, then negative rows, and renumber the index from 0.
combined_ms2query_results = pd.concat(
    objs=(pos_ms2query, neg_ms2query),
    ignore_index=True,
)


#### Mask ms2query results with a model prediction of 0.7 or lower


In [65]:
# Rows with a model prediction above 0.7 keep their annotation values.
mask = combined_ms2query_results["ms2query_model_prediction"].to_numpy() > 0.7

# Columns holding the analogue annotation; "cf_kingdom" is deliberately not listed.
annotation_columns = [
    "precursor_mz_difference", "precursor_mz_analog", "inchikey", "smiles",
    "analog_compound_name", "cf_superclass", "cf_class", "cf_subclass",
    "cf_direct_parent", "npc_class_results", "npc_superclass_results",
    "npc_pathway_results",
]

# Only the confident rows are selected on the right-hand side; the assignment
# aligns on the index, so every other row ends up with missing values.
combined_ms2query_results[annotation_columns] = combined_ms2query_results.loc[mask, annotation_columns]

In [66]:
# Display the combined table to inspect the result of the masking step.
combined_ms2query_results

Unnamed: 0,query_spectrum_nr,ms2query_model_prediction,precursor_mz_difference,precursor_mz_query_spectrum,precursor_mz_analog,inchikey,analog_compound_name,smiles,rtinminutes,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class_results,npc_superclass_results,npc_pathway_results,ionmode
0,pos_1,0.9150,0.0004,203.2234,203.2230,PFNFFQXMRSDOHW,SPERMINE,NCCCNCCCCNCCCN,0.372183,Organic compounds,Organic nitrogen compounds,Organonitrogen compounds,Amines,Dialkylamines,Polyamines,Ornithine alkaloids,Alkaloids,positive
1,pos_2,0.3530,,223.9854,,,,,0.412767,unknown,,,,,,,,positive
2,pos_3,0.9150,0.0004,170.0924,170.0920,JDHILDINMRGULE,N.pi.-Methyl-L-histidine,Cn1cncc1C[C@@H](C(=O)O)N,0.626817,Organic compounds,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",Histidine and derivatives,Aminoacids,Small peptides,Amino acids and Peptides,positive
3,pos_4,0.5869,,160.0963,,,,,0.667783,unknown,,,,,,,,positive
4,pos_5,0.5869,,160.0970,,,,,0.667783,unknown,,,,,,,,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2902,neg_1311,0.9585,0.0008,367.1578,367.1586,CZWCKYRVOZZJNM,Dehydroisoandrosterone sulfate,C[C@]12CC[C@H]3[C@H]([C@@H]1CCC2=O)CC=C4[C@@]3...,7.964783,unknown,unknown,unknown,unknown,unknown,,,,negative
2903,neg_1312,0.9364,0.0008,369.1732,369.1740,ZMITXKRGXGRMKS,Androsterone sulfate,C[C@]12CC[C@H](C[C@@H]1CC[C@@H]3[C@@H]2CC[C@]4...,8.584184,unknown,unknown,unknown,unknown,unknown,,,,negative
2904,neg_1313,0.9569,0.0006,329.2323,329.2329,MDIUMSLCYIJBQC,FA 18:1+3O,O=C(O)CCCCCCCC(O)C=CC(O)C(O)CCCCC,8.958633,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Long-chain fatty acids,Other Octadecanoids,Octadecanoids,Fatty acids,negative
2905,neg_1314,0.9569,0.0010,293.1750,293.1760,NLDDIKRKFXEWBK,6-Gingerol CollisionEnergy:102040,CCCCCC(O)CC(=O)CCc1ccc(O)c(OC)c1,9.783950,Organic compounds,Benzenoids,Phenols,Methoxyphenols,Gingerols,,,,negative


In [67]:
# Write the combined MS2Query table (including its index) to disk.
combined_csv_path = "combined_ms2query_results.csv"
combined_ms2query_results.to_csv(combined_csv_path)

# Adding annotations from Natural Phenome Center.

In [68]:
# Negative-mode reference annotations; the file is encoded as windows-1252.
neg_elena = pd.read_csv(
    "./annotations_elena/RNEG_ROI_V_3_2_0.csv",
    encoding="windows-1252",
)

In [69]:
# Positive-mode reference annotations; the file is encoded as windows-1252.
pos_elena = pd.read_csv(
    "./annotations_elena/RPOS_ROI_V_3_2_0.csv",
    encoding="windows-1252",
)

In [70]:
def add_matching_annotations(row_elena_df, ms2query_annotations, ionmode,
                             mz_tolerance_ppm=15, rt_tolerance_minutes=0.25):
    """Copy one reference annotation onto every matching MS2Query result row.

    A result row matches when all of the following hold:
      * its "precursor_mz_query_spectrum" differs from the reference "mz" by
        less than ``mz_tolerance_ppm`` (computed relative to the query m/z),
      * its "rtinminutes" differs from the reference "rt_minutes" by less
        than ``rt_tolerance_minutes``,
      * its "ionmode" equals ``ionmode``.

    Args:
        row_elena_df: One row of the reference annotation table, indexable by
            "rt_minutes", "mz", "InChIKey" and "cpdName".
        ms2query_annotations: DataFrame with the columns
            "precursor_mz_query_spectrum", "rtinminutes" and "ionmode".
            It is modified in place: the columns "elena_compound_name" and
            "elena_inchikey" are created if absent and filled for matching rows.
            A later call that matches the same row overwrites the earlier value.
        ionmode: Ion mode label that result rows must carry to match.
        mz_tolerance_ppm: Exclusive upper bound on the m/z deviation in ppm.
        rt_tolerance_minutes: Exclusive upper bound on the retention time
            deviation in minutes.

    Returns:
        None. The result is written into ``ms2query_annotations``.
    """
    # Read from the function argument. The original body read a global named
    # `row`, which only worked when the caller's loop variable had that name.
    rt_elena = row_elena_df["rt_minutes"]
    mz_elena = row_elena_df["mz"]
    inchikey_elena = row_elena_df["InChIKey"]
    compound_name_elena = row_elena_df["cpdName"]

    # Make sure the output columns exist even if no reference row ever matches,
    # so later cells can rely on them.
    for column in ("elena_compound_name", "elena_inchikey"):
        if column not in ms2query_annotations.columns:
            ms2query_annotations[column] = pd.Series(index=ms2query_annotations.index, dtype=object)

    query_mz = ms2query_annotations["precursor_mz_query_spectrum"]
    mz_deviation_ppm = abs(query_mz - mz_elena) / query_mz * 1000000
    matching_mz = mz_deviation_ppm < mz_tolerance_ppm
    matching_rt = abs(ms2query_annotations["rtinminutes"] - rt_elena) < rt_tolerance_minutes
    matching_ionmode = ms2query_annotations["ionmode"] == ionmode

    all_matching = matching_mz & matching_rt & matching_ionmode
    ms2query_annotations.loc[all_matching, "elena_compound_name"] = compound_name_elena
    ms2query_annotations.loc[all_matching, "elena_inchikey"] = inchikey_elena

In [71]:
# Apply every reference annotation to the combined table, positive mode first,
# restricting each reference table to result rows of the same ion mode.
reference_tables = (("positive", pos_elena), ("negative", neg_elena))
for reference_ionmode, reference_table in reference_tables:
    for _, row in reference_table.iterrows():
        add_matching_annotations(row, combined_ms2query_results, reference_ionmode)

In [72]:
# Display the combined table, now including the "elena_compound_name" and
# "elena_inchikey" columns.
combined_ms2query_results

Unnamed: 0,query_spectrum_nr,ms2query_model_prediction,precursor_mz_difference,precursor_mz_query_spectrum,precursor_mz_analog,inchikey,analog_compound_name,smiles,rtinminutes,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class_results,npc_superclass_results,npc_pathway_results,ionmode,elena_compound_name,elena_inchikey
0,pos_1,0.9150,0.0004,203.2234,203.2230,PFNFFQXMRSDOHW,SPERMINE,NCCCNCCCCNCCCN,0.372183,Organic compounds,Organic nitrogen compounds,Organonitrogen compounds,Amines,Dialkylamines,Polyamines,Ornithine alkaloids,Alkaloids,positive,,
1,pos_2,0.3530,,223.9854,,,,,0.412767,unknown,,,,,,,,positive,,
2,pos_3,0.9150,0.0004,170.0924,170.0920,JDHILDINMRGULE,N.pi.-Methyl-L-histidine,Cn1cncc1C[C@@H](C(=O)O)N,0.626817,Organic compounds,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",Histidine and derivatives,Aminoacids,Small peptides,Amino acids and Peptides,positive,,
3,pos_4,0.5869,,160.0963,,,,,0.667783,unknown,,,,,,,,positive,,
4,pos_5,0.5869,,160.0970,,,,,0.667783,unknown,,,,,,,,positive,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2902,neg_1311,0.9585,0.0008,367.1578,367.1586,CZWCKYRVOZZJNM,Dehydroisoandrosterone sulfate,C[C@]12CC[C@H]3[C@H]([C@@H]1CCC2=O)CC=C4[C@@]3...,7.964783,unknown,unknown,unknown,unknown,unknown,,,,negative,Dehydroepiandrosterone Sulfate,CZWCKYRVOZZJNM-USOAJAOKSA-N
2903,neg_1312,0.9364,0.0008,369.1732,369.1740,ZMITXKRGXGRMKS,Androsterone sulfate,C[C@]12CC[C@H](C[C@@H]1CC[C@@H]3[C@@H]2CC[C@]4...,8.584184,unknown,unknown,unknown,unknown,unknown,,,,negative,,
2904,neg_1313,0.9569,0.0006,329.2323,329.2329,MDIUMSLCYIJBQC,FA 18:1+3O,O=C(O)CCCCCCCC(O)C=CC(O)C(O)CCCCC,8.958633,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Long-chain fatty acids,Other Octadecanoids,Octadecanoids,Fatty acids,negative,,
2905,neg_1314,0.9569,0.0010,293.1750,293.1760,NLDDIKRKFXEWBK,6-Gingerol CollisionEnergy:102040,CCCCCC(O)CC(=O)CCc1ccc(O)c(OC)c1,9.783950,Organic compounds,Benzenoids,Phenols,Methoxyphenols,Gingerols,,,,negative,,


In [73]:
pd.set_option('display.max_rows', 250)

# Show only rows that received a reference annotation. Missing values are
# detected with notna(): a real NaN is never equal to the string "nan", so
# comparing against "nan" alone does not exclude unannotated rows. The string
# check is kept so literal "nan" text is still excluded.
has_reference_annotation = (
    combined_ms2query_results["elena_inchikey"].notna()
    & (combined_ms2query_results["elena_inchikey"] != "nan")
)
comparison_columns = ["query_spectrum_nr", "analog_compound_name", "elena_compound_name",
                      "inchikey", "elena_inchikey", "ionmode"]
combined_ms2query_results.loc[has_reference_annotation, comparison_columns]

Unnamed: 0,query_spectrum_nr,analog_compound_name,elena_compound_name,inchikey,elena_inchikey,ionmode
5,pos_6,TRIGONELLINE,Trigonelline,WWNNZCOKKKDOPX,WWNNZCOKKKDOPX-UHFFFAOYSA-N,positive
8,pos_9,Stachydrine (L-proline betaine),1-Methylpiperidine-2-carboxylic acid (N-methyl...,CMUNUTVVOOHQPW,BPSLZWSRHTULGU-UHFFFAOYSA-N,positive
10,pos_11,H-Pro-Hyp-OH,Prolylhydroxyproline,ONPXCLZMBSJLSP,ONPXCLZMBSJLSP-CSMHCCOUSA-N,positive
13,pos_14,ACETYL ARGININE,N-a-Acetyl-L-arginine,SNEIUMQYRCDYCH,SNEIUMQYRCDYCH-LURJTMIESA-N,positive
14,pos_15,ACETYL-CARNITINE,L-Acetylcarnitine CAR(2:0),RDHQFKQIGNGIED,RDHQFKQIGNGIED-MRVPVSSYSA-N,positive
15,pos_16,,Citric acid,,KRKNYBCHXYNGOX-UHFFFAOYSA-N,positive
30,pos_31,Succinoadenosine,Succinyladenosine,VKGZCEJTCKHMRL,VKGZCEJTCKHMRL-VWJPMABRSA-N,positive
41,pos_42,"""N,N-DIMETHYL-ARGININE""",Symmetric | Asymmetric Dimethylarginine,YDGMGEXADBMOMJ,HVPFXCBJHIIJGS-LURJTMIESA-N | YDGMGEXADBMOMJ-L...,positive
44,pos_45,"1,1-DIMETHYL-PROLINIUM",1-Methylpiperidine-2-carboxylic acid (N-methyl...,CMUNUTVVOOHQPW,BPSLZWSRHTULGU-UHFFFAOYSA-N,positive
45,pos_46,"1,1-DIMETHYL-PROLINIUM",1-Methylpiperidine-2-carboxylic acid (N-methyl...,CMUNUTVVOOHQPW,BPSLZWSRHTULGU-UHFFFAOYSA-N,positive


In [76]:
# Replace every missing value, in all columns, with the text "unknown".
combined_ms2query_results = combined_ms2query_results.fillna(value="unknown")

In [77]:
# Write the table with both annotation sources (including its index) to disk.
annotated_csv_path = "ms2query_and_NPC_annotation.csv"
combined_ms2query_results.to_csv(annotated_csv_path)

# Only spectra of interest

In [106]:
from matchms.importing import load_from_mgf
# Materialise everything load_from_mgf yields into a list.
all_spectra = [*load_from_mgf("cleaned_spectra_pos_neg_with_numbering.mgf")]

In [108]:
# Spectrum IDs of interest, in "<ionmode>_<number>" form.
spectra_of_interest = (
    "pos_1198", "neg_366", "pos_401", "neg_316", "pos_113", "neg_157", "pos_970", "pos_173",
    "neg_1189", "pos_1072", "pos_274", "pos_275", "neg_849", "neg_1298", "neg_1275",
    "neg_144", "pos_1331", "pos_334", "neg_543", "neg_241",
    "pos_432", "pos_1220", "pos_495", "neg_974", "neg_970", "neg_1170", "pos_1292",
    "neg_132", "pos_977", "pos_179", "neg_217", "pos_935", "pos_138", "neg_121",
    "neg_744", "neg_750", "pos_1384", "pos_587", "neg_378", "neg_830", "neg_939",
)

# Keep the spectra whose "query_spectrum_nr" metadata is one of those IDs,
# preserving their order in all_spectra.
selected_spectra = [
    candidate
    for candidate in all_spectra
    if candidate.get("query_spectrum_nr") in spectra_of_interest
]


In [116]:
from matchms.exporting import save_as_mgf
# Export the selected spectra to an mgf file.
clustering_mgf_path = "clustering_spectra.mgf"
save_as_mgf(selected_spectra, clustering_mgf_path)

In [115]:
# Spectrum IDs of interest, in "<ionmode>_<number>" form.
selected_query_spectrum_nrs = [
    "pos_1198", "neg_366", "pos_401", "neg_316", "pos_113", "neg_157", "pos_970", "pos_173",
    "neg_1189", "pos_1072", "pos_274", "pos_275", "neg_849", "neg_1298", "neg_1275",
    "neg_144", "pos_1331", "pos_334", "neg_543", "neg_241",
    "pos_432", "pos_1220", "pos_495", "neg_974", "neg_970", "neg_1170", "pos_1292",
    "neg_132", "pos_977", "pos_179", "neg_217", "pos_935", "pos_138", "neg_121",
    "neg_744", "neg_750", "pos_1384", "pos_587", "neg_378", "neg_830", "neg_939",
]

# Select from `combined_ms2query_results`, the annotated table built above.
# The original referenced `combined_results`, which no visible cell defines,
# so the cell raised NameError after a kernel restart.
is_selected = combined_ms2query_results["query_spectrum_nr"].isin(selected_query_spectrum_nrs)
selected_results = combined_ms2query_results[is_selected]

In [117]:
# Write the annotations of the selected spectra (including the index) to disk.
selected_csv_path = "selected_clustering_spectra_ms2query_and_NPC_annotation.csv"
selected_results.to_csv(selected_csv_path)