# Inventa: a computational tool to discover structural novelty in natural  extracts libraries

In [102]:
from __future__ import print_function
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
from tqdm import tqdm
import pandas as pd
import memo_ms as memo
import sys 
import lineup_widget
import ipywidgets as widgets
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from FC import *
from LC import *
from SC import *
from CC import *
from plot import *

from gnps_download_results import *
from consolidates_structures import *
from gnps_results_postprocess import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


In [103]:
#paths:

metadata_filename = '../data/P09_metadata_pos.tsv'                      # The path where the metadata is placed
quantitative_data_filename = '../data/P09_GNPS_quant.csv'                    # The path where the quantitative table is placed
tima_results_filename = '../data/Celastraceae_pos_spectral_match_results_repond.tsv' # The path where the ISDB reponderated file is placed
spectral_data_filename = '../data/all_annotation/spectra/specs_ms.mgf'                     # The path where the MEMO matrix is placed
canopus_npc_summary_filename = '../data/P09_canopus_formula_summary.tsv'                     # The path where the SIRIUS-CANOPUS results file is placed
sirius_annotations_filename = '../data/P09_compound_identifications_adducts.tsv'                 # The path where the SIRIUS results file is placed

job_id= '23ce3ce3a78343d2ade0cf874df13941' #P10 'f5c6968de3d74873b90c9641b6c8c14f'

# metadata headers

species_column = 'ATTRIBUTE_Species'
genus_column = 'ATTRIBUTE_Genus'
family_column = 'ATTRIBUTE_Family'
organe_column = 'ATTRIBUTE_Position'
filename_header = 'filename'

# parameters for cleaning-up annotations from GNPS 

max_ppm_error = 5                 # min error in ppm to consider an annotation valable
shared_peaks = 4                 # min number of shared peaks between the MS2 experimental and MS2 from the database, to consider an annotation valable
min_cosine = 0.6                  # min cosine score to consider an annotation valable
ionisation_mode = 'pos'           # ionisation mode according to experimental conditions 'pos' or 'neg'
max_spec_charge = 2

#quantitative table
data_process_origin = 'MZMine3' #'MZMine2'
use_ion_identity= False  #False

#Feature_component

min_specificity = 0.90            # minimun feature specificity to consider

## 
multiple_organism_parts = False  #True: the specificity is going to be considered as the sum of the 'max_parts_per_organism' shared in the samples.
max_parts_per_organism = 4       #max recurrence of the same organism species (for example: 5 samples, same species but different plant part) 

## inputs to use: 
isdb_annotations = False          #True: the tima_results_filename will be considered in the calculations
sirius_annotations = True        #True: the sirius_annotations_filename will be considered in the calculations

## cut-offs: 
min_score_final = 0.3             #cut-off filter for considering an isdb annotation valable. You must be extremenly carefull with this parameter, '0.0' as default.
min_ZodiacScore = 0.9             #cut-off filter for considering a sirius annotation valable. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         #cut-off filter for considering a sirius annotation valable. '0.0' as default.

annotation_preference = 0          # Only Annotated features: '1' or  Only Not annotated features: '0'

#Literature_component

LC_component = True               # LC will be calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level, more than this value, the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level,more than this value, the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at genus level,more than this value, the plant is considered less interesting

#weight for each taxonomic level 
ws = 1
wg = 1
wf = 1 

#Similarity_component

SC_component = True                # SC will be calculated

#Class_component

CC_component =  True              # CC will be calculated
min_class_confidence = 0.8       #cut-off filter for considering a sirius class valable. It is used in combination with min_recurrence.
min_recurrence = 1              # minimum recurrence of a chemical class to consider it acceptable

#specify the weight to modulate each component 
w1 = 1           # 1 means the value itself is taken into account. A 0.5 means onle half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


# Prepare input files

### Download and clean the GNPS results

In [104]:
df_annotations = gnps_download_results(job_id, output_folder ='../data/all_annotation', return_annotation_table=True)
#df_annotations.head()

# Consolidate structure identifiers
gnps_annotations_consolidated  = consolidate_and_convert_structures(df_annotations, prefix='', smiles='Smiles', inchi='INCHI')

# Filter GNPS annotations
gnps_annotations_filtered = gnps_filter_annotations(gnps_annotations_consolidated, 'Consol_InChI', ionisation_mode, max_ppm_error, min_cosine, shared_peaks, max_spec_charge)

# Generate annotation attributes
annot_gnps_df = get_gnps_annotations(get_molecular_formula_from_inchi(gnps_annotations_filtered, 'Consol_InChI'))
#annot_gnps_df.head(2)

annot_gnps_df.to_csv('../data_out/annotation_clean_gnps.tsv', sep='\t')

This is the GNPS job link: https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=23ce3ce3a78343d2ade0cf874df13941
Downloading the following content: https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=23ce3ce3a78343d2ade0cf874df13941&view=view_all_annotations_DB


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 40920    0 40920    0     0  31094      0 --:--:--  0:00:01 --:--:-- 31070100  231k    0  231k    0     0  99511      0 --:--:--  0:00:02 --:--:-- 99470100  487k    0  487k    0     0   147k      0 --:--:--  0:00:03 --:--:--  147k100 1094k    0 1094k    0     0   250k      0 --:--:--  0:00:04 --:--:--  250k100 2365k    0 2365k    0     0   446k      0 --:--:--  0:00:05 --:--:--  476k100 4515k    0 4515k    0     0   710k      0 --:--:--  0:00:06 --:--:--  888k100 7554k    0 7554k    0     0  1036k      0 --:--:--  0:00:07 --:--:-- 1493k


GNPS job results were succesfully downloaded as: ../data/all_annotation.zip
GNPS job results were succesfully extracted into the folder: ../data/all_annotation
   FEATURE-BASED MOLECULAR NETWORKING job detected - Version > 28
      209 spectral library annotations in the job.
      2737 nodes in the network (including single nodes).
Both SMILES and InChI were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 157
Exception to the parsing: 0
Not available: 53
Converting INCHI to mol object
Succesfully converted to mol object: 150
Exception to the parsing: 0
Not available: 60
Consolidating the lists
Total mol object from the list 1 = 157
Mol object consolidated from list 2 = 10
Consolidated structures = 167
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End
Initial number of annotations: 210
Remaining after ionisation mode filtering: 209
Remaining after max_ppm_error filte

### Metadata table

In [105]:
metadata_df = pd.read_csv(metadata_filename, sep='\t')
metadata_df.head(3)

Unnamed: 0,filename,ATTRIBUTE_Position,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species
0,20220918_LFX_AB_89_P09H05_1_POS.mzML,H05,Sample,unknown,Alternaria,Alternaria alternata
1,20220918_LFX_AB_42_P09D06_1_POS.mzML,D06,Sample,unknown,Armillaria,Armillaria bulbosa
2,20220918_LFX_AB_54_P09E06_1_POS.mzML,E06,Sample,unknown,Armillaria,Armillaria mellea


In [106]:
#if you need to create an unique identifier column like Species|part, use as model the followin line. IF the colum is PRESENT, then don't run it.
metadata_df['ATTRIBUTE_Sppart'] = metadata_df[species_column]+ "|" + metadata_df[organe_column].map(str)

In [107]:
sppart_column = 'ATTRIBUTE_Sppart' #'ATTRIBUTE_Position'
col_id_unique = filename_header #'organism_sppart'  #column containing an unique identifier for each sample, like Species_plantpart, Species_solvent. It could be the filename

### Quantification table

In [108]:
#Quantitative table
quant_df = quant_table(quantitative_data_filename, data_process_origin, use_ion_identity)
#if Ion Identity is used, it is necesary to recover the information for each correlation group
correlation_groups_df= correlation_groups(quantitative_data_filename, use_ion_identity)

ion identity not used


#### Combined and remove experimental controls


In [109]:
# merge tables
full_df = full_data(metadata_df, quant_df, filename_header)

#erase all the blanks and QC's - Change the string as needed
list_of_strings_for_QC_Blank_filter = ['Blank_process', 'QC']
column_to_use_for_filtering = 'ATTRIBUTE_Type' #this information should be included in the metadata table

filtered_full_df = drop_samples_based_on_string(full_df,'full_df',list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
filtered_metadata_df = drop_samples_based_on_string(metadata_df, 'metadata_df', list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)



(104, 2745)
(96, 2745)
(104, 7)
(96, 7)


##### Make a minimal table for further processing & apply filter by intensity 

In [110]:
reduced_df = reduce_df(filtered_full_df, filtered_metadata_df, col_id_unique).transpose()

In [92]:
#you can choose between a quantile filter (only the features higher than the third quantile (75%) ares kept):
reduced_df = features_filter(reduced_df, min_threshold=0.02)

In [93]:
#or a simple intensity filter, it removes the features less than X % of relative area (by default 20%). If you want you can combine boht filters
reduced_df = quantile_filter(reduced_df, quantile_threshold=0.75)

##### visualization of the distribution variation before and after filtering by sample 

In [12]:
drop_down = widgets.Dropdown(options=reduced_df.columns,description='Sample to plot', disabled=False)
def dropdown_handler(change):
    global sample
    print(change.new)
    sample = change.new  # This line isn't working
drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Sample to plot', options=('20220918_LFX_AB_89_P10H05_1_POS.mzML', '20220918_LFX_AB_42_P1…

In [13]:
distribution_to_plot(sample, quant_df, reduced_df)

NameError: name 'sample' is not defined

# Start calculation the diferent components

# Feature component (FC)

#### FC.1. Annotation Rate

In [111]:
#load annotations files
annot_is_df       = get_isdb_annotations(tima_results_filename, isdb_annotations)
annot_sirius_df   = get_sirius_annotations(sirius_annotations_filename, sirius_annotations) 

annotation_df = annotations(annot_is_df, annot_sirius_df, sirius_annotations, isdb_annotations, min_score_final, min_ConfidenceScore, min_ZodiacScore, correlation_groups_df, use_ion_identity)
#annotation_df.head(2)


The isdb annotations output will be not used


#### FC.2. FC computation

In [112]:
FC = feature_component(quant_df, reduced_df, annotation_df, metadata_df, family_column, genus_column, species_column, col_id_unique, min_specificity, annotation_preference, filename_header, annot_sirius_df, sirius_annotations, annot_gnps_df, min_ZodiacScore, multiple_organism_parts, max_parts_per_organism, use_ion_identity)
FC.head()

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,initial_F,filtered_F,Total_SF,Total_SNAF,Total_SNA_GQMFF,MF_prediction_ratio,FS,FC
12,20220918_LFX_AB_57_P09E09_1_POS.mzML,unknown,Chaetomium,Chaetomium globosum,155,155.0,122.0,122.0,389.0,2.51,0.79,0.79
97,20220918_LFX_AB_40_P09D04_1_POS.mzML,unknown,Truncatella,Truncatella angustata,170,170.0,133.0,129.0,462.0,2.72,0.78,0.76
47,20220918_LFX_AB_61_P09F01_1_POS.mzML,unknown,Laxitextum,Laxitextum cf. incrustatum,175,175.0,135.0,129.0,440.0,2.51,0.77,0.74
3,20220918_LFX_AB_80_P09G08_1_POS.mzML,unknown,Armillaria,Armillaria mellea,296,296.0,178.0,178.0,379.0,1.28,0.6,0.6
1,20220918_LFX_AB_42_P09D06_1_POS.mzML,unknown,Armillaria,Armillaria bulbosa,247,247.0,149.0,139.0,359.0,1.45,0.6,0.56


In [113]:
#Show the results in an interactive way
def selection_changed_FC(selection):
    return FC.iloc[selection]
interact(selection_changed_FC, selection=lineup_widget.LineUpWidget(FC));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…

# Literature component (LC)


#### LC.1. LC computation

In [114]:
LC = literature_component(LC_component, metadata_df, filename_header, species_column, genus_column, family_column, max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f, ws, wg, wf)
LC

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,20220918_LFX_AB_89_P09H05_1_POS.mzML,unknown,Alternaria,Alternaria alternata,263.0,870.0,0.0,0.56
1,20220918_LFX_AB_42_P09D06_1_POS.mzML,unknown,Armillaria,Armillaria bulbosa,0.0,284.0,0.0,0.94
2,20220918_LFX_AB_54_P09E06_1_POS.mzML,unknown,Armillaria,Armillaria mellea,197.0,284.0,0.0,0.75
3,20220918_LFX_AB_80_P09G08_1_POS.mzML,unknown,Armillaria,Armillaria mellea,197.0,284.0,0.0,0.75
4,20220918_LFX_AB_66_P09F06_1_POS.mzML,unknown,Aspergillus,Aspergillus pseudoglaucus,0.0,5538.0,0.0,0.00
...,...,...,...,...,...,...,...,...
94,20220918_LFX_AB_38_P09D02_1_POS.mzML,unknown,Verticillium,Verticillium dahliae,37.0,68.0,0.0,0.95
95,20220918_LFX_AB_50_P09E02_1_POS.mzML,unknown,Verticillium,Verticillium lateritium,0.0,68.0,0.0,0.99
96,20220918_LFX_AB_65_P09F05_1_POS.mzML,unknown,Verticillium,Verticillium lecanii,0.0,68.0,0.0,0.99
97,20220918_LFX_AB_75_P09G03_1_POS.mzML,unknown,Verticillium,Verticillium albo-atrum,0.0,68.0,0.0,0.99


# Similarity component (SC)

#### SC.1 MEMO matrix computation

In [115]:
#load quant table
feat_table_qe = memo.FeatureTable(quantitative_data_filename, software='mzmine')
feat_table_qe.feature_table
#load MGF datafile
spectra_qe = memo.SpectraDocuments(spectral_data_filename, min_relative_intensity = 0.01,
            max_relative_intensity = 1, min_peaks_required=5, losses_from = 10, losses_to = 200, n_decimals = 2)
spectra_qe.document
#compute MEMO matrix
memo_qe = memo.MemoMatrix()
memo_qe.memo_from_aligned_samples(feat_table_qe, spectra_qe)
memo_qe.memo_matrix

#filter blanks and QCs
memo_qe = memo_qe.filter(use_samples_pattern=True, samples_pattern='BLANK') #be careful, default for use_samples_pattern is False!
feat_table_qe = feat_table_qe.filter(use_samples_pattern=True, samples_pattern='BLANK')
memo_qe.memo_matrix

#filter blanks and QCs
memo_qe = memo_qe.filter(use_samples_pattern=True, samples_pattern='QC-MIX') #be careful, default for use_samples_pattern is False!
feat_table_qe = feat_table_qe.filter(use_samples_pattern=True, samples_pattern='QC-MIX')
metric_df = memo_qe.memo_matrix

generating memo_matrix from input featuretable and spectradocument


  0%|          | 0/104 [00:00<?, ?it/s]100%|██████████| 104/104 [00:00<00:00, 2251.10it/s]


#### SC.2 SC computation

In [116]:
SC = similarity_component(metric_df, SC_component, filename_header)
SC

Unnamed: 0,filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,20220918_LFX_AB_95_P09H11_1_POS.mzML,1,1,-1,1
1,20220918_LFX_AB_05_P09A05_1_POS_QC_post_H.mzML,1,1,1,0
2,20220918_LFX_AB_05_P09A05_1_POS_QC_pre_D.mzML,1,1,1,0
3,20220918_LFX_AB_05_P09A05_1_POS.mzML,1,1,1,0
4,20220918_LFX_AB_91_P09H07_1_POS.mzML,1,1,1,0
...,...,...,...,...,...
99,20220918_LFX_AB_78_P09G06_1_POS.mzML,1,1,1,0
100,20220918_LFX_AB_90_P09H06_1_POS.mzML,1,1,1,0
101,20220918_LFX_AB_76_P09G04_1_POS.mzML,1,-1,1,1
102,20220918_LFX_AB_86_P09H02_1_POS.mzML,1,-1,-1,1


#### SC.3. Visualizing the outliers

NOTE: depending on the size of the data set, projections could take a while... 

#### SC.2.1 PCoA + UMAP combined visualization

In [117]:
pcoa_umap_2d(SC_component,
    matrix= metric_df,
    data = SC,
    metric= 'braycurtis', 
    filename_header=filename_header)


The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.034053282958710664 and the largest is 4.575169182686857.



# Class component (SC)

In [118]:
canopus_npc_df = get_canopus_pred_classes(canopus_npc_summary_filename, CC_component)
CC = class_component(quantitative_data_filename, data_process_origin, canopus_npc_df, filename_header, species_column, genus_column, family_column, metadata_df, reduced_df, min_class_confidence, min_recurrence, CC_component)
CC.head()

Unnamed: 0,filename,NPC#class,ATTRIBUTE_Species,ATTRIBUTE_Genus,ATTRIBUTE_Family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,20220918_LFX_AB_89_P09H05_1_POS.mzML,"{Pyrroloindole alkaloids, Arteminisin, Purine ...",Alternaria alternata,Alternaria,unknown,"{Naphthalenes and derivatives, Macrolide lacto...","{Naphthalenes and derivatives, Macrolide lacto...","{Pyrroloindole alkaloids, Arteminisin, Purine ...","{Pyrroloindole alkaloids, Arteminisin, Purine ...",0.5,0.5,1.0
1,20220918_LFX_AB_42_P09D06_1_POS.mzML,"{Quadrane sesquiterpenoids, Simple coumarins, ...",Armillaria bulbosa,Armillaria,unknown,nothing in DB,"{Pyrrolidine alkaloids, Friedelane triterpenoi...",nothing in DB,nothing in DB,0.0,0.0,0.0
2,20220918_LFX_AB_54_P09E06_1_POS.mzML,"{Other Octadecanoids, Tripeptides}",Armillaria mellea,Armillaria,unknown,"{Pyrrolidine alkaloids, Friedelane triterpenoi...","{Pyrrolidine alkaloids, Friedelane triterpenoi...","{Other Octadecanoids, Tripeptides}","{Other Octadecanoids, Tripeptides}",0.5,0.5,1.0
3,20220918_LFX_AB_80_P09G08_1_POS.mzML,"{Quadrane sesquiterpenoids, Tripeptides, Guaia...",Armillaria mellea,Armillaria,unknown,"{Pyrrolidine alkaloids, Friedelane triterpenoi...","{Pyrrolidine alkaloids, Friedelane triterpenoi...","{Quadrane sesquiterpenoids, Tripeptides, Guaia...","{Quadrane sesquiterpenoids, Tripeptides, Guaia...",0.5,0.5,1.0
4,20220918_LFX_AB_66_P09F06_1_POS.mzML,"{Carboline alkaloids, Open-chain polyketides, ...",Aspergillus pseudoglaucus,Aspergillus,unknown,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,0.0


# Priority Score Results

In [125]:
PS = priority_score(filename_header, species_column, genus_column, family_column, sppart_column, w1, w2, w3, w4)
PS.head()


  0%|          | 0/9 [00:00<?, ?it/s]100%|██████████| 9/9 [00:00<00:00, 456.29it/s]


Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,initial_F,filtered_F,Total_SF,Total_SNAF,Total_SNA_GQMFF,MF_prediction_ratio,FS,FC,PS
0,20220918_LFX_AB_57_P09E09_1_POS.mzML,unknown,Chaetomium,Chaetomium globosum,155,155.0,122.0,122.0,389.0,2.51,0.79,0.79,0.79
1,20220918_LFX_AB_40_P09D04_1_POS.mzML,unknown,Truncatella,Truncatella angustata,170,170.0,133.0,129.0,462.0,2.72,0.78,0.76,0.76
2,20220918_LFX_AB_61_P09F01_1_POS.mzML,unknown,Laxitextum,Laxitextum cf. incrustatum,175,175.0,135.0,129.0,440.0,2.51,0.77,0.74,0.74
3,20220918_LFX_AB_80_P09G08_1_POS.mzML,unknown,Armillaria,Armillaria mellea,296,296.0,178.0,178.0,379.0,1.28,0.6,0.6,0.6
4,20220918_LFX_AB_42_P09D06_1_POS.mzML,unknown,Armillaria,Armillaria bulbosa,247,247.0,149.0,139.0,359.0,1.45,0.6,0.56,0.56


### Display results

In [120]:
#Show the results in an interactive way
def selection_changed(selection):
    return PS.iloc[selection]
interact(selection_changed, selection=lineup_widget.LineUpWidget(PS));


Message serialization failed with:
Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant



interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…

#### Select the sample you wan to plot 

In [121]:
drop_down = widgets.Dropdown(options=quant_df.columns,description='Sample to plot', disabled=False)
def dropdown_handler(change):
    global sample
    print(change.new)
    sample = change.new 
drop_down.observe(dropdown_handler, names='value')
display(drop_down) 

Dropdown(description='Sample to plot', options=('20220918_LFX_AB_01_P09A01_1_POS.mzML', '20220918_LFX_AB_02_P0…

#### Plot the selected sample

In [122]:
ionmap2D(sample, quantitative_data_filename, annotation_df, metadata_df, reduced_df, min_specificity, annotation_preference, species_column, organe_column, CC_component, canopus_npc_summary_filename, min_class_confidence, sirius_annotations, sirius_annotations_filename, min_ConfidenceScore, min_ZodiacScore, use_ion_identity, correlation_groups_df, data_process_origin, filename_header)

NameError: name 'sample' is not defined

In [124]:
Cyto_formating = Cyt_format(reduced_df, PS, col_id_unique)