# INVENTA - Prioritization of natural extracts for chemical originality discovery


In [14]:
from __future__ import print_function
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import sys 
import lineup_widget
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget


sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from FC import *
from LC import *
from SC import *
from CC import *
from plot import *

from gnps_download_results import *
from consolidates_structures import *
from gnps_results_postprocess import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [15]:
#paths:

metadata_filename = '../data/Celastraceae_Set_metadata_pos.tsv'                      # The path were you want your folder to be placed
quantitative_data_filename = '../data/Celastraceae_pos_quant.csv'                    # The path were you want your GNPS job folder to be placed
tima_results_filename = '../data/Celastraceae_pos_spectral_match_results_repond.tsv' # ISDB_annot_LP_plantfungi_set # The name you want to give to your project, output resulst in data_out/project_name
vectorized_data_filename = '../data/Celastraceae_memomatrix.csv'                     # the path for your output to be stored in
canopus_npc_summary_filename = '../data/canopus_npc_summary.tsv'                     # Path to your spectral library file
sirius_annotations_filename = '../data/compound_identifications.tsv'                 # Path to the metadata of the spectral file

job_id=  '4c919fcbc83d487493a487012afb920a'  #"yourjobidgoeshere"  

In [16]:
# metadata 

species_column = 'ATTRIBUTE_Species'
genus_column = 'ATTRIBUTE_Genus'
family_column = 'ATTRIBUTE_Family'
organe_column = 'ATTRIBUTE_Organe'
filename_header = 'filename'

In [27]:
# parameters

#For cleaning-up annotations from GNPS 

max_ppm_error = 5                 # min error in ppm to consider an annotation valable
shared_peaks = 4                 # min number of shared peaks between the MS2 experimental and MS2 from the database, to consider an annotation valable
min_cosine = 0.6                  # min cosine score to consider an annotation valable
ionisation_mode = 'neg'           # ionisation mode according to experimental conditions
max_spec_charge = 2

In [18]:
#Feature_component

min_specificity = 0.9               # minimun feature specificity to consider

## inputs to use: 
isdb_annotations = True          # True: the tima_results_filename will be considered in the calculations
sirius_annotations = True         #True: the sirius_annotations_filename will be considered in the calculations

## cut-offs: 
min_score_final = 0.3             #cut-off filter for considering an isdb annotation valable. You must be extremenly carefull with this parameter, '0.0' as default.
min_ZodiacScore = 0.9             #cut-off filter for considering a sirius annotation valable. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         #cut-off filter for considering a sirius annotation valable. '0.0' as default.

annotation_preference = 0          # Only Annotated features: '1' or  Only Not annotated features: '0'

In [19]:
#Literature_component

LC_component = True               # LC will be calculated

max_comp_reported_sp = 20          # max number of compounds reported at species level, more than this value, the plant is considered less interesting
max_comp_reported_g = 100          # max number of compounds reported at genus level,more than this value, the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at genus level,more than this value, the plant is considered less interesting

In [20]:
#Similarity_component

SC_component = True                # SC will be calculated

#Class_component

CC_component = True               # CC will be calculated
min_class_confidence = 0.8       #cut-off filter for considering a sirius class valable. It is used in combination with min_recurrence.
min_recurrence = 5               # minimum recurrence of a chemical class to consider it acceptable

In [21]:
#specify the weight to modulate each component 
w1 = 1           # 1 means the value itself is taken into account. A 0.5 means onle half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1

# Prepare input files

### Download and clean the GNPS results

In [28]:
df_annotations = gnps_download_results(job_id, output_folder ='../data/all_annotations', return_annotation_table=True)
#df_annotations.head()

This is the GNPS job link: https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=0edc96db3a794655bbd6ed13c2988bb0
Downloading the following content: https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=0edc96db3a794655bbd6ed13c2988bb0&view=view_all_annotations_DB


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  211k    0  211k    0     0   126k      0 --:--:--  0:00:01 --:--:--  126k


GNPS job results were succesfully downloaded as: ../data/all_annotations.zip
GNPS job results were succesfully extracted into the folder: ../data/all_annotations
   FEATURE-BASED MOLECULAR NETWORKING job detected - Version > 28
      39 spectral library annotations in the job.
      213 nodes in the network (including single nodes).


##### Clean and consolidate gnps annotations

In [29]:
# Consolidate structure identifiers
gnps_annotations_consolidated  = consolidate_and_convert_structures(df_annotations, prefix='', smiles='Smiles', inchi='INCHI')


Both SMILES and InChI were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 30
Exception to the parsing: 0
Not available: 10
Converting INCHI to mol object
Succesfully converted to mol object: 27
Exception to the parsing: 0
Not available: 13
Consolidating the lists
Total mol object from the list 1 = 30
Mol object consolidated from list 2 = 3
Consolidated structures = 33
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


In [30]:
# Filter GNPS annotations
gnps_annotations_filtered = gnps_filter_annotations(gnps_annotations_consolidated, 'Consol_InChI', ionisation_mode, max_ppm_error, min_cosine, shared_peaks, max_spec_charge)

Initial number of annotations: 40
Remaining after ionisation mode filtering: 1
Remaining after max_ppm_error filtering: 0
Remaining after min_cosine filtering: 0
Remaining after number of shared_peaks filtering: 0
Remaining after number of spectrum charge filtering: 0


In [31]:
# Generate annotation attributes
annot_gnps_df = get_gnps_annotations(get_molecular_formula_from_inchi(gnps_annotations_filtered, 'Consol_InChI'))
#annot_gnps_df.head(2)

Initial number of annotations filtering: 0
After carbon containing adducts filtering: 0
Valid molecular formula: 0


### Load computational annotation results files 

In [51]:
annot_is_df       = get_isdb_annotations(tima_results_filename, isdb_annotations)
annot_sirius_df   = get_sirius_annotations(sirius_annotations_filename, sirius_annotations) 
canopus_npc_df    = get_canopus_pred_classes(canopus_npc_summary_filename, CC_component)

### Metadata table

In [52]:
metadata_df = pd.read_csv(metadata_filename, sep='\t')
metadata_df.head(3)

Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground
2,LQ-01-61-03_pos.mzXML,V107696,Sample,Celastraceae,Catha,Catha edulis,Roots,roots,root tissue,belowground


In [53]:
#if you need to create an unique identifier column like Species|part, use as model the followin line. IF the colum is PRESENT, then don't run it.
metadata_df['ATTRIBUTE_Sppart'] = metadata_df[species_column]+ "|" + metadata_df[organe_column].map(str)

In [54]:
col_id_unique = 'ATTRIBUTE_Sppart'  # filename_header   #column containing an unique identifier for each sample, like Species_plantpart, Species_solvent. It could be the filename

### Quantification table

In [55]:
#Quantitative table
quant_df = pd.read_csv(quantitative_data_filename, sep=',',  index_col='row ID')
quant_df = quant_table(quant_df)
quant_df.head(3)

Unnamed: 0_level_0,LQ-01-61-01_pos.mzXML,LQ-01-61-02_pos.mzXML,LQ-01-61-03_pos.mzXML,LQ-01-61-04_pos.mzXML,LQ-01-61-05_pos.mzXML,LQ-01-61-06_pos.mzXML,LQ-01-61-07_pos.mzXML,LQ-01-61-08_pos.mzXML,LQ-01-61-09_pos.mzXML,LQ-01-61-10_pos.mzXML,...,LQ-01-61-69_pos.mzXML,LQ-01-61-70_pos.mzXML,LQ-01-61-71_pos.mzXML,LQ-01-61-72_pos.mzXML,LQ-01-61-73_pos.mzXML,LQ-01-61-74_pos.mzXML,LQ-01-61-75_pos.mzXML,LQ-01-61-76_pos.mzXML,LQ-01-61-77_pos.mzXML,LQ-01-61-78_pos.mzXML
row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2600773000.0,31347670.0,84137200.0,69254580.0,0.0,0.0,0.0,0.0,3256476.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,886019600.0,19647810.0,8688867.0,2214498000.0,0.0,0.0,0.0,0.0,1639854.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,854458300.0,25941630.0,0.0,461216300.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Combined and remove experimental controls 


In [56]:
# merge tables
full_df = full_data(metadata_df,quant_df)
#full_df.head(2)

In [57]:
#erase all the blanks and QC's - Change the string as needed
list_of_strings_for_QC_Blank_filter = ['Blank', 'QC']
column_to_use_for_filtering = 'ATTRIBUTE_Type' #this information should be included in the metadata table

full_df = drop_samples_based_on_string(full_df,'full_df',list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
metadata_df = drop_samples_based_on_string(metadata_df, 'metadata_df', list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)

(78, 14981)
(76, 14981)
(77, 11)
(76, 11)


##### Make a minimal table for further processing

In [58]:
reduced_df = reduce_df(col_id_unique)
reduced_df.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
ATTRIBUTE_Sppart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Catha edulis|Leaves,2600773000.0,886019600.0,854458300.0,632016100.0,528280000.0,522718400.0,512211000.0,393613100.0,472246300.0,234727500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Catha edulis|Stems,31347670.0,19647810.0,25941630.0,0.0,7494122.0,210707100.0,0.0,50182830.0,18593770.0,322292700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Start calculation the diferent components

# Feature component (FC)

#### FC.1. Feature Specifificty

In [59]:
specificity_df = top_ions(col_id_unique)
specificity_df.head(2)

Unnamed: 0,row ID,ATTRIBUTE_Sppart,Feature_specificity,filename
0,1,Catha edulis|Leaves,0.932588,LQ-01-61-01_pos.mzXML
1,2,Catha edulis|Aerial_parts,0.69662,LQ-01-61-04_pos.mzXML


#### FC.2. Annotation Rate

In [60]:
annotation_df = annotations(annot_is_df, annot_sirius_df, sirius_annotations, isdb_annotations, min_score_final, min_ConfidenceScore, min_ZodiacScore)
annotation_df.head(2)

Unnamed: 0,cluster index,componentindex,Annotated_GNPS,Annotated_ISDB,Annotated_Sirius,annotation
0,1,113,0,1,0,0
1,2,136,0,1,0,0


#### FC.3. Molecular formula prediction rate

In [61]:
mf_prediction_rate_df = mf_rate(annot_sirius_df, sirius_annotations, min_ZodiacScore, min_specificity, annotation_preference)
mf_prediction_rate_df.head()

Unnamed: 0_level_0,MF_prediction_ratio
filename,Unnamed: 1_level_1
LQ-01-61-28_pos.mzXML,0.632911
LQ-01-61-47_pos.mzXML,0.565
LQ-01-61-37_pos.mzXML,0.52844
LQ-01-61-60_pos.mzXML,0.526316
LQ-01-61-33_pos.mzXML,0.504098


#### FC.4. FC computation

In [62]:
FC = feature_component(min_specificity, annotation_preference, col_id_unique)
FC.head()

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Sppart,FC,Feature_specificity,MF_prediction_ratio
5,LQ-01-61-06_pos.mzXML,Celastraceae,Celastrus,Celastrus orbiculatus,Celastrus orbiculatus|Roots,0.835938,0.863281,0.480469
75,LQ-01-61-78_pos.mzXML,Celastraceae,Pristimera,Pristimera indica,Pristimera indica|Roots,0.822222,0.841667,0.322222
26,LQ-01-61-27_pos.mzXML,Celastraceae,Euonymus,Euonymus fortunei,Euonymus fortunei|Aerial_parts,0.814516,0.903226,0.491935
59,LQ-01-61-60_pos.mzXML,Celastraceae,Mystroxylon,Mystroxylon aethiopicum,Mystroxylon aethiopicum|Bark,0.805263,0.821053,0.526316
36,LQ-01-61-37_pos.mzXML,Celastraceae,Euonymus,Euonymus sanguineus,Euonymus sanguineus|Roots,0.792661,0.838532,0.52844


# Literature component (LC)


#### LC.1. LC computation

In [63]:
LC = literature_component(LC_component, metadata_df, filename_header, species_column, genus_column, family_column, max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f)
LC

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,LQ-01-61-01_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064,0.80312
1,LQ-01-61-02_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064,0.80312
2,LQ-01-61-03_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064,0.80312
3,LQ-01-61-04_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064,0.80312
4,LQ-01-61-05_pos.mzXML,Celastraceae,Celastrus,Celastrus orbiculatus,212.0,732.0,6064,0.69952
...,...,...,...,...,...,...,...,...
71,LQ-01-61-72_pos.mzXML,Celastraceae,Salacia,Salacia letestuana,0.0,514.0,6064,0.82732
72,LQ-01-61-73_pos.mzXML,Celastraceae,Tripterygium,Tripterygium hypoglaucum,162.0,1353.0,6064,0.66242
73,LQ-01-61-74_pos.mzXML,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064,0.23792
74,LQ-01-61-75_pos.mzXML,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064,0.23792


# Similarity component (SC)

#### SC.1. SC computation

In [64]:
metric_df = pd.read_csv(vectorized_data_filename, sep=',', encoding= 'unicode_escape')
SC = similarity_component(metric_df, SC_component)
SC


X does not have valid feature names, but IsolationForest was fitted with feature names



Unnamed: 0,filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-01_pos.mzXML,-1,1,1,1
1,LQ-01-61-02_pos.mzXML,-1,1,1,1
2,LQ-01-61-03_pos.mzXML,-1,1,1,1
3,LQ-01-61-04_pos.mzXML,-1,1,1,1
4,LQ-01-61-05_pos.mzXML,-1,1,1,1
...,...,...,...,...,...
71,LQ-01-61-72_pos.mzXML,1,-1,1,1
72,LQ-01-61-73_pos.mzXML,1,1,1,0
73,LQ-01-61-74_pos.mzXML,-1,1,1,1
74,LQ-01-61-75_pos.mzXML,1,1,1,0


#### SC.2. Visualizing the outliers

NOTE: depending on the size of the data set, projections could take a while... 

##### SC.2.1 UMAP

In [65]:
umap_2d(matrix = metric_df,
        data = SC, 
        metadata=metadata_df)

#### SC.2.2 PCoA

In [66]:
pcoa_2d(
    matrix= metric_df,
    data = SC,
    metric= 'braycurtis')


The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.0016647020911528683 and the largest is 0.9032327317957183.



#### SC.2.3 PCoA + UMAP combined visualization

In [67]:
pcoa_umap_2d(
    matrix= metric_df,
    data = SC,
    metric= 'braycurtis')

#### SC.2.4 PCoA 3D

In [68]:
pcoa_3d(
    matrix= metric_df,
    data = SC,
    metric= 'braycurtis')


The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.0016647020911528683 and the largest is 0.9032327317957183.



# Class component (SC)

### CC.1. Retrive and clean the predicted chemical classes from Sirius

In [69]:
CC = class_component(canopus_npc_df, filename_header, species_column,genus_column,family_column, min_class_confidence, min_recurrence, CC_component)
CC

Unnamed: 0,filename,class,ATTRIBUTE_Species,ATTRIBUTE_Genus,ATTRIBUTE_Family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CC
0,LQ-01-61-01_pos.mzXML,{Agarofuran sesquiterpenoids},Catha edulis,Catha,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{},{},0.0
1,LQ-01-61-02_pos.mzXML,{Oleanane triterpenoids},Catha edulis,Catha,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{Oleanane triterpenoids},{Oleanane triterpenoids},1.0
2,LQ-01-61-03_pos.mzXML,"{Triketide meroterpenoids, Ursane and Taraxast...",Catha edulis,Catha,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,"{Ursane and Taraxastane triterpenoids, Tetrake...","{Ursane and Taraxastane triterpenoids, Tetrake...",1.0
3,LQ-01-61-04_pos.mzXML,"{Pyridine alkaloids, Agarofuran sesquiterpenoids}",Catha edulis,Catha,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,{},{},0.0
4,LQ-01-61-05_pos.mzXML,"{Oleanane triterpenoids, Pyridine alkaloids}",Celastrus orbiculatus,Celastrus,Celastraceae,"{Carotenoids (C40, Ψ-Ψ), Menthane monoterpenoi...","{Carotenoids (C40, Ψ-Ψ), Menthane monoterpenoi...",{Pyridine alkaloids},{Pyridine alkaloids},1.0
...,...,...,...,...,...,...,...,...,...,...
71,LQ-01-61-72_pos.mzXML,,,,,,,,,
72,LQ-01-61-73_pos.mzXML,{Pyridine alkaloids},Tripterygium hypoglaucum,Tripterygium,Celastraceae,{Friedelane triterpenoids|Oleanane triterpenoi...,{Friedelane triterpenoids|Oleanane triterpenoi...,{},{},0.0
73,LQ-01-61-74_pos.mzXML,"{Abietane diterpenoids, Oleanane triterpenoids}",Tripterygium wilfordii,Tripterygium,Celastraceae,{Friedelane triterpenoids|Oleanane triterpenoi...,{Friedelane triterpenoids|Oleanane triterpenoi...,{},{},0.0
74,LQ-01-61-75_pos.mzXML,"{Ursane and Taraxastane triterpenoids, Vitamin...",Tripterygium wilfordii,Tripterygium,Celastraceae,{Friedelane triterpenoids|Oleanane triterpenoi...,{Friedelane triterpenoids|Oleanane triterpenoi...,{Vitamin D2 and derivatives},{Vitamin D2 and derivatives},1.0


# Priority rank Results

In [70]:
PR = priority_rank(LC_component, SC_component, CC_component, w1, w2, w3, w4)
#PR.head()

In [71]:
Cyt_format_visualization = Cyt_format(col_id_unique)

### Display results

In [72]:
#Show the results in an interactive way
def selection_changed(selection):
    return PR.iloc[selection]
interact(selection_changed, selection=lineup_widget.LineUpWidget(PR));


Message serialization failed with:
Out of range float values are not JSON compliant
Supporting this message is deprecated in jupyter-client 7, please make sure your message is JSON-compliant



interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…