# INVENTA - Prioritization of natural extracts for chemical originality discovery


In [1]:
from __future__ import print_function
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import sys 
import lineup_widget
import ipywidgets as widgets
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from FC import *
from LC import *
from SC import *
from CC import *
from plot import *

from gnps_download_results import *
from consolidates_structures import *
from gnps_results_postprocess import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [10]:
# Input file paths (relative to the notebook folder)

metadata_filename = '../data/Celastraceae_Set_metadata_pos.tsv'                      # metadata table (tab-separated)
quantitative_data_filename = '../data/Celastraceae_pos_quant_id.csv'                    # quantitative feature table
tima_results_filename = '../data/Celastraceae_pos_spectral_match_results_repond.tsv' # reweighted ISDB spectral-match results
vectorized_data_filename = '../data/Celastraceae_memomatrix.csv'                     # MEMO matrix
canopus_npc_summary_filename = '../data/canopus_formula_summary_adducts.tsv'                     # SIRIUS-CANOPUS results file
sirius_annotations_filename = '../data/compound_identifications_adducts.tsv'                 # SIRIUS results file

# GNPS job ID to download; replace it with your own job ID.
# IDs used so far: '4c919fcbc83d487493a487012afb920a' (MZmine2), 'df71854c6e644b979228d96b521a490b' (MZmine3)
job_id= 'df71854c6e644b979228d96b521a490b'

# Column names used in the metadata table

species_column = 'ATTRIBUTE_Species'
genus_column = 'ATTRIBUTE_Genus'
family_column = 'ATTRIBUTE_Family'
organe_column = 'ATTRIBUTE_Organe'
filename_header = 'filename'

# Parameters for cleaning up the GNPS annotations (passed to gnps_filter_annotations)

max_ppm_error = 5                 # maximum error in ppm for an annotation to be considered valid
shared_peaks = 4                 # minimum number of peaks shared between the experimental MS2 and the database MS2 for an annotation to be considered valid
min_cosine = 0.6                  # minimum cosine score for an annotation to be considered valid
ionisation_mode = 'pos'           # ionisation mode of the experiment: 'pos' or 'neg'
max_spec_charge = 2               # presumably the highest spectrum charge kept by the filter - TODO confirm against gnps_filter_annotations

# Quantitative table
data_process_origin = 'MZMine3'   # software that produced the quantitative table; alternative: 'MZMine2'
use_ion_identity= True            # True or False; when True the correlation groups are also read from the quantitative table

# Feature component (FC) parameters

min_specificity = 0.90            # minimum feature specificity to consider

# Handling of several samples from the same organism
multiple_organism_parts = True  # True: the specificity is taken as the sum over up to 'max_parts_per_organism' samples sharing the feature
max_parts_per_organism = 4       # maximum recurrence of the same species (for example: 5 samples, same species but different plant parts)

# Annotation inputs to use:
isdb_annotations = True          # True: tima_results_filename is taken into account in the calculations
sirius_annotations = True        # True: sirius_annotations_filename is taken into account in the calculations

# Cut-offs:
min_score_final = 0.3             # cut-off for considering an ISDB annotation valid. Be extremely careful with this parameter; '0.0' is the default.
min_ZodiacScore = 0.9             # cut-off for considering a SIRIUS annotation valid. Used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         # cut-off for considering a SIRIUS annotation valid. '0.0' is the default.

annotation_preference = 0          # '1': only annotated features; '0': only non-annotated features

# Literature component (LC) parameters

LC_component = True               # True: the LC is calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level; above this value the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level; above this value the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at family level; above this value the plant is considered less interesting

# Weight of each taxonomic level (presumably ws = species, wg = genus, wf = family - confirm against literature_component)
ws = 1
wg = 1
wf = 1 

# Similarity component (SC) parameters

SC_component = True                # True: the SC is calculated

# Class component (CC) parameters

CC_component =  True              # True: the CC is calculated
min_class_confidence = 0.8       # cut-off for considering a SIRIUS class valid. Used in combination with min_recurrence.
min_recurrence = 5              # minimum recurrence of a chemical class to consider it acceptable

# Weights modulating each component in the priority rank (passed to priority_rank as w1..w4)
# NOTE(review): presumably w1 -> FC, w2 -> LC, w3 -> SC, w4 -> CC - confirm against priority_rank
w1 = 1           # 1 means the full value is taken into account; 0.5 means only half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


# Prepare input files

### Download and clean the GNPS results

In [4]:
# Download the GNPS job results into ../data/all_annotations and load the annotation table
df_annotations = gnps_download_results(job_id, output_folder ='../data/all_annotations', return_annotation_table=True)
#df_annotations.head()

# Consolidate the structure identifiers, reading the 'Smiles' and 'INCHI' columns of the annotation table
gnps_annotations_consolidated  = consolidate_and_convert_structures(df_annotations, prefix='', smiles='Smiles', inchi='INCHI')

# Filter the GNPS annotations with the clean-up parameters defined in the configuration cell
gnps_annotations_filtered = gnps_filter_annotations(gnps_annotations_consolidated, 'Consol_InChI', ionisation_mode, max_ppm_error, min_cosine, shared_peaks, max_spec_charge)

# Generate the annotation attributes from the filtered table (molecular formula derived from 'Consol_InChI');
# annot_gnps_df is used later by the feature component
annot_gnps_df = get_gnps_annotations(get_molecular_formula_from_inchi(gnps_annotations_filtered, 'Consol_InChI'))
#annot_gnps_df.head(2)


This is the GNPS job link: https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=df71854c6e644b979228d96b521a490b
Downloading the following content: https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=df71854c6e644b979228d96b521a490b&view=view_all_annotations_DB


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 46.0M    0 46.0M    0     0  1462k      0 --:--:--  0:00:32 --:--:-- 3002k


GNPS job results were succesfully downloaded as: ../data/all_annotations.zip
GNPS job results were succesfully extracted into the folder: ../data/all_annotations
   FEATURE-BASED MOLECULAR NETWORKING job detected - Version > 28
      2493 spectral library annotations in the job.
      16138 nodes in the network (including single nodes).
Both SMILES and InChI were inputted
Converting SMILES to mol object
  Salt(s) deleted in       : CN(CCCNC(=O)C1CCCO1)C2=NC3=CC(=C(C=C3C(=N2)N)OC)OC.Cl
  Remaining residue        : COc1cc2nc(N(C)CCCNC(=O)C3CCCO3)nc(N)c2cc1OC
Succesfully converted to mol object: 1923
Exception to the parsing: 0
Not available: 571
Converting INCHI to mol object


RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ERROR: [14:00:41] ERROR: 
RDKit ER

Succesfully converted to mol object: 2004
Exception to the parsing: 0
Not available: 490
Consolidating the lists
Total mol object from the list 1 = 1923
Mol object consolidated from list 2 = 218
Consolidated structures = 2141
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End
Initial number of annotations: 2494
Remaining after ionisation mode filtering: 2436
Remaining after max_ppm_error filtering: 2036
Remaining after min_cosine filtering: 2036
Remaining after number of shared_peaks filtering: 2036
Remaining after number of spectrum charge filtering: 2036
Initial number of annotations filtering: 2036
After carbon containing adducts filtering: 2036
Valid molecular formula: 1751


### Metadata table

In [11]:
# Load the tab-separated sample metadata and preview its first three rows.
metadata_df = pd.read_table(metadata_filename)
metadata_df.head(3)

Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground
2,LQ-01-61-03_pos.mzXML,V107696,Sample,Celastraceae,Catha,Catha edulis,Roots,roots,root tissue,belowground


In [12]:
# Optional: create a unique identifier column of the form 'Species|part' ('ATTRIBUTE_Sppart').
# Use the following line as a template; do NOT run this cell if the column is already present in the metadata.
metadata_df['ATTRIBUTE_Sppart'] = metadata_df[species_column]+ "|" + metadata_df[organe_column].map(str)

In [13]:
# Column containing a unique identifier for each sample (e.g. Species|plant part, Species|solvent, or simply the filename).
# Alternative value: 'ATTRIBUTE_Sppart'
col_id_unique = filename_header

### Quantification table

In [21]:
# Load the quantitative table with the project helper quant_table, using the data origin and
# ion-identity options from the configuration cell, then preview the first three rows.
quant_df = quant_table(quantitative_data_filename, data_process_origin, use_ion_identity)
quant_df.head(3)

Unnamed: 0_level_0,LQ-01-61-01_pos.mzXML,LQ-01-61-02_pos.mzXML,LQ-01-61-03_pos.mzXML,LQ-01-61-04_pos.mzXML,LQ-01-61-05_pos.mzXML,LQ-01-61-06_pos.mzXML,LQ-01-61-07_pos.mzXML,LQ-01-61-08_pos.mzXML,LQ-01-61-09_pos.mzXML,LQ-01-61-10_pos.mzXML,...,LQ-01-61-69_pos.mzXML,LQ-01-61-70_pos.mzXML,LQ-01-61-71_pos.mzXML,LQ-01-61-72_pos.mzXML,LQ-01-61-73_pos.mzXML,LQ-01-61-74_pos.mzXML,LQ-01-61-75_pos.mzXML,LQ-01-61-76_pos.mzXML,LQ-01-61-77_pos.mzXML,LQ-01-61-78_pos.mzXML
annotation network number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.001896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087568,0.0
1.0,0.0,0.0,0.007044,0.0,0.0,0.0,0.0,0.0,0.377265,0.0,...,0.0,0.0,0.0,0.0,0.041531,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.01081,0.001208,0.002477,0.0,0.002341,0.019336,0.033384,0.0,0.016455,...,0.004177,0.0,0.004424,0.0,0.00534,0.00769,0.004954,0.0,0.060946,0.007016


In [22]:
# If Ion Identity is used, the information for each correlation group has to be recovered from the quantitative table.
# correlation_groups_df is needed later by annotations() and ionmap2D().
correlation_groups_df= correlation_groups(quantitative_data_filename, use_ion_identity)

#### Combine the tables and remove experimental controls


In [23]:
# Join the metadata with the quantitative table.
full_df = full_data(metadata_df, quant_df, filename_header)

# Samples whose type matches one of these strings (blanks and QCs) are discarded.
# Adapt the strings to your own metadata; the column below must exist in the metadata table.
list_of_strings_for_QC_Blank_filter = ['Blank', 'QC']
column_to_use_for_filtering = 'ATTRIBUTE_Type'

# Apply the same filter to the merged table ...
filtered_full_df = drop_samples_based_on_string(
    full_df,
    'full_df',
    list_of_strings_for_QC_Blank_filter,
    column_to_use_for_filtering,
)
# ... and to the metadata alone.
filtered_metadata_df = drop_samples_based_on_string(
    metadata_df,
    'metadata_df',
    list_of_strings_for_QC_Blank_filter,
    column_to_use_for_filtering,
)

(78, 13200)
(76, 13200)
(77, 11)
(76, 11)


##### Make a minimal table for further processing & apply filter by intensity 

In [16]:
# Build the minimal table used by the following steps from the control-free tables and transpose it.
# NOTE(review): the filter cells below overwrite reduced_df in place, so re-run this cell before re-applying them.
reduced_df = reduce_df(filtered_full_df, filtered_metadata_df, col_id_unique).transpose()

In [24]:
# Intensity filter: presumably removes the features below `min_threshold` of relative area
# (0.02 here; the original notes mention a 20% default - TODO confirm against features_filter).
# It can be combined with the quantile filter (quantile_filter).
reduced_df = features_filter(reduced_df, min_threshold=0.02)

In [25]:
# Quantile filter: per the original notes, only the features higher than the chosen quantile are kept
# (0.75 = third quartile) - confirm against quantile_filter.
# It can be combined with the intensity filter (features_filter).
reduced_df = quantile_filter(reduced_df, quantile_threshold=0.75)

##### visualization of the distribution variation before and after filtering by sample 

In [24]:
# Selector for the sample whose distribution is plotted in the next cell.
drop_down = widgets.Dropdown(options=reduced_df.columns, description='Sample to plot', disabled=False)

# Seed `sample` with the entry the dropdown currently shows. The observer below only fires
# when the selection changes, so without this line `sample` stays undefined until somebody
# touches the widget and the plotting cell fails on "Restart & Run All".
sample = drop_down.value


def dropdown_handler(change):
    """Store the newly selected dropdown entry in the notebook-level `sample`.

    `change` is the change notification sent by the widget; its `new`
    attribute holds the value that has just been selected.
    """
    global sample
    print(change.new)
    sample = change.new


drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Sample to plot', options=('LQ-01-61-01_pos.mzXML', 'LQ-01-61-02_pos.mzXML', 'LQ-01-61-03…

LQ-01-61-78_pos.mzXML


In [25]:
# Plot the distribution of the selected sample before and after filtering.
# `sample` is the notebook-level variable written by dropdown_handler in the previous cell.
distribution_to_plot(sample, quant_df, reduced_df)

# Start calculating the different components

# Feature component (FC)

#### FC.1. Annotation Rate

In [26]:
# Read the ISDB and SIRIUS annotation files (each helper also receives its on/off switch).
annot_is_df = get_isdb_annotations(tima_results_filename, isdb_annotations)
annot_sirius_df = get_sirius_annotations(sirius_annotations_filename, sirius_annotations)

# Combine both sources into one annotation-status table, applying the score cut-offs.
annotation_df = annotations(
    annot_is_df,
    annot_sirius_df,
    sirius_annotations,
    isdb_annotations,
    min_score_final,
    min_ConfidenceScore,
    min_ZodiacScore,
    correlation_groups_df,
    use_ion_identity,
)
annotation_df.head(2)


Unnamed: 0,annotation network number,annotation
0,0.0,0
1,1.0,0


#### FC.2. FC computation

In [None]:
# Compute the feature component; arguments are positional and must stay in this order.
FC = feature_component(
    quant_df,
    reduced_df,
    annotation_df,
    metadata_df,
    family_column,
    genus_column,
    species_column,
    col_id_unique,
    min_specificity,
    annotation_preference,
    filename_header,
    annot_sirius_df,
    sirius_annotations,
    annot_gnps_df,
    min_ZodiacScore,
    multiple_organism_parts,
    max_parts_per_organism,
    use_ion_identity,
)
FC.head()

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,initial_F,filtered_F,Total_SF,Total_SNAF,FS,FC
13,LQ-01-61-14_pos.mzXML,Celastraceae,Evonymopsis,Evonymopsis sp,1312,3.0,2.0,2.0,0.67,0.67
22,LQ-01-61-23_pos.mzXML,Celastraceae,Euonymus,Euonymus cochinchinensis,598,14.0,11.0,9.0,0.79,0.64
6,LQ-01-61-07_pos.mzXML,Celastraceae,Celastrus,Celastrus paniculatus,1389,67.0,48.0,42.0,0.72,0.63
2,LQ-01-61-03_pos.mzXML,Celastraceae,Catha,Catha edulis,1321,16.0,12.0,10.0,0.75,0.62
30,LQ-01-61-31_pos.mzXML,Celastraceae,Euonymus,Euonymus frigidus,1425,39.0,27.0,23.0,0.69,0.59


In [None]:
# Interactive LineUp view of the FC table
def selection_changed_FC(selection):
    """Return the FC rows at the positions currently selected in the LineUp widget."""
    picked_rows = FC.iloc[selection]
    return picked_rows


interact(
    selection_changed_FC,
    selection=lineup_widget.LineUpWidget(FC),
);

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…

# Literature component (LC)


#### LC.1. LC computation

In [17]:
# Compute the literature component from the metadata, the reported-compound thresholds
# and the taxonomic-level weights; arguments are positional and must stay in this order.
LC = literature_component(
    LC_component,
    metadata_df,
    filename_header,
    species_column,
    genus_column,
    family_column,
    max_comp_reported_sp,
    max_comp_reported_g,
    max_comp_reported_f,
    ws,
    wg,
    wf,
)
LC

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,LQ-01-61-01_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
1,LQ-01-61-02_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
2,LQ-01-61-03_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
3,LQ-01-61-04_pos.mzXML,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
4,LQ-01-61-05_pos.mzXML,Celastraceae,Celastrus,Celastrus orbiculatus,212.0,732.0,6064.0,0.52032
...,...,...,...,...,...,...,...,...
72,LQ-01-61-73_pos.mzXML,Celastraceae,Tripterygium,Tripterygium hypoglaucum,162.0,1353.0,6064.0,0.44612
73,LQ-01-61-74_pos.mzXML,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,-0.40288
74,LQ-01-61-75_pos.mzXML,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,-0.40288
75,LQ-01-61-76_pos.mzXML,0,0,0,0.0,0.0,0.0,1.00000


# Similarity component (SC)

#### SC.1. SC computation

In [18]:
# Load the MEMO matrix (comma-separated) and compute the similarity component from it.
# The saved output shows one row per sample with the columns anomaly_IF, anomaly_LOF, anomaly_OCSVM and SC.
metric_df = pd.read_csv(vectorized_data_filename, sep=',', encoding= 'unicode_escape')
SC = similarity_component(metric_df, SC_component)
SC

Unnamed: 0,filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-08_pos.mzXML,1,1,1,0
1,LQ-01-61-09_pos.mzXML,1,-1,1,1
2,LQ-01-61-15_pos.mzXML,1,1,1,0
3,LQ-01-61-05_pos.mzXML,1,1,1,0
4,LQ-01-61-12_pos.mzXML,1,1,1,0
...,...,...,...,...,...
73,LQ-01-61-37_pos.mzXML,1,1,1,0
74,LQ-01-61-73_pos.mzXML,1,1,1,0
75,LQ-01-61-74_pos.mzXML,1,1,1,0
76,LQ-01-61-55_pos.mzXML,1,1,1,0


#### SC.2. Visualizing the outliers

NOTE: depending on the size of the data set, projections could take a while... 

#### SC.2.1 PCoA + UMAP combined visualization

In [None]:
# Combined PCoA + UMAP projection of the MEMO matrix with the SC results, using the Bray-Curtis metric.
pcoa_umap_2d(matrix=metric_df, data=SC, metric='braycurtis', filename_header=filename_header)


The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it's probably safe to ignore them, but if they are large in magnitude, the results won't be useful. See the Notes section for more details. The smallest eigenvalue is -0.007224834998204779 and the largest is 2.5161027047806597.



##### SC.2.2 Optionally, you can choose to plot only UMAP, PCoA or PCoA 3D

In [None]:
#umap_2d(matrix = metric_df,
#       data = SC, 
#       metadata=metadata_df, 
#       filename_header =filename_header)

In [None]:
#pcoa_2d(
#    matrix= metric_df,
#    data = SC,
#    metric= 'braycurtis', #
#     filename_header=filename_header)

In [None]:
#pcoa_3d(
#    matrix= metric_df,
#    data = SC,
#    metric= 'braycurtis',
#    filename_header=filename_header)

# Class component (CC)

In [19]:
# Read the CANOPUS class predictions.
canopus_npc_df = get_canopus_pred_classes(canopus_npc_summary_filename, CC_component)

# Compute the class component; arguments are positional and must stay in this order.
CC = class_component(
    quantitative_data_filename,
    data_process_origin,
    canopus_npc_df,
    filename_header,
    species_column,
    genus_column,
    family_column,
    metadata_df,
    reduced_df,
    min_class_confidence,
    min_recurrence,
    CC_component,
)
CC

Unnamed: 0,filename,NPC#class,ATTRIBUTE_Species,ATTRIBUTE_Genus,ATTRIBUTE_Family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,LQ-01-61-02_pos.mzXML,"{Agarofuran sesquiterpenoids, Other Octadecano...",Catha edulis,Catha,Celastraceae,"{Acyclic monoterpenoids, Flavonols, Isoindole ...","{Acyclic monoterpenoids, Flavonols, Isoindole ...",{Other Octadecanoids},{Other Octadecanoids},0.5,0.5,1.0
1,LQ-01-61-03_pos.mzXML,{Agarofuran sesquiterpenoids},Catha edulis,Catha,Celastraceae,"{Acyclic monoterpenoids, Flavonols, Isoindole ...","{Acyclic monoterpenoids, Flavonols, Isoindole ...",{},{},0.5,0.5,1.0
2,LQ-01-61-04_pos.mzXML,"{Agarofuran sesquiterpenoids, Oleanane triterp...",Catha edulis,Catha,Celastraceae,"{Acyclic monoterpenoids, Flavonols, Isoindole ...","{Acyclic monoterpenoids, Flavonols, Isoindole ...","{Oleanane triterpenoids, Dammarane and Protost...","{Oleanane triterpenoids, Dammarane and Protost...",0.5,0.5,1.0
3,LQ-01-61-06_pos.mzXML,"{Agarofuran sesquiterpenoids, Simple coumarins...",Celastrus orbiculatus,Celastrus,Celastraceae,"{Flavan-3-ols, Carotenoids (C40, β-β), Oleanan...","{Unsaturated fatty acids, Flavan-3-ols, Triacy...","{Simple coumarins, Open-chain polyketides, Uns...","{Simple coumarins, Open-chain polyketides}",0.5,0.5,1.0
4,LQ-01-61-07_pos.mzXML,"{Dammarane and Protostane triterpenoids, Cinna...",Celastrus paniculatus,Celastrus,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,"{Unsaturated fatty acids, Flavan-3-ols, Triacy...","{Dammarane and Protostane triterpenoids, Cinna...","{Dammarane and Protostane triterpenoids, Cinna...",0.5,0.5,1.0
5,LQ-01-61-08_pos.mzXML,"{Agarofuran sesquiterpenoids, Other Octadecano...",Celastrus paniculatus,Celastrus,Celastraceae,{Agarofuran sesquiterpenoids|Eudesmane sesquit...,"{Unsaturated fatty acids, Flavan-3-ols, Triacy...","{Other Octadecanoids, Cinnamic acids and deriv...","{Other Octadecanoids, Cinnamic acids and deriv...",0.5,0.5,1.0
6,LQ-01-61-11_pos.mzXML,{Flavones},Elaeodendron schweinfurthianum,Elaeodendron,Celastraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,0.0
7,LQ-01-61-13_pos.mzXML,{Dammarane and Protostane triterpenoids},Evonymopsis sp,Evonymopsis,Celastraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,0.0
8,LQ-01-61-14_pos.mzXML,{Agarofuran sesquiterpenoids},Evonymopsis sp,Evonymopsis,Celastraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,0.0
9,LQ-01-61-17_pos.mzXML,{Agarofuran sesquiterpenoids},Euonymus bockii,Euonymus,Celastraceae,nothing in DB,"{Flavonols, Furofuranoid lignans|Neolignans, C...",nothing in DB,nothing in DB,0.0,0.0,0.0


# Priority rank Results

In [20]:
# Combine the four components into the priority rank, using the component switches and weights.
PR = priority_rank(
    FC,
    LC,
    SC,
    CC,
    LC_component,
    SC_component,
    CC_component,
    w1,
    w2,
    w3,
    w4,
    filename_header,
)
PR.head()


Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,initial_F,filtered_F,Total_SF,Total_SNAF,FS,FC,...,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,SC,CCs,CCg,CC,New_CC_in_sp,New_CC_in_genus,PR
0,LQ-01-61-14_pos.mzXML,Celastraceae,Evonymopsis,Evonymopsis sp,1312,3.0,2.0,2.0,0.67,0.67,...,0.0,0.0,6064.0,0,0.0,0.0,0.0,nothing in DB,nothing in DB,1.54872
1,LQ-01-61-23_pos.mzXML,Celastraceae,Euonymus,Euonymus cochinchinensis,598,14.0,11.0,9.0,0.79,0.64,...,0.0,440.0,6064.0,1,0.0,0.0,0.0,nothing in DB,nothing in DB,2.43072
2,LQ-01-61-07_pos.mzXML,Celastraceae,Celastrus,Celastrus paniculatus,1389,67.0,48.0,42.0,0.72,0.63,...,71.0,732.0,6064.0,1,0.5,0.5,1.0,"{Dammarane and Protostane triterpenoids, Cinna...","{Dammarane and Protostane triterpenoids, Cinna...",3.29132
3,LQ-01-61-03_pos.mzXML,Celastraceae,Catha,Catha edulis,1321,16.0,12.0,10.0,0.75,0.62,...,126.0,126.0,6064.0,0,0.5,0.5,1.0,{},{},2.34752
4,LQ-01-61-31_pos.mzXML,Celastraceae,Euonymus,Euonymus frigidus,1425,39.0,27.0,23.0,0.69,0.59,...,0.0,440.0,6064.0,0,0.0,0.0,0.0,nothing in DB,nothing in DB,1.38072


### Display results

In [21]:
# Interactive LineUp view of the priority-rank table
def selection_changed(selection):
    """Return the PR rows at the positions currently selected in the LineUp widget."""
    picked_rows = PR.iloc[selection]
    return picked_rows


interact(
    selection_changed,
    selection=lineup_widget.LineUpWidget(PR),
);

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…

#### Select the sample you want to plot

In [27]:
# Selector for the sample shown in the ion map plotted in the next cell.
drop_down = widgets.Dropdown(options=quant_df.columns, description='Sample to plot', disabled=False)

# Seed `sample` with the entry the dropdown currently shows. The observer below only fires
# when the selection changes, so without this line `sample` would be undefined (or left over
# from an earlier cell) until somebody touches the widget.
sample = drop_down.value


def dropdown_handler(change):
    """Store the newly selected dropdown entry in the notebook-level `sample`.

    `change` is the change notification sent by the widget; its `new`
    attribute holds the value that has just been selected.
    """
    global sample
    print(change.new)
    sample = change.new


drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Sample to plot', options=('LQ-01-61-01_pos.mzXML', 'LQ-01-61-02_pos.mzXML', 'LQ-01-61-03…

LQ-01-61-78_pos.mzXML


#### Plot the selected sample

In [29]:
# Plot the ion map of the selected sample; arguments are positional and must stay in this order.
# NOTE(review): the saved output of this cell is a plotly ValueError (NaN values passed as
# scattergl marker sizes); the cause is inside ionmap2D or its inputs and still needs investigating.
ionmap2D(
    sample,
    quantitative_data_filename,
    annotation_df,
    metadata_df,
    reduced_df,
    min_specificity,
    annotation_preference,
    species_column,
    organe_column,
    CC_component,
    canopus_npc_summary_filename,
    min_class_confidence,
    sirius_annotations,
    sirius_annotations_filename,
    min_ConfidenceScore,
    min_ZodiacScore,
    use_ion_identity,
    correlation_groups_df,
    data_process_origin,
    filename_header,
)

ValueError: 
    Invalid element(s) received for the 'size' property of scattergl.marker
        Invalid elements include: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above