# Inventa: a computational tool to discover structural novelty in natural extracts libraries


In [1]:
# Imports and environment set-up for the whole notebook.
from __future__ import print_function
from IPython.core.display import HTML
# NOTE(review): this HTML object is created but never displayed (it is not the
# last expression of the cell and is not passed to display()), so the
# kernel-restart script presumably never runs -- confirm whether it is needed.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import sys 
import lineup_widget
from ipywidgets import *
# Enable the LineUp notebook extension; the widget is used in the last cell to
# browse the priority-score table.
!jupyter nbextension enable --py --sys-prefix lineup_widget

# Add the source folders (relative to the notebook) to the module search path
# so that the star imports below can be resolved.
sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

# Project modules. They are star-imported, so the helper functions called in
# the following cells presumably come from these modules -- the origin of each
# name is not visible from the notebook itself.
from import_data import*
from process_data import *
from AC import *
from LC import *
from SC import *
from CC import *
from plot import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [2]:
# ---------------------------------------------------------------------------
# Input location and file-name conventions
# ---------------------------------------------------------------------------

# Path where the working folder is placed.
# Alternative dataset: '/mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/'
repository_path = '/mnt/c/Users/quirosgu/Desktop/Underexplored/pos/'
spectra_suffix = '_features_ms2_pos.mgf'
file_extention = '.mzML'
ionization_mode = 'pos'

# ---------------------------------------------------------------------------
# Metadata headers
# ---------------------------------------------------------------------------

sampletype_header = 'sample_type'
species_column = 'organism_species'
genus_column = 'organism_genus'
# The spelling 'organims_family' matches the column name found in the metadata
# table; do not "correct" it here unless the metadata files are changed too.
family_column = 'organims_family'
# Alternative: 'ms_filename'
filename_header = 'sample_id'
organe_column = 'organism_organ'

# ---------------------------------------------------------------------------
# Quantitative table
# ---------------------------------------------------------------------------

data_process_origin = 'MZMine2'
use_ion_identity = False

# ---------------------------------------------------------------------------
# Annotation component
# ---------------------------------------------------------------------------

intensity_filter = True
quantile_filter = True

min_threshold = 0.002
quantile_threshold = 0.75

# Cut-offs
# Cut-off for considering an ISDB annotation valid. Be extremely careful with
# this parameter; '0.0' as default.
min_score_final = 0.3
# Cut-off for considering a SIRIUS annotation valid. It is used in combination
# with min_ConfidenceScore.
min_ZodiacScore = 0.9
# Cut-off for considering a SIRIUS annotation valid; '0.0' as default.
min_ConfidenceScore = 0.05

# ---------------------------------------------------------------------------
# Literature component
# ---------------------------------------------------------------------------

# When True, the LC will be calculated.
LC_component = True

# Maximum number of compounds reported at each taxonomic level; above this
# value the plant is considered less interesting.
max_comp_reported_sp = 10   # species level
max_comp_reported_g = 50    # genus level
max_comp_reported_f = 500   # family level

# Weight for each taxonomic level (species, genus, family).
ws = 1
wg = 1
wf = 1

# ---------------------------------------------------------------------------
# Similarity component
# ---------------------------------------------------------------------------

# When True, the SC will be calculated.
SC_component = True

# ---------------------------------------------------------------------------
# Class component
# ---------------------------------------------------------------------------

# When True, the CC will be calculated.
CC_component = True
# Cut-off for considering a SIRIUS class valid. It is used in combination with
# min_recurrence.
min_class_confidence = 0.8
# Minimum recurrence of a chemical class to consider it acceptable.
min_recurrence = 5

# ---------------------------------------------------------------------------
# Weights used to modulate each component
# ---------------------------------------------------------------------------

# 1 means the value itself is taken into account; 0.5 means only half of the
# calculated value is taken into account.
w1 = 1
w2 = 1
w3 = 1
w4 = 1


### Load Metadata from individual files

In [3]:
# Build the metadata table from the individual files under repository_path.
# NOTE(review): the preview below shows index 0 on every row, so the returned
# frame presumably carries a non-unique index -- confirm before using
# label-based (.loc) row selection on it.
metadata_df = get_metadata_ind_files(repository_path)
metadata_df.head()

100%|██████████| 127/127 [00:00<00:00, 298.25it/s]


Unnamed: 0,sample_filename_pos,sample_id,sample_type,organims_family,organism_genus,organism_species,organism_organ,massive_id
0,Blank-01_pos.mzML,Blank-01,blank,,,,,MSV000090527
0,Blank-02_pos.mzML,Blank-02,blank,,,,,MSV000090527
0,Blank-03_pos.mzML,Blank-03,blank,,,,,MSV000090527
0,Blank-04_pos.mzML,Blank-04,blank,,,,,MSV000090527
0,Blank-05_pos.mzML,Blank-05,blank,,,,,MSV000090527


In [4]:
# Create a unique identifier column of the form "Species|part".
# The column is only built when it is not already present, so the cell can be
# included in a "Run All" (or re-run) without overwriting an existing
# 'organism_sppart' column. Use this line as a model for other identifiers.
# Rows with a missing species stay missing, because only the organ column is
# converted to str before concatenation.
if 'organism_sppart' not in metadata_df.columns:
    metadata_df['organism_sppart'] = (
        metadata_df[species_column] + "|" + metadata_df[organe_column].map(str)
    )
metadata_df.head(5)

Unnamed: 0,sample_filename_pos,sample_id,sample_type,organims_family,organism_genus,organism_species,organism_organ,massive_id,organism_sppart
0,Blank-01_pos.mzML,Blank-01,blank,,,,,MSV000090527,
0,Blank-02_pos.mzML,Blank-02,blank,,,,,MSV000090527,
0,Blank-03_pos.mzML,Blank-03,blank,,,,,MSV000090527,
0,Blank-04_pos.mzML,Blank-04,blank,,,,,MSV000090527,
0,Blank-05_pos.mzML,Blank-05,blank,,,,,MSV000090527,


In [5]:
# Name of the "Species|part" identifier column in the metadata table.
sppart_column = 'organism_sppart'
# Column containing a unique identifier for each sample (e.g. Species_plantpart
# or Species_solvent); it can also be the filename column, as done here.
# NOTE(review): col_id_unique is not referenced by any later cell visible in
# this notebook -- confirm whether it is still needed.
col_id_unique = filename_header  # alternative: 'organism_sppart'

# Start calculating the different components

# Annotation Component (AC)

#### AC.1. Process, clean and merge the quant tables, sirius and isdb annotations

In [6]:
# AC.1: process, clean and merge the quantification tables with the SIRIUS and
# ISDB annotations, using the score cut-offs set in the parameters cell.
# The return value is not captured; judging from the message printed below,
# the merged tables are written to a 'results' folder under repository_path.
ind_quant_table_full(repository_path, ionization_mode, data_process_origin, file_extention, use_ion_identity, min_score_final, min_ConfidenceScore, min_ZodiacScore)

100%|██████████| 128/128 [00:03<00:00, 33.66it/s]

Result are in : /mnt/c/Users/quirosgu/Desktop/Underexplored/pos/results/V115804_pos_quant_annotations.tsv





#### AC.2. Calculate the annotation rate of each sample

In [7]:
# AC.2: calculate the annotation rate of each sample, with the intensity and
# quantile filters (and their thresholds) set in the parameters cell.
# The result holds one row per sample, with feature counts and the AC value.
AC = annotation_component(repository_path, ionization_mode, file_extention, intensity_filter, quantile_filter, min_threshold, quantile_threshold, filename_header, metadata_df, species_column, genus_column, family_column, sppart_column)
AC.head()

100%|██████████| 128/128 [00:00<00:00, 197.68it/s]


Unnamed: 0,sample_id,organism_species,organism_genus,organims_family,organism_sppart,initial_features,features_after_filtering,Annot_features_after_filtering,AC
0,V101288,Strombosiopsis tetrandra,Strombosiopsis,Strombosiaceae,Strombosiopsis tetrandra|roots,480,45,36,0.2
1,V101594,Strombosia pustulata,Strombosia,Strombosiaceae,Strombosia pustulata|roots,530,43,38,0.12
2,V101800,Aptandra zenkeri,Aptandra,Aptandraceae,Aptandra zenkeri|leaves,710,41,37,0.1
3,V102553,Desbordesia glaucescens,Desbordesia,Irvingiaceae,Desbordesia glaucescens|aerial parts,305,78,66,0.15
4,V102554,Desbordesia glaucescens,Desbordesia,Irvingiaceae,Desbordesia glaucescens|whole plant,447,64,62,0.03


# Literature Component (LC)


#### LC.1. LC computation

In [8]:
# LC.1: compute the literature component per sample from the number of
# compounds reported at species, genus and family level, using the
# max_comp_reported_* thresholds and the ws / wg / wf taxonomic weights.
LC = literature_component_ind(LC_component, repository_path, metadata_df, filename_header, species_column, genus_column, family_column, max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f, ws, wg, wf)
LC.head()

Unnamed: 0,sample_id,organims_family,organism_genus,organism_species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,V101288,Strombosiaceae,Strombosiopsis,Strombosiopsis tetrandra,0.0,0.0,14,1.0
1,V101594,Strombosiaceae,Strombosia,Strombosia pustulata,0.0,0.0,14,1.0
2,V101800,Aptandraceae,Aptandra,Aptandra zenkeri,0.0,0.0,3,1.0
3,V102553,Irvingiaceae,Desbordesia,Desbordesia glaucescens,0.0,0.0,3,1.0
4,V102554,Irvingiaceae,Desbordesia,Desbordesia glaucescens,0.0,0.0,3,1.0


# Class component (CC)

In [9]:
# Compute the class component per sample, keeping chemical classes that pass
# the min_class_confidence cut-off.
# NOTE(review): min_recurrence is defined in the parameters cell but is not
# passed here (nor to any other call visible in this notebook) -- confirm
# whether the recurrence filter is applied elsewhere or is unused.
CC= class_component_ind_files(CC_component, repository_path, ionization_mode, min_class_confidence, metadata_df, filename_header, species_column, genus_column, family_column)
CC.head()

100%|██████████| 128/128 [00:01<00:00, 109.33it/s]


Unnamed: 0,sample_id,NPC#class,organism_species,organism_genus,organims_family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,V101288,"{Simple coumarins, Prenyl quinone meroterpenoi...",Strombosiopsis tetrandra,Strombosiopsis,Strombosiaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0,0,1
1,V101594,"{Simple coumarins, Prenyl quinone meroterpenoi...",Strombosia pustulata,Strombosia,Strombosiaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0,0,1
2,V101800,"{Simple coumarins, Macrolide lactams, Prenyl q...",Aptandra zenkeri,Aptandra,Aptandraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0,0,1
3,V102553,"{Cembrane diterpenoids, Labdane diterpenoids, ...",Desbordesia glaucescens,Desbordesia,Irvingiaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0,0,1
4,V102554,"{Limonoids, Simple coumarins, Prenyl quinone m...",Desbordesia glaucescens,Desbordesia,Irvingiaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0,0,1


# Similarity component (SC)

#### SC.1. Calculate MEMO matrix from individual files

In [10]:
# SC.1: calculate the MEMO matrix from the individual spectra files
# (files are identified through spectra_suffix). The result has one row per
# sample and one column per peak@/loss@ value, as shown in the preview below.
# This is the slowest step of the notebook (about 72 s in the recorded run).
metric_df = calculate_memo_matrix_ind_files(repository_path, ionization_mode, spectra_suffix, filename_header)
metric_df.head()

100%|██████████| 127/127 [01:09<00:00,  1.84it/s]


Computing MEMO matrix from unaligned samples took: 72.109375 seconds


  memo_unaligned.memo_matrix.index = memo_unaligned.memo_matrix.index.str.replace(spectra_suffix, "")


Unnamed: 0,sample_id,peak@55.94,peak@61.01,peak@62.02,peak@72.94,peak@79.02,peak@87.91,peak@90.95,peak@95.05,peak@105.92,...,loss@83.77,peak@777.31,peak@778.31,peak@430.14,peak@331.60,peak@440.85,loss@23.80,peak@426.20,loss@55.32,peak@187.61
0,Blank-01,1.0,5.0,2.0,4.0,7.0,1.0,4.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Blank-02,1.0,7.0,9.0,2.0,6.0,0.0,6.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Blank-03,2.0,9.0,8.0,4.0,9.0,0.0,5.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Blank-04,1.0,4.0,5.0,3.0,8.0,0.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Blank-05,1.0,3.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Remove experimental controls and blanks from the MEMO matrix.
# Strings identifying the samples to drop (blanks and QCs); change as needed.
list_of_strings_for_QC_Blank_filter = ['blank', 'QC'] #erase all the blanks and QCs - change the strings as needed
# Metadata column searched for the strings above; it must exist in the metadata table.
column_to_use_for_filtering = sampletype_header #this information should be included in the metadata table
# metric_df is overwritten with the filtered matrix. The two shapes printed
# below are presumably the matrix size before and after filtering -- confirm.
metric_df= drop_samples_based_on_string_ind(repository_path, ionization_mode, filename_header, sampletype_header, metric_df, metadata_df, list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
#metric_df.head()

(127, 61485)
(70, 61485)


In [12]:
# Optional: if metric_df was calculated in a previous run, uncomment the two
# lines below to load it instead of running the SC.1 cells again.
#metric_df = load_metric_df(repository_path, ionization_mode)
#metric_df.head()

#### SC.2. SC calculation

In [13]:
# SC.2: compute the similarity component from the filtered MEMO matrix.
# The result has one row per sample with three anomaly flags (anomaly_IF,
# anomaly_LOF, anomaly_OCSVM) and the resulting SC value.
SC = similarity_component_ind(repository_path, ionization_mode, filename_header, metric_df)
SC.head(10)

Unnamed: 0,sample_id,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,V101288,1,1,1,0
1,V101594,1,1,1,0
2,V101800,1,1,1,0
3,V102553,1,1,1,0
4,V102554,1,1,1,0
5,V102555,1,1,1,0
6,V102560,1,1,1,0
7,V102561,1,1,1,0
8,V102567,1,1,1,0
9,V102592,1,1,1,0


# Priority rank Results

In [14]:
# Combine the components into the final priority score (PS), using w1..w4 as
# the weights that modulate each component.
# NOTE(review): the AC / LC / CC / SC frames computed above are not passed in;
# presumably the function reads the component results back from
# repository_path -- confirm, since stale files there would affect PS.
PS = priority_score_ind(repository_path, filename_header, ionization_mode, species_column, genus_column, family_column, sppart_column, w1, w2, w3, w4)
PS.head()

100%|██████████| 128/128 [00:01<00:00, 105.51it/s]


Unnamed: 0,sample_id,organism_species,organism_genus,organims_family,organism_sppart,initial_features,features_after_filtering,Annot_features_after_filtering,AC,LC,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,CCs,CCg,CC,New_CC_in_sp,New_CC_in_genus,SC,PS
0,V101288,Strombosiopsis tetrandra,Strombosiopsis,Strombosiaceae,Strombosiopsis tetrandra|roots,480,45,36,0.2,1.0,0.0,0.0,14,0,0,1,nothing in DB,nothing in DB,0,2.2
1,V101594,Strombosia pustulata,Strombosia,Strombosiaceae,Strombosia pustulata|roots,530,43,38,0.12,1.0,0.0,0.0,14,0,0,1,nothing in DB,nothing in DB,0,2.12
2,V101800,Aptandra zenkeri,Aptandra,Aptandraceae,Aptandra zenkeri|leaves,710,41,37,0.1,1.0,0.0,0.0,3,0,0,1,nothing in DB,nothing in DB,0,2.1
3,V102553,Desbordesia glaucescens,Desbordesia,Irvingiaceae,Desbordesia glaucescens|aerial parts,305,78,66,0.15,1.0,0.0,0.0,3,0,0,1,nothing in DB,nothing in DB,0,2.15
4,V102554,Desbordesia glaucescens,Desbordesia,Irvingiaceae,Desbordesia glaucescens|whole plant,447,64,62,0.03,1.0,0.0,0.0,3,0,0,1,nothing in DB,nothing in DB,0,2.03


### Display results

In [15]:
# Interactive LineUp view of the priority-score table.
def selection_changed(selection):
    """Return the rows of PS at the positional indices given in `selection`."""
    picked_rows = PS.iloc[selection]
    return picked_rows
interact(
    selection_changed,
    selection=lineup_widget.LineUpWidget(PS),
);

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…