# Inventa: a computational tool to discover structural novelty in natural extracts libraries


In [3]:
from __future__ import print_function
from IPython.core.display import HTML
# NOTE(review): this HTML object is built but never displayed (it is neither the
# cell's last expression nor passed to display()), so with IPython's default
# settings the embedded kernel-restart script presumably never runs — confirm
# whether this line is still needed.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import sys 
import lineup_widget
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

# Extend the module search path with the project's source folders. The paths are
# relative, so they depend on the kernel's working directory. The star-imports
# that follow are presumably resolved from these folders — verify.
sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from AC import *
from LC import *
from SC import *
from CC import *
from plot import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [4]:
# Paths and file suffixes necessary for the job:

# Root folder of the project; it is passed to every component function in this notebook.
# NOTE: this is an absolute, machine-specific path — change it to match your environment.
repository_path= '/mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/'  # The path where you want your folder to be placed
#quant_table_suffix ='_quant_pos.csv'
spectra_suffix= '_features_ms2_pos.mgf'   # passed to calculate_memo_matrix_ind_files
#metadata_sample_suffix ='_metadata.tsv'
#isdb_sample_suffix = '_isdb_matched_pos_repond.tsv'
#sirius_sample_suffix = 'compound_identifications_adducts.tsv'
# NOTE(review): canopus_sample_suffix is not passed to any call in the cells visible here — confirm it is still needed.
canopus_sample_suffix = 'npc_summary.csv' #'_summary_adducts.tsv' #'canopus_summary.tsv'
#memo_sample_suffix= '_memo_pos.csv'
file_extention = '.mzXML'
ionization_mode = 'pos'

# Metadata headers: names of the metadata-table columns used by the cells below.

sampletype_header = 'sample_type'       # sample type; used later to drop samples labelled 'blank' / 'qc'
species_column = 'organism_species'
genus_column = 'organism_genus'
family_column = 'organism_family'
filename_header = 'sample_id'           # per-sample identifier column (alternative: 'ms_filename')
organe_column = 'organism_organe'       # combined with the species to build the 'organism_sppart' identifier


# Quantitative table: both values are passed to ind_quant_table_full.
data_process_origin = 'MZMine2'   # presumably the software that produced the quant tables — confirm accepted values
use_ion_identity= False           # NOTE(review): presumably toggles ion-identity handling — confirm in ind_quant_table_full

# Annotation component: filters and thresholds passed to annotation_component.

intensity_filter  = True
quantile_filter = True

# NOTE(review): presumably used only when the corresponding filter above is enabled — confirm in annotation_component.
min_threshold = 0.002
quantile_threshold = 0.75


## Cut-offs (passed to ind_quant_table_full):
min_score_final = 0.3             # cut-off for considering an ISDB annotation valid. Be extremely careful with this parameter (suggested default: 0.0; set to 0.3 here).
min_ZodiacScore = 0.9             # cut-off for considering a SIRIUS annotation valid. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         # cut-off for considering a SIRIUS annotation valid (suggested default: 0.0; set to 0.25 here).

# Literature component: settings passed to literature_component.

LC_component = True               # LC will be calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level; above this value, the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level; above this value, the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at family level; above this value, the plant is considered less interesting

# Weight for each taxonomic level (presumably ws = species, wg = genus, wf = family — confirm in literature_component)
ws = 1
wg = 1
wf = 1 

# Similarity component

SC_component = True                # SC will be calculated

# Class component

CC_component =  True              # CC will be calculated
min_class_confidence = 0.8       # cut-off for considering a SIRIUS class valid. It is used in combination with min_recurrence.
# NOTE(review): min_recurrence is not passed to any call in the cells visible here — confirm it is actually applied.
min_recurrence = 5              # minimum recurrence of a chemical class to consider it acceptable

# Weights to modulate each component; passed to priority_score_ind as (w1, w2, w3, w4).
# Presumably w1 -> AC, w2 -> LC, w3 -> SC, w4 -> CC — confirm in priority_score_ind.
w1 = 1           # 1 means the value itself is taken into account; 0.5 means only half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


### Load Metadata from individual files

In [5]:
# Build the metadata table from the individual files under repository_path,
# then preview the first rows.
metadata_df = get_metadata_ind_files(repository_path)
metadata_df.head()

100%|██████████| 1920/1920 [00:21<00:00, 88.66it/s]


Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,pos_injection_date,bio_leish_donovani_10ugml_inhibition,bio_leish_donovani_2ugml_inhibition,bio_tryp_brucei_rhodesiense_10ugml_inhibition,bio_tryp_brucei_rhodesiense_2ugml_inhibition,bio_tryp_cruzi_10ugml_inhibition,bio_l6_cytotoxicity_10ugml_inhibition,sample_filename_neg,neg_injection_date,massive_id
0,VGF138_A01,qc,qc_mix,,,,,,,,...,2017-10-27,,,,,,,VGF138_A01_neg.mzXML,2017-10-27,MSV000087728
0,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,Paeonia suffruticosa,...,2017-10-27,16.0,10.2,4.8,0.0,8.7,4.2,VGF138_A02_neg.mzXML,2017-10-27,MSV000087728
0,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,2017-10-27,36.7,12.2,9.5,6.0,0.0,0.5,VGF138_A03_neg.mzXML,2017-10-27,MSV000087728
0,VGF138_A04,sample,V112020GP-01,Plantae,Tracheophyta,Magnoliopsida,Rosales,Moraceae,Ficus,Ficus tikoua,...,2017-10-27,20.9,10.4,23.6,4.7,1.1,22.5,VGF138_A04_neg.mzXML,2017-10-27,MSV000087728
0,VGF138_A05,sample,V112033GP-01,Plantae,Tracheophyta,Magnoliopsida,Solanales,Solanaceae,Nicandra,Nicandra physalodes,...,2017-10-27,5.7,6.7,14.1,7.7,2.5,7.5,VGF138_A05_neg.mzXML,2017-10-27,MSV000087728


In [24]:
# Unique "Species|part" identifier column.
# The column NAME is always defined here because annotation_component receives it
# later. The column itself is only built when the metadata does not already
# provide it, so this cell can be run unconditionally and re-run safely without
# overwriting an existing column.
sppart_column = 'organism_sppart'
if sppart_column not in metadata_df.columns:
    # .map(str) stringifies the plant-part values before concatenation; rows with
    # a missing species (e.g. the QC row in the preview) stay missing.
    metadata_df[sppart_column] = metadata_df[species_column] + "|" + metadata_df[organe_column].map(str)
metadata_df.head(5)

Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,bio_leish_donovani_10ugml_inhibition,bio_leish_donovani_2ugml_inhibition,bio_tryp_brucei_rhodesiense_10ugml_inhibition,bio_tryp_brucei_rhodesiense_2ugml_inhibition,bio_tryp_cruzi_10ugml_inhibition,bio_l6_cytotoxicity_10ugml_inhibition,sample_filename_neg,neg_injection_date,massive_id,organism_sppart
0,VGF138_A01,qc,qc_mix,,,,,,,,...,,,,,,,VGF138_A01_neg.mzXML,2017-10-27,MSV000087728,
0,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,Paeonia suffruticosa,...,16.0,10.2,4.8,0.0,8.7,4.2,VGF138_A02_neg.mzXML,2017-10-27,MSV000087728,Paeonia suffruticosa|leaves
0,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,36.7,12.2,9.5,6.0,0.0,0.5,VGF138_A03_neg.mzXML,2017-10-27,MSV000087728,Buddleja officinalis|leaves
0,VGF138_A04,sample,V112020GP-01,Plantae,Tracheophyta,Magnoliopsida,Rosales,Moraceae,Ficus,Ficus tikoua,...,20.9,10.4,23.6,4.7,1.1,22.5,VGF138_A04_neg.mzXML,2017-10-27,MSV000087728,Ficus tikoua|multiple
0,VGF138_A05,sample,V112033GP-01,Plantae,Tracheophyta,Magnoliopsida,Solanales,Solanaceae,Nicandra,Nicandra physalodes,...,5.7,6.7,14.1,7.7,2.5,7.5,VGF138_A05_neg.mzXML,2017-10-27,MSV000087728,Nicandra physalodes|fruits


In [7]:
# Column containing a unique identifier for each sample (e.g. Species_plantpart,
# Species_solvent, or simply the filename). Alternative: 'organism_sppart'.
# NOTE(review): col_id_unique is not referenced by any later cell visible here — confirm it is still needed.
col_id_unique = filename_header

# Start calculating the different components

# Annotation Component (AC)

#### AC.1. Process, clean and merge the quant tables, sirius and isdb annotations

In [8]:
# AC.1: called for its side effects only (the return value is discarded).
# The run log reports the result files under <repository_path>/results/.
ind_quant_table_full(
    repository_path,
    ionization_mode,
    data_process_origin,
    file_extention,
    use_ion_identity,
    min_score_final,
    min_ConfidenceScore,
    min_ZodiacScore,
)

100%|██████████| 1920/1920 [02:03<00:00, 15.51it/s]

Result are in : /mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/results/VGF159_H11_pos_quant_annotations.tsv





#### AC.2. Calculate the annotation rate of each sample

In [31]:
# AC.2: one row per sample with feature counts and the resulting AC value.
# Requires `sppart_column` (and the matching metadata column) to be defined beforehand.
AC = annotation_component(
    repository_path, ionization_mode, file_extention,
    intensity_filter, quantile_filter, min_threshold, quantile_threshold,
    filename_header, metadata_df,
    species_column, genus_column, family_column, sppart_column,
)
AC.head()

100%|██████████| 1921/1921 [00:20<00:00, 93.83it/s] 


Unnamed: 0,sample_id,organism_family,organism_genus,organism_species,organism_sppart,initial_features,features_after_filtering,Annot_features_after_filtering,AC
0,VGF138_A02,Paeoniaceae,Paeonia,Paeonia suffruticosa,Paeonia suffruticosa|leaves,343,91,48,0.47
1,VGF138_A03,Scrophulariaceae,Buddleja,Buddleja officinalis,Buddleja officinalis|leaves,514,99,51,0.48
2,VGF138_A04,Moraceae,Ficus,Ficus tikoua,Ficus tikoua|multiple,498,75,47,0.37
3,VGF138_A05,Solanaceae,Nicandra,Nicandra physalodes,Nicandra physalodes|fruits,697,64,34,0.47
4,VGF138_A06,Polygalaceae,Asemeia,Asemeia extraaxillaris,Asemeia extraaxillaris|roots,522,87,38,0.56


# Literature Component (LC)


#### LC.1. LC computation

In [10]:
# LC.1: one row per sample with the number of reported compounds at species,
# genus and family level, and the resulting LC value.
LC = literature_component(
    LC_component, repository_path, metadata_df, filename_header,
    species_column, genus_column, family_column,
    max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f,
    ws, wg, wf,
)
LC.head()

Unnamed: 0,sample_id,organism_family,organism_genus,organism_species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,VGF138_A02,Paeoniaceae,Paeonia,Paeonia suffruticosa,271.0,1058.0,1058.0,0.49624
1,VGF138_A03,Scrophulariaceae,Buddleja,Buddleja officinalis,133.0,557.0,2301.0,0.70958
2,VGF138_A04,Moraceae,Ficus,Ficus tikoua,0.0,895.0,6850.0,0.684
3,VGF138_A05,Solanaceae,Nicandra,Nicandra physalodes,6.0,6.0,13616.0,0.72048
4,VGF138_A06,Polygalaceae,Asemeia,Asemeia extraaxillaris,0.0,74.0,1434.0,0.95652


# Similarity component (SC)

#### SC.1. Calculate MEMO matrix from individual files

In [11]:
# Calculate the MEMO matrix from the individual spectra files (those matching spectra_suffix).
# NOTE: this is the slowest cell of the notebook (about 18 minutes in the recorded run).
metric_df = calculate_memo_matrix_ind_files(repository_path, spectra_suffix, filename_header)
#metric_df.head()

100%|██████████| 1920/1920 [18:35<00:00,  1.72it/s]


Computing MEMO matrix from unaligned samples took: 1142.484375 seconds


  memo_unaligned.memo_matrix.index = memo_unaligned.memo_matrix.index.str.replace(spectra_suffix, "")


In [12]:
# Remove experimental controls and blanks from the MEMO matrix.
# The sample-type information must be present in the metadata table.
column_to_use_for_filtering = sampletype_header
# Every sample whose type matches one of these strings is dropped - change as needed.
list_of_strings_for_QC_Blank_filter = ['blank', 'qc']
metric_df = drop_samples_based_on_string_ind(
    metric_df,
    metadata_df,
    filename_header,
    sampletype_header,
    'metric_df',
    list_of_strings_for_QC_Blank_filter,
    column_to_use_for_filtering,
)
#metric_df.head()


(1920, 103429)
(1600, 103429)


#### SC.2. SC calculation

In [13]:
# SC.2: compute the similarity component from the filtered MEMO matrix.
# The result has one row per sample with three anomaly flag columns
# (anomaly_IF, anomaly_LOF, anomaly_OCSVM) and the final SC value.
SC = similarity_component_ind(repository_path, metric_df, SC_component, filename_header)
SC.head(10)

Unnamed: 0,sample_id,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,VGF138_A02,1,1,1,0
1,VGF138_A03,1,1,1,0
2,VGF138_A04,1,1,1,0
3,VGF138_A05,1,-1,1,1
4,VGF138_A06,1,1,1,0
5,VGF138_A07,1,1,1,0
6,VGF138_A08,1,1,1,0
7,VGF138_A09,1,1,1,0
8,VGF138_A10,1,1,1,0
9,VGF138_A11,1,1,1,0


# Class component (CC)

In [18]:
# Class component: per-sample chemical classes compared with those reported for
# the species and genus, giving the CCs, CCg and CC values.
CC = class_component_ind_files_PF1600(
    CC_component,
    repository_path,
    min_class_confidence,
    metadata_df,
    filename_header,
    species_column,
    genus_column,
    family_column,
)
CC.head()

100%|██████████| 1921/1921 [00:39<00:00, 48.52it/s]


Unnamed: 0,sample_id,NPC#class,organism_species,organism_genus,organism_family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,VGF138_A02,"{Nitro fatty acids, Cinnamic acids and derivat...",Paeonia suffruticosa,Paeonia,Paeoniaceae,"{Taraxerane triterpenoids, Phenylethanoids, Fu...","{Fatty aldehydes, Acyclic monoterpenoids, Phen...","{Nitro fatty acids, Diacylglycerols, Macrolide...","{Nitro fatty acids, Diacylglycerols, Macrolide...",0.5,0.5,1.0
1,VGF138_A03,"{Nitro fatty acids, Phenylethanoids, Glyceroph...",Buddleja officinalis,Buddleja,Scrophulariaceae,"{Miscellaneous apocarotenoids, Cinnamic acids ...",{Cinnamic acids and derivatives|Phenylethanoid...,"{Nitro fatty acids, Glycerophosphates, Cannabi...","{Nitro fatty acids, Glycerophosphates, Cannabi...",0.5,0.5,1.0
2,VGF138_A04,"{Nitro fatty acids, Dipeptides, Cyclic peptide...",Ficus tikoua,Ficus,Moraceae,nothing in DB,"{Isocoumarins, Lupane triterpenoids, Cholestan...",nothing in DB,nothing in DB,0.0,0.0,1.0
3,VGF138_A05,"{Nitro fatty acids, Dipeptides, Cyclic peptide...",Nicandra physalodes,Nicandra,Solanaceae,"{Cycloartane triterpenoids|Lanostane, Tirucall...","{Cycloartane triterpenoids|Lanostane, Tirucall...","{Nitro fatty acids, Ceramides, Dipeptides, Lim...","{Nitro fatty acids, Ceramides, Dipeptides, Lim...",0.5,0.5,1.0
4,VGF138_A06,"{Nitro fatty acids, Glycerophosphates, Cinnami...",Asemeia extraaxillaris,Asemeia,Polygalaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0


# Priority rank Results

In [19]:
# Combine the four components into the priority score (PS), one row per sample.
# AC, LC, SC and CC must come from the same run: re-running one component cell
# afterwards leaves this table out of date until it is recomputed.
PS = priority_score_ind(
    repository_path,
    AC, LC, SC, CC,
    LC_component, SC_component, CC_component,
    w1, w2, w3, w4,
    filename_header,
)
PS.head()

Unnamed: 0,sample_id,initial_features,features_after_filtering,Annot_features_after_filtering,AC,LC,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,SC,CCs,CCg,CC,New_CC_in_sp,New_CC_in_genus,PS
0,VGF138_A02,343,343,148,0.6,0.49624,271.0,1058.0,1058.0,0,0.5,0.5,1.0,"{Nitro fatty acids, Diacylglycerols, Macrolide...","{Nitro fatty acids, Diacylglycerols, Macrolide...",2.09624
1,VGF138_A03,514,514,238,0.5,0.70958,133.0,557.0,2301.0,0,0.5,0.5,1.0,"{Nitro fatty acids, Glycerophosphates, Cannabi...","{Nitro fatty acids, Glycerophosphates, Cannabi...",2.20958
2,VGF138_A04,498,498,254,0.5,0.684,0.0,895.0,6850.0,0,0.0,0.0,1.0,nothing in DB,nothing in DB,2.184
3,VGF138_A05,697,697,298,0.6,0.72048,6.0,6.0,13616.0,1,0.5,0.5,1.0,"{Nitro fatty acids, Ceramides, Dipeptides, Lim...","{Nitro fatty acids, Ceramides, Dipeptides, Lim...",3.32048
4,VGF138_A06,522,522,226,0.6,0.95652,0.0,74.0,1434.0,0,0.0,0.0,1.0,nothing in DB,nothing in DB,2.55652


### Display results

In [20]:
#Show the results in an interactive way
def selection_changed(selection):
    """Return the rows of ``PS`` at the integer positions given in ``selection``.

    Used as the ``interact`` callback, so ``selection`` is the current value of
    the LineUp widget bound to it.
    """
    picked_rows = PS.iloc[selection]
    return picked_rows
# Bind a LineUp widget built from PS to selection_changed, so the rows selected
# in the widget are shown below it. This view is for exploration only.
interact(selection_changed, selection=lineup_widget.LineUpWidget(PS));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…