# Inventa: a computational tool to discover structural novelty in natural extract libraries


In [1]:
from __future__ import print_function
from IPython.core.display import HTML
# NOTE(review): this HTML object is neither passed to display() nor the last
# expression of the cell, so the embedded kernel-restart script presumably
# never runs — confirm whether this line is still needed.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import sys 
import lineup_widget
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

# Add the project's source folders to the import path so the star-imports
# below can find the local modules. The paths are relative, so the notebook
# presumably has to be run from its own directory — TODO confirm.
sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from AC import *
from LC import *
from SC import *
from CC import *
from plot import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [2]:
# Paths and file suffixes required for the job.
# NOTE(review): repository_path is a hard-coded absolute path on one machine;
# every user has to edit it before running the notebook.

repository_path= '/mnt/c/Users/quirosgu/Desktop/underexplored_collection/pos/'  # Path where you want your folder to be placed (alternative used before: '/mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/')
#quant_table_suffix ='_quant_pos.csv'
spectra_suffix= '_features_ms2_pos.mgf'
#metadata_sample_suffix ='_metadata.tsv'
#isdb_sample_suffix = '_isdb_matched_pos_repond.tsv'
#sirius_sample_suffix = 'compound_identifications_adducts.tsv'
canopus_sample_suffix = 'npc_summary.csv'  # alternatives: '_summary_adducts.tsv', 'canopus_summary.tsv'
#memo_sample_suffix= '_memo_pos.csv'
file_extention = '.mzML'
ionization_mode = 'pos'

# Metadata column headers

sampletype_header = 'sample_type'
species_column = 'organism_species'
genus_column = 'organism_genus'
family_column = 'organism_family'
filename_header = 'sample_id'  # alternative: 'ms_filename'
organe_column = 'organism_organ'


# Quantitative table
data_process_origin = 'MZMine2'
use_ion_identity= False

# Annotation component

intensity_filter  = True
quantile_filter = True

min_threshold = 0.002
quantile_threshold = 0.75


## Cut-offs:
min_score_final = 0.3             # cut-off for considering an ISDB annotation valid. Be extremely careful with this parameter; '0.0' is the default.
min_ZodiacScore = 0.9             # cut-off for considering a SIRIUS annotation valid. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.05         # cut-off for considering a SIRIUS annotation valid. '0.0' is the default.

# Literature component

LC_component = True               # LC will be calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level; above this value the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level; above this value the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at family level; above this value the plant is considered less interesting

# Weight for each taxonomic level (species, genus, family)
ws = 1
wg = 1
wf = 1 

# Similarity component

# NOTE(review): SC_component is not passed to any call visible in this notebook — confirm it is still used.
SC_component = True                # SC will be calculated

# Class component

CC_component =  True              # CC will be calculated
min_class_confidence = 0.8       # cut-off for considering a SIRIUS class valid. It is used in combination with min_recurrence.
# NOTE(review): min_recurrence is not passed to any call visible in this notebook — confirm it is still used.
min_recurrence = 5              # minimum recurrence of a chemical class to consider it acceptable

# Weights to modulate each component in the priority score.
# NOTE(review): presumably w1..w4 map to the four components in the order they are computed — verify against priority_score_ind.
w1 = 1           # 1 means the value itself is taken into account; 0.5 means only half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


### Load Metadata from individual files

In [3]:
# Build the sample metadata table from the individual files found under
# repository_path (project helper) and preview the first rows.
metadata_df = get_metadata_ind_files(repository_path)
metadata_df.head()

  0%|          | 0/240 [00:00<?, ?it/s]  5%|▌         | 12/240 [00:00<00:02, 83.74it/s] 18%|█▊        | 44/240 [00:00<00:00, 200.12it/s] 30%|██▉       | 71/240 [00:00<00:00, 216.76it/s] 41%|████      | 98/240 [00:00<00:00, 235.20it/s] 53%|█████▎    | 128/240 [00:00<00:00, 256.09it/s] 67%|██████▋   | 160/240 [00:00<00:00, 274.19it/s] 80%|████████  | 192/240 [00:00<00:00, 286.81it/s] 93%|█████████▎| 224/240 [00:00<00:00, 296.30it/s]100%|██████████| 240/240 [00:00<00:00, 261.80it/s]


Unnamed: 0,sample_filename_pos,sample_id,sample_type,organism_family,organism_genus,organism_species,organism_organ
0,Blank-01_pos.mzML,Blank-01,blank,,,,
0,Blank-02_pos.mzML,Blank-02,blank,,,,
0,Blank-03_pos.mzML,Blank-03,blank,,,,
0,Blank-04_pos.mzML,Blank-04,blank,,,,
0,Blank-05_pos.mzML,Blank-05,blank,,,,


In [4]:
# Create a unique "Species|part" identifier column (e.g. "Manotes expansa|fruits")
# by joining the species and organ columns. Rows without a species (such as the
# blanks) end up with a missing value.
# The guard makes the cell safe to run even when the metadata already ships an
# 'organism_sppart' column: an existing column is left untouched instead of
# being silently overwritten.
if 'organism_sppart' not in metadata_df.columns:
    metadata_df['organism_sppart'] = metadata_df[species_column] + "|" + metadata_df[organe_column].map(str)
metadata_df.head(5)

Unnamed: 0,sample_filename_pos,sample_id,sample_type,organism_family,organism_genus,organism_species,organism_organ,organism_sppart
0,Blank-01_pos.mzML,Blank-01,blank,,,,,
0,Blank-02_pos.mzML,Blank-02,blank,,,,,
0,Blank-03_pos.mzML,Blank-03,blank,,,,,
0,Blank-04_pos.mzML,Blank-04,blank,,,,,
0,Blank-05_pos.mzML,Blank-05,blank,,,,,


In [5]:
# Name of the "Species|part" identifier column in the metadata table.
sppart_column = 'organism_sppart'
# Column containing a unique identifier for each sample (e.g. Species_plantpart,
# Species_solvent); it can also be the filename column. Alternative: 'organism_sppart'.
# NOTE(review): col_id_unique is not passed to any call visible in this notebook — confirm it is still used.
col_id_unique = filename_header

# Start calculating the different components

# Annotation Component (AC)

#### AC.1. Process, clean and merge the quant tables, sirius and isdb annotations

In [6]:
# AC.1 — process each sample's quantification table together with its
# annotations, using the score cut-offs from the parameters cell.
# Arguments are positional; keep the order expected by the helper.
ind_quant_table_full(
    repository_path,
    ionization_mode,
    data_process_origin,
    file_extention,
    use_ion_identity,
    min_score_final,
    min_ConfidenceScore,
    min_ZodiacScore,
)

  0%|          | 0/240 [00:00<?, ?it/s] 20%|█▉        | 47/240 [00:00<00:00, 462.69it/s] 39%|███▉      | 94/240 [00:02<00:04, 34.65it/s]  48%|████▊     | 115/240 [00:03<00:04, 26.16it/s] 53%|█████▎    | 127/240 [00:04<00:04, 23.98it/s] 56%|█████▋    | 135/240 [00:04<00:04, 22.77it/s] 59%|█████▉    | 141/240 [00:05<00:04, 21.88it/s] 61%|██████    | 146/240 [00:05<00:04, 20.32it/s] 62%|██████▎   | 150/240 [00:05<00:04, 19.04it/s] 64%|██████▍   | 153/240 [00:05<00:04, 18.58it/s] 65%|██████▌   | 156/240 [00:06<00:04, 18.36it/s] 66%|██████▋   | 159/240 [00:06<00:04, 18.38it/s] 68%|██████▊   | 162/240 [00:06<00:04, 18.12it/s] 68%|██████▊   | 164/240 [00:06<00:04, 17.89it/s] 69%|██████▉   | 166/240 [00:06<00:04, 17.75it/s] 70%|███████   | 169/240 [00:06<00:03, 18.28it/s] 72%|███████▏  | 172/240 [00:07<00:03, 18.94it/s] 73%|███████▎  | 175/240 [00:07<00:03, 19.29it/s] 74%|███████▍  | 177/240 [00:07<00:03, 19.10it/s] 75%|███████▍  | 179/240 [00:07<00:03, 17.85it/s] 75%|█████

Result are in : /mnt/c/Users/quirosgu/Desktop/underexplored_collection/pos/results/V116331_pos_quant_annotations.tsv





#### AC.2. Calculate the annotation rate of each sample

In [7]:
# AC.2 — compute the annotation component for every sample, applying the
# intensity/quantile filters configured in the parameters cell.
# Arguments are positional; keep the order expected by the helper.
AC = annotation_component(
    repository_path,
    ionization_mode,
    file_extention,
    intensity_filter,
    quantile_filter,
    min_threshold,
    quantile_threshold,
    filename_header,
    metadata_df,
    species_column,
    genus_column,
    family_column,
    sppart_column,
)
AC.head()

  0%|          | 0/240 [00:00<?, ?it/s] 28%|██▊       | 67/240 [00:00<00:00, 641.69it/s] 55%|█████▌    | 132/240 [00:00<00:00, 175.26it/s] 69%|██████▉   | 166/240 [00:00<00:00, 146.02it/s] 79%|███████▉  | 189/240 [00:01<00:00, 130.38it/s] 86%|████████▋ | 207/240 [00:01<00:00, 124.86it/s] 93%|█████████▎| 223/240 [00:01<00:00, 121.19it/s] 99%|█████████▉| 237/240 [00:01<00:00, 115.91it/s]100%|██████████| 240/240 [00:01<00:00, 140.51it/s]


Unnamed: 0,sample_id,organism_species,organism_genus,organism_family,organism_sppart,initial_features,features_after_filtering,Annot_features_after_filtering,AC
0,V100448,Manotes expansa,Manotes,Connaraceae,Manotes expansa|fruits,348,66,60,0.09
1,V100921,Manotes griffoniana,Manotes,Connaraceae,Manotes griffoniana|leaves,404,78,75,0.04
2,V100958,Calyptrotheca taitense,Calyptrotheca,Didiereaceae,Calyptrotheca taitense|roots,589,69,63,0.09
3,V101033,Calyptrotheca taitense,Calyptrotheca,Didiereaceae,Calyptrotheca taitense|stems,627,76,74,0.03
4,V101063,Odontosoria chusana,Odontosoria,Lindsaeaceae,Odontosoria chusana|multiple,314,93,82,0.12


# Literature Component (LC)


#### LC.1. LC computation

In [8]:
# LC.1 — compute the literature component from the reported-compound
# thresholds and the per-taxonomic-level weights (ws, wg, wf).
# Arguments are positional; keep the order expected by the helper.
LC = literature_component_ind(
    LC_component,
    repository_path,
    metadata_df,
    filename_header,
    species_column,
    genus_column,
    family_column,
    max_comp_reported_sp,
    max_comp_reported_g,
    max_comp_reported_f,
    ws,
    wg,
    wf,
)
LC.head()

Unnamed: 0,sample_id,organism_family,organism_genus,organism_species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,V100448,Connaraceae,Manotes,Manotes expansa,0.0,0.0,25.0,1.0
1,V100921,Connaraceae,Manotes,Manotes griffoniana,0.0,0.0,25.0,1.0
2,V100958,Didiereaceae,Calyptrotheca,Calyptrotheca taitense,0.0,0.0,25.0,1.0
3,V101033,Didiereaceae,Calyptrotheca,Calyptrotheca taitense,0.0,0.0,25.0,1.0
4,V101063,Lindsaeaceae,Odontosoria,Odontosoria chusana,0.0,4.0,8.0,1.0


# Class component (CC)

In [9]:
# Compute the class component from the individual per-sample files.
# NOTE(review): min_recurrence from the parameters cell is not passed here —
# confirm whether the helper is supposed to receive it.
# Arguments are positional; keep the order expected by the helper.
CC = class_component_ind_files(
    CC_component,
    repository_path,
    ionization_mode,
    min_class_confidence,
    metadata_df,
    filename_header,
    species_column,
    genus_column,
    family_column,
)
CC.head()

  0%|          | 0/240 [00:00<?, ?it/s] 26%|██▋       | 63/240 [00:00<00:00, 581.04it/s] 51%|█████     | 122/240 [00:01<00:01, 95.27it/s] 62%|██████▎   | 150/240 [00:01<00:01, 80.47it/s] 70%|███████   | 168/240 [00:01<00:00, 74.70it/s] 75%|███████▌  | 181/240 [00:02<00:00, 73.01it/s] 80%|████████  | 192/240 [00:02<00:00, 72.34it/s] 84%|████████▍ | 202/240 [00:02<00:00, 71.15it/s] 88%|████████▊ | 211/240 [00:02<00:00, 70.08it/s] 91%|█████████▏| 219/240 [00:02<00:00, 69.09it/s] 95%|█████████▍| 227/240 [00:02<00:00, 68.28it/s] 98%|█████████▊| 235/240 [00:02<00:00, 65.36it/s]100%|██████████| 240/240 [00:03<00:00, 79.61it/s]


Unnamed: 0,sample_id,NPC#class,organism_species,organism_genus,organism_family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,V100448,"{Carotenoids (C40, β-Ψ), Flavanones, Dihydrofl...",Manotes expansa,Manotes,Connaraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0
1,V100921,"{Macrolide lactones, Pyridine alkaloids, Amino...",Manotes griffoniana,Manotes,Connaraceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0
2,V100958,"{Macrolide lactones, Depsipeptides, Tripeptide...",Calyptrotheca taitense,Calyptrotheca,Didiereaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0
3,V101033,"{Macrolide lactones, Purine alkaloids, Apocaro...",Calyptrotheca taitense,Calyptrotheca,Didiereaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0
4,V101063,"{Miscellaneous polyketides, Pyridine alkaloids...",Odontosoria chusana,Odontosoria,Lindsaeaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0


# Similarity component (SC)

#### SC.1. Calculate MEMO matrix from individual files

In [10]:
# Calculate the MEMO matrix from the individual per-sample spectra files
# (files are selected through spectra_suffix) and preview the first rows.
metric_df = calculate_memo_matrix_ind_files(repository_path, ionization_mode, spectra_suffix, filename_header)
metric_df.head()

  0%|          | 0/239 [00:00<?, ?it/s]  1%|▏         | 3/239 [00:00<00:11, 21.43it/s]  3%|▎         | 6/239 [00:00<00:09, 23.32it/s]  4%|▍         | 9/239 [00:00<00:09, 24.40it/s]  5%|▌         | 12/239 [00:00<00:09, 24.08it/s]  6%|▋         | 15/239 [00:00<00:09, 23.67it/s]  8%|▊         | 18/239 [00:00<00:09, 23.09it/s]  9%|▉         | 21/239 [00:02<00:56,  3.83it/s] 10%|▉         | 23/239 [00:04<01:36,  2.23it/s] 10%|█         | 25/239 [00:07<02:06,  1.70it/s] 11%|█         | 26/239 [00:08<02:20,  1.52it/s] 11%|█▏        | 27/239 [00:09<02:34,  1.37it/s] 12%|█▏        | 28/239 [00:10<02:47,  1.26it/s] 12%|█▏        | 29/239 [00:11<02:58,  1.18it/s] 13%|█▎        | 30/239 [00:12<03:09,  1.11it/s] 13%|█▎        | 31/239 [00:13<03:11,  1.09it/s] 13%|█▎        | 32/239 [00:14<03:16,  1.05it/s] 14%|█▍        | 33/239 [00:15<03:20,  1.03it/s] 14%|█▍        | 34/239 [00:16<03:22,  1.01it/s] 15%|█▍        | 35/239 [00:17<03:23,  1.00it/s] 15%|█▌        | 36/239 [00:18<0

Computing MEMO matrix from unaligned samples took: 131.84375 seconds


  memo_unaligned.memo_matrix.index = memo_unaligned.memo_matrix.index.str.replace(spectra_suffix, "")


In [None]:
# Remove experimental controls and blanks from the MEMO matrix.
# Every sample whose value in the filtering column matches one of these strings
# is dropped — change the strings as needed.
list_of_strings_for_QC_Blank_filter = ['blank', 'QC']
# Metadata column used for the filtering; this information must be present in
# the metadata table.
column_to_use_for_filtering = sampletype_header
# NOTE(review): metric_df is overwritten in place, so the unfiltered matrix is
# only recoverable by re-running the MEMO cell (or reloading it from disk).
metric_df = drop_samples_based_on_string_ind(
    repository_path,
    ionization_mode,
    filename_header,
    sampletype_header,
    metric_df,
    metadata_df,
    list_of_strings_for_QC_Blank_filter,
    column_to_use_for_filtering,
)
#metric_df.head()

(239, 68558)
(182, 68558)


In [None]:
#if metric_df was previously calculated: 
#metric_df = load_metric_df(repository_path, ionization_mode)
#metric_df.head()

#### SC.2. SC calculation

In [None]:
# SC.2 — compute the similarity component from the (filtered) MEMO matrix and
# preview the first ten rows. The saved output shows three anomaly_* columns
# (IF, LOF, OCSVM) next to the final SC value.
SC = similarity_component_ind(repository_path, ionization_mode, filename_header, metric_df)
SC.head(10)

Unnamed: 0,sample_id,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,V100448,1,-1,1,1
1,V100921,1,1,-1,1
2,V100958,1,1,1,0
3,V101033,1,1,1,0
4,V101063,1,1,1,0
5,V101091,1,1,1,0
6,V101093,1,1,1,0
7,V101156,1,1,1,0
8,V101157,1,1,1,0
9,V101288,1,1,1,0


# Priority rank Results

In [None]:
# Combine the components into the final priority score, modulated by the
# weights w1..w4 from the parameters cell.
# NOTE(review): the saved output of this cell ends with "KeyError: 'CC'", so
# the last recorded run failed here — presumably the class-component result
# was missing from what the helper reads; verify before relying on PS.
# Arguments are positional; keep the order expected by the helper.
PS = priority_score_ind(
    repository_path,
    filename_header,
    ionization_mode,
    species_column,
    genus_column,
    family_column,
    sppart_column,
    w1,
    w2,
    w3,
    w4,
)
PS.head()

  0%|          | 0/240 [00:00<?, ?it/s]  5%|▍         | 11/240 [00:00<00:02, 109.07it/s] 13%|█▎        | 31/240 [00:00<00:01, 158.14it/s] 21%|██        | 50/240 [00:00<00:01, 168.21it/s] 29%|██▉       | 69/240 [00:00<00:00, 173.79it/s] 37%|███▋      | 88/240 [00:00<00:00, 176.68it/s] 44%|████▍     | 106/240 [00:00<00:00, 174.52it/s] 52%|█████▏    | 125/240 [00:00<00:00, 177.68it/s] 60%|█████▉    | 143/240 [00:00<00:00, 176.19it/s] 68%|██████▊   | 162/240 [00:00<00:00, 179.60it/s] 76%|███████▌  | 182/240 [00:01<00:00, 182.77it/s] 84%|████████▍ | 202/240 [00:01<00:00, 185.77it/s] 92%|█████████▎| 222/240 [00:01<00:00, 187.84it/s]100%|██████████| 240/240 [00:01<00:00, 178.86it/s]


KeyError: 'CC'

### Display results

In [None]:
#Show the results in an interactive way
def selection_changed(selection):
    """Return the rows of the priority-score table ``PS`` at the given positions.

    ``selection`` holds positional row indices (the current selection of the
    LineUp widget); the lookup is position-based via ``iloc``, not label-based.
    """
    picked_rows = PS.iloc[selection]
    return picked_rows
interact(selection_changed, selection=lineup_widget.LineUpWidget(PS));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…