# Inventa: a computational tool to discover structural novelty in natural  extracts libraries


In [1]:
from __future__ import print_function
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import sys 
import lineup_widget
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from AC import *
from LC import *
from SC import *
from CC import *
from plot import *

from gnps_download_results import *
from consolidates_structures import *
from gnps_results_postprocess import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [2]:
# Suffixes necessary for the job: 

repository_path= '/mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/'  # The path were you want your folder to be placed
quant_table_suffix ='_quant_pos.csv'
spectra_suffix= '_pos.mgf'
metadata_sample_suffix ='_metadata.tsv'
isdb_sample_suffix = '_isdb_matched_pos_repond.tsv'
sirius_sample_suffix = 'compound_identifications_adducts.tsv'
canopus_sample_suffix = 'npc_summary.csv' #'_summary_adducts.tsv' #'canopus_summary.tsv'
memo_sample_suffix= '_memo_pos.csv'
file_extention = '.mzXML'
ionization_mode = 'pos'

# metadata headers

sampletype_header = 'sample_type'
species_column = 'organism_species'
genus_column = 'organism_genus'
family_column = 'organism_family'
filename_header = 'ms_filename'
organe_column = 'organism_organe'


#quantitative table
data_process_origin = 'MZMine2' #'MZMine2'
use_ion_identity= False  #False

#Annotation component 

intensity_filter  = True
quantile_filter = True

min_threshold = 0.002
quantile_threshold = 0.75


## cut-offs: 
min_score_final = 0.3             #cut-off filter for considering an isdb annotation valable. You must be extremenly carefull with this parameter, '0.0' as default.
min_ZodiacScore = 0.9             #cut-off filter for considering a sirius annotation valable. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         #cut-off filter for considering a sirius annotation valable. '0.0' as default.

#Literature_component

LC_component = True               # LC will be calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level, more than this value, the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level,more than this value, the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at genus level,more than this value, the plant is considered less interesting

#weight for each taxonomic level 
ws = 1
wg = 1
wf = 1 

#Similarity_component

SC_component = True                # SC will be calculated

#Class_component

CC_component =  True              # CC will be calculated
min_class_confidence = 0.8       #cut-off filter for considering a sirius class valable. It is used in combination with min_recurrence.
min_recurrence = 5              # minimum recurrence of a chemical class to consider it acceptable

#specify the weight to modulate each component 
w1 = 1           # 1 means the value itself is taken into account. A 0.5 means onle half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


### Load Metadata from individual files

In [3]:
metadata_df = get_metadata_ind_files(repository_path, metadata_sample_suffix, filename_header, file_extention)
metadata_df.head(5)

Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,query_otol_family,query_otol_tribe,query_otol_genus,query_otol_species,ott.type,ott.value,wd.type,wd.value,img.type,img.value
0,VGF138_A01,qc,qc_mix,,,,,,,,...,,,,,,,,,,
1,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,Paeonia suffruticosa,...,,,,,,,,,,
2,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,paeonia suffruticosa,...,Paeoniaceae,,Paeonia,Paeonia suffruticosa,literal,137620.0,uri,http://www.wikidata.org/entity/Q163018,uri,http://commons.wikimedia.org/wiki/Special:File...
3,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,,,,,,,,,,
4,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,buddleja officinalis,...,Scrophulariaceae,Buddlejeae,Buddleja,Buddleja officinalis,literal,129762.0,uri,http://www.wikidata.org/entity/Q4984693,uri,http://commons.wikimedia.org/wiki/Special:File...


In [4]:
#if you need to create an unique identifier column like Species|part, use as model the followin line. IF the colum is PRESENT, then don't run it.
metadata_df['organism_sppart'] = metadata_df[species_column]+ "|" + metadata_df[organe_column].map(str)
metadata_df.head(5)

Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,query_otol_tribe,query_otol_genus,query_otol_species,ott.type,ott.value,wd.type,wd.value,img.type,img.value,organism_sppart
0,VGF138_A01,qc,qc_mix,,,,,,,,...,,,,,,,,,,
1,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,Paeonia suffruticosa,...,,,,,,,,,,Paeonia suffruticosa|leaves
2,VGF138_A02,sample,V111819GP-01,Plantae,Tracheophyta,Magnoliopsida,Saxifragales,Paeoniaceae,Paeonia,paeonia suffruticosa,...,,Paeonia,Paeonia suffruticosa,literal,137620.0,uri,http://www.wikidata.org/entity/Q163018,uri,http://commons.wikimedia.org/wiki/Special:File...,paeonia suffruticosa|leaves
3,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,,,,,,,,,,Buddleja officinalis|leaves
4,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,buddleja officinalis,...,Buddlejeae,Buddleja,Buddleja officinalis,literal,129762.0,uri,http://www.wikidata.org/entity/Q4984693,uri,http://commons.wikimedia.org/wiki/Special:File...,buddleja officinalis|leaves


In [5]:
col_id_unique = filename_header #'organism_sppart'  #column containing an unique identifier for each sample, like Species_plantpart, Species_solvent. It could be the filename

# Start calculation the diferent components

# Annotation Component (FC)

#### AC.1. Process, clean and merge the quant tables, sirius and isdb annotations

In [7]:
ind_quant_table_full(repository_path, ionization_mode, data_process_origin, file_extention, use_ion_identity, min_score_final, min_ConfidenceScore, min_ZodiacScore)

100%|██████████| 17/17 [00:00<00:00, 20.40it/s]

Result are in : /mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/results/VGF138_B04_pos_quant_annotations.tsv





In [21]:
file_extention

'.mzXML'

In [22]:
path = os.path.normpath(repository_path)
samples_dir = [directory for directory in os.listdir(path)]

files = []
original_feature_count = []
feature_count_filtered = []
annotated_features_count = []

for directory in tqdm(samples_dir):
    quant_annotations_path = os.path.join(path, path +'/results/', directory + '_'+ionization_mode + '_quant_annotations.tsv')
    
    column = os.path.join(path, directory, directory + file_extention)
    column = column.rsplit('/',1)[1]

100%|██████████| 17/17 [00:00<00:00, 110376.42it/s]


In [23]:
column

'VGF138_B04.mzXML'

#### AC.2. Calculate the annotation rate of each sample

In [24]:
def annotation_component(repository_path, ionization_mode, file_extention, intensity_filter, quantile_filter, min_threshold, quantile_threshold):

    path = os.path.normpath(repository_path)
    samples_dir = [directory for directory in os.listdir(path)]

    files = []
    original_feature_count = []
    feature_count_filtered = []
    annotated_features_count = []

    for directory in tqdm(samples_dir):
        quant_annotations_path = os.path.join(path, path +'/results/', directory + '_'+ionization_mode + '_quant_annotations.tsv')
        
        column = os.path.join(path, directory, directory + file_extention)
        column = column.rsplit('/',1)[1]

        try:
            df_original = pd.read_csv(quant_annotations_path, sep='\t')
        except FileNotFoundError:
            continue
        except NotADirectoryError:
            continue
        
        #recover filenames 
        files.append(directory)
        #read original filename
        df_original = pd.read_csv(quant_annotations_path, sep='\t')
        #normalize
        df_original[column] = df_original[column]/df_original[column].sum()

        #original number of features
        dfo = df_original[column]
        dfo = dfo[dfo>0.0].count()
        original_feature_count.append(dfo)

        #check and apply filtering steps if applicable

        if intensity_filter == True and quantile_filter == True:
                
            dff = df_original.copy()
            #apply intensity filter
            dff[column].values[dff[column] < min_threshold] = 0 #change all the values lower than x for 0 in the dataframe
            dff[column] = dff[column]/dff[column].sum() #once the data was filtered, the table is normalized sample-wise

            #apply quantile filtering
            dff = dff.replace (0, np.nan)
            dff = dff[dff[column] < dff[column].quantile(quantile_threshold)]#change all the values lower than x quantile for 0 in the dataframe
            dff[column] = dff[column]/dff[column].sum()#once the data was filtered, the table is normalized sample-wis

        elif intensity_filter == True and quantile_filter == False:
            
            dff = df_original.copy()
            #apply intensity filter
            dff[column].values[dff[column] < min_threshold] = 0 #change all the values lower than x for 0 in the dataframe
            dff[column] = dff[column]/dff[column].sum() #once the data was filtered, the table is normalized sample-wise

        elif intensity_filter == False and quantile_filter == True:
            dff = df_original.copy()
            #apply quantile filtering
            dff = dff.replace (0, np.nan)
            dff = dff[dff[column] < dff[column].quantile(quantile_threshold)]#change all the values lower than x quantile for 0 in the dataframe
            dff[column] = dff[column]/dff[column].sum()#once the data was filtered, the table is normalized sample-wis

        else:
            dff = df_original

        #number of features after filtering
        dffc = dff[column]
        dffc = dffc[dffc>0.0].count()
        feature_count_filtered.append(dffc)


        #number of features after filtering annotated
        dfa = dff[[column, 'annotation']]
        dfa = dfa[dfa['annotation'] == 1]
        dfa = dfa[column]
        dfac = dfa[dfa>0.0].count()
        annotated_features_count.append(dfac)

    AC = pd.DataFrame({'ms_filename': files,'initial_features': original_feature_count, 'features_after_filtering' : feature_count_filtered, 'Annot_features_after_filtering': annotated_features_count })
    AC['AC'] = AC['Annot_features_after_filtering']/AC['features_after_filtering']*100
    AC['AC'] = AC['AC'].round(decimals = 1)
    return AC

In [25]:
AC = annotation_component(repository_path, ionization_mode, intensity_filter, quantile_filter, min_threshold, quantile_threshold, file_extention)
AC

  0%|          | 0/17 [00:00<?, ?it/s]


TypeError: can only concatenate str (not "bool") to str

# Literature Component (LC)


#### LC.1. LC computation

In [92]:
LC = literature_component(LC_component, metadata_df, filename_header, species_column, genus_column, family_column, max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f, ws, wg, wf)
LC

Unnamed: 0,ms_filename,organism_family,organism_genus,organism_species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,LQ-01-61-01,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
1,LQ-01-61-02,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
2,LQ-01-61-03,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
3,LQ-01-61-04,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
4,LQ-01-61-05,Celastraceae,Celastrus,Celastrus orbiculatus,212.0,732.0,6064.0,0.52032
...,...,...,...,...,...,...,...,...
73,LQ-01-61-74,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,0.00000
74,LQ-01-61-75,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,0.00000
75,LQ-01-61-76,0,0,0,0.0,0.0,0.0,1.00000
76,LQ-01-61-77,0,0,0,0.0,0.0,0.0,1.00000


# Similarity component (SC)

#### SC.1. SC computation

In [101]:
#calculate MEMO matrix from individual files
metric_df = calculate_memo_matrix_ind_files(repository_path, spectra_suffix, filename_header)
metric_df.head(5)

100%|██████████| 78/78 [01:00<00:00,  1.29it/s]


Computing MEMO matrix from unaligned samples took: 62.578125 seconds



The default value of regex will change from True to False in a future version.



Unnamed: 0,ms_filename,peak@53.04,peak@57.07,peak@77.04,peak@78.03,peak@83.09,peak@85.07,peak@91.05,peak@93.07,peak@95.05,...,peak@662.27,peak@470.31,peak@655.05,peak@386.73,peak@411.57,peak@611.62,peak@340.64,peak@569.51,peak@704.01,peak@717.90
0,LQ-01-61-01,55.0,258.0,19.0,66.0,249.0,158.0,250.0,345.0,267.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02,31.0,310.0,5.0,16.0,347.0,133.0,229.0,411.0,134.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LQ-01-61-03,26.0,249.0,9.0,35.0,316.0,90.0,275.0,469.0,111.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LQ-01-61-04,49.0,291.0,15.0,66.0,331.0,135.0,284.0,450.0,234.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LQ-01-61-05,48.0,392.0,11.0,36.0,372.0,145.0,286.0,399.0,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
#remove experimental controls and blancks
list_of_strings_for_QC_Blank_filter = ['blank', 'qc'] #erase all the blanks and QC's - Change the string as needed
column_to_use_for_filtering = sampletype_header #this information should be included in the metadata table
metric_df = drop_samples_based_on_string_ind(metric_df,metadata_df, filename_header, sampletype_header,'metric_df', list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
#metric_df.head()


(78, 66836)
(76, 66836)


In [89]:
SC = similarity_component(metric_df, SC_component, filename_header)
SC.head(10)

Unnamed: 0,ms_filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-02_pos,1,1,1,0
1,LQ-01-61-03_pos,1,1,1,0
2,LQ-01-61-04_pos,1,1,1,0
3,LQ-01-61-05_pos,1,1,1,0
4,LQ-01-61-06_pos,1,1,1,0
5,LQ-01-61-07_pos,1,-1,-1,1
6,LQ-01-61-08_pos,1,1,1,0
7,LQ-01-61-09_pos,1,-1,-1,1
8,LQ-01-61-10_pos,1,1,1,0
9,LQ-01-61-11_pos,1,1,1,0


# Class component (CC)

In [82]:
CC= class_component_ind_files_PF1600(CC_component, repository_path, canopus_sample_suffix, min_class_confidence, metadata_df, filename_header, species_column, genus_column, family_column)
CC.head()

Unnamed: 0,ms_filename,NPC#class,organism_species,organism_genus,organism_family,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,New_CC_in_sp,New_CC_in_genus,CCs,CCg,CC
0,VGF138_A03.mzXML,"{Cholestane steroids, Dipeptides, Dicarboxylic...",Buddleja officinalis,Buddleja,Scrophulariaceae,"{Monosaccharides, Cycloartane triterpenoids|Er...","{Monosaccharides, Cycloartane triterpenoids|Er...","{Cholestane steroids, Dipeptides, Dicarboxylic...","{Cholestane steroids, Dipeptides, Dicarboxylic...",0.5,0.5,1.0
1,VGF138_A05.mzXML,"{Ergostane steroids, Cholestane steroids, Dipe...",Nicandra physalodes,Nicandra,Solanaceae,"{Cycloartane triterpenoids|Lanostane, Tirucall...","{Cycloartane triterpenoids|Lanostane, Tirucall...","{Ergostane steroids, Ceramides, Cholestane ste...","{Ergostane steroids, Ceramides, Cholestane ste...",0.5,0.5,1.0
2,VGF138_A07.mzXML,"{Isoflavones, Quinolizidine alkaloids, Flavan-...",Sophora davidii,Sophora,Fabaceae,"{Pterocarpan, Isoflavones, Flavanones, Quinoli...","{Pterocarpan, Isoflavones, Flavanones, Quinoli...","{Ceramides, Macrolide lactones, Flavan-3-ols, ...","{Ceramides, Macrolide lactones, Flavan-3-ols, ...",0.5,0.5,1.0
3,VGF138_A09.mzXML,"{Cholestane steroids, Macrolide lactones, Estr...",Breynia fruticosa,Breynia,Phyllanthaceae,"{Ergostane steroids, Megastigmanes, Cholestane...","{Ergostane steroids, Megastigmanes, Cholestane...","{Macrolide lactones, Cholestane steroids, Estr...","{Cholestane steroids, Macrolide lactones, Estr...",0.5,0.5,1.0
4,VGF138_B03.mzXML,"{Kavalactones and derivatives, Dipeptides, Sph...",Caesalpinia enneaphylla,Caesalpinia,Fabaceae,nothing in DB,nothing in DB,nothing in DB,nothing in DB,0.0,0.0,1.0


# Priority rank Results

In [197]:
PR = priority_rank(LC_component, SC_component, CC_component, w1, w2, w3, w4)
#PR.head()

TypeError: priority_rank() missing 5 required positional arguments: 'w1', 'w2', 'w3', 'w4', and 'filename_header'

In [None]:
Cyt_format_visualization = Cyt_format(col_id_unique)

### Display results

In [None]:
#Show the results in an interactive way
def selection_changed(selection):
    return PR.iloc[selection]
interact(selection_changed, selection=lineup_widget.LineUpWidget(PR));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…