# Inventa: a computational tool to discover structural novelty in natural extract libraries


In [1]:
from __future__ import print_function
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")
import numpy as np
import pandas as pd
import os
import sys 
import lineup_widget
from ipywidgets import *
!jupyter nbextension enable --py --sys-prefix lineup_widget

sys.path.append('../src')
sys.path.append('../gnps_postprocessing/src') 

from import_data import*
from process_data import *
from FC import *
from LC import *
from SC import *
from CC import *
from plot import *

from gnps_download_results import *
from consolidates_structures import *
from gnps_results_postprocess import *

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


# Paths and parameters to define

In [195]:
# Paths and file-name suffixes used to locate each sample's input files:

repository_path= '/mnt/c/Users/quirosgu/Desktop/Indiv_PF1600/'  # Root folder that is walked recursively for the per-sample files. NOTE(review): machine-specific absolute path, adapt it to your system.
quant_table_suffix ='_quant_pos.csv'                 # quantitative tables (used by ind_quant_table)
spectra_suffix= '_pos.mgf'                           # spectra files (used for the MEMO matrix)
metadata_sample_suffix ='_metadata.tsv'              # per-sample metadata tables
isdb_sample_suffix = '_isdb_matched_pos_repond.tsv'  # ISDB annotation tables
sirius_sample_suffix = 'compound_identifications_adducts.tsv'  # Sirius annotation tables
canopus_sample_suffix = 'canopus_summary.tsv'        # Canopus tables (used by the class component)
memo_sample_suffix= '_memo_pos.csv'
file_extention = '.mzXML'                            # raw-file extension; stripped from the quant table column names
polarity = '_pos'

# Metadata headers (column names of the metadata table)

sampletype_header = 'sample_type'
species_column = 'organism_species'
genus_column = 'organism_genus'
family_column = 'organism_family'
filename_header = 'ms_filename'
organe_column = 'organism_organe'

# Parameters for cleaning up annotations from GNPS

max_ppm_error = 5                 # max error in ppm to consider an annotation valid
shared_peaks = 4                 # min number of shared peaks between the experimental MS2 and the database MS2 to consider an annotation valid
min_cosine = 0.6                  # min cosine score to consider an annotation valid
ionisation_mode = 'pos'           # ionisation mode according to the experimental conditions: 'pos' or 'neg'
max_spec_charge = 2

# Quantitative table
data_process_origin = 'MZMine2' # 'MZMine2' or 'MZMine3' (ind_quant_table only applies extra clean-up for 'MZMine3')
use_ion_identity= False  # only read when data_process_origin == 'MZMine3'

# Feature component (FC)

min_specificity = 0.90            # minimum feature specificity to consider

## Organism parts
multiple_organism_parts = True  # True: the specificity is going to be considered as the sum of the 'max_parts_per_organism' shared in the samples.
max_parts_per_organism = 4       # max recurrence of the same organism species (for example: 5 samples, same species but different plant part)

## Inputs to use:
isdb_annotations = True          # True: the ISDB annotation files (isdb_sample_suffix) are treated and considered in the calculations
sirius_annotations = True        # True: the Sirius annotation files (sirius_sample_suffix) are treated and considered in the calculations

## Cut-offs:
min_score_final = 0.3             # cut-off for considering an ISDB annotation valid. Be extremely careful with this parameter; '0.0' as default.
min_ZodiacScore = 0.9             # cut-off for considering a Sirius annotation valid. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.25         # cut-off for considering a Sirius annotation valid. '0.0' as default.

annotation_preference = 0          # Only annotated features: '1', or only non-annotated features: '0'

# Literature component (LC)

LC_component = True               # LC will be calculated

max_comp_reported_sp = 10          # max number of compounds reported at species level; above this value the plant is considered less interesting
max_comp_reported_g = 50         # max number of compounds reported at genus level; above this value the plant is considered less interesting
max_comp_reported_f = 500           # max number of compounds reported at family level; above this value the plant is considered less interesting

# Weight for each taxonomic level (species, genus, family)
ws = 1
wg = 1
wf = 1 

# Similarity component (SC)

SC_component = True                # SC will be calculated

# Class component (CC)

CC_component =  True              # CC will be calculated
min_class_confidence = 0.8       # cut-off for considering a Sirius class valid. It is used in combination with min_recurrence.
min_recurrence = 5              # minimum recurrence of a chemical class to consider it acceptable

# Weights to modulate each component (passed to priority_rank)
w1 = 1           # 1 means the value itself is taken into account; 0.5 means only half of the calculated value is taken into account
w2 = 1
w3 = 1
w4 = 1


### Load Metadata from individual files

In [108]:
# Build the metadata table from the individual per-sample files and preview the first rows.
metadata_df = get_metadata_ind_files(repository_path, metadata_sample_suffix, filename_header, file_extention)
metadata_df.head(5)

Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,organism_subsystem,sample_plate_id,ms_filename,ms_injection_date,bio_leish_donovani_10ugml_inhibition,bio_leish_donovani_2ugml_inhibition,bio_tryp_brucei_rhodesiense_10ugml_inhibition,bio_tryp_brucei_rhodesiense_2ugml_inhibition,bio_tryp_cruzi_10ugml_inhibition,bio_l6_cytotoxicity_10ugml_inhibition
0,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,aboveground,VGF138,VGF138_A03,2017-10-27,36.7,12.2,9.5,6.0,0.0,0.5
1,VGF138_A05,sample,V112033GP-01,Plantae,Tracheophyta,Magnoliopsida,Solanales,Solanaceae,Nicandra,Nicandra physalodes,...,aboveground,VGF138,VGF138_A05,2017-10-27,5.7,6.7,14.1,7.7,2.5,7.5
2,VGF138_A07,sample,V112053GP-01,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,Sophora,Sophora davidii,...,aboveground,VGF138,VGF138_A07,2017-10-27,9.3,10.8,14.6,7.9,0.0,4.2
3,VGF138_A09,sample,V112076GP-01,Plantae,Tracheophyta,Magnoliopsida,Malpighiales,Phyllanthaceae,Breynia,Breynia fruticosa,...,aboveground,VGF138,VGF138_A09,2017-10-27,6.2,5.2,10.1,4.4,1.3,5.1
4,VGF138_A12,blank,blank,,,,,,,,...,,VGF138,VGF138_A12,2017-10-27,,,,,,


In [111]:
# If you need a unique identifier column such as "Species|part", use the following line as a model.
# If the column is already PRESENT in the metadata, do not run this cell.
metadata_df['organism_sppart'] = metadata_df[species_column]+ "|" + metadata_df[organe_column].map(str)
metadata_df.head(5)

Unnamed: 0,sample_id,sample_type,sample_substance_name,organism_kingdom,organism_phylum,organism_class,organism_order,organism_family,organism_genus,organism_species,...,sample_plate_id,ms_filename,ms_injection_date,bio_leish_donovani_10ugml_inhibition,bio_leish_donovani_2ugml_inhibition,bio_tryp_brucei_rhodesiense_10ugml_inhibition,bio_tryp_brucei_rhodesiense_2ugml_inhibition,bio_tryp_cruzi_10ugml_inhibition,bio_l6_cytotoxicity_10ugml_inhibition,organism_sppart
0,VGF138_A03,sample,V111988GP-01,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Scrophulariaceae,Buddleja,Buddleja officinalis,...,VGF138,VGF138_A03,2017-10-27,36.7,12.2,9.5,6.0,0.0,0.5,Buddleja officinalis|leaves
1,VGF138_A05,sample,V112033GP-01,Plantae,Tracheophyta,Magnoliopsida,Solanales,Solanaceae,Nicandra,Nicandra physalodes,...,VGF138,VGF138_A05,2017-10-27,5.7,6.7,14.1,7.7,2.5,7.5,Nicandra physalodes|fruits
2,VGF138_A07,sample,V112053GP-01,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,Sophora,Sophora davidii,...,VGF138,VGF138_A07,2017-10-27,9.3,10.8,14.6,7.9,0.0,4.2,Sophora davidii|green stems
3,VGF138_A09,sample,V112076GP-01,Plantae,Tracheophyta,Magnoliopsida,Malpighiales,Phyllanthaceae,Breynia,Breynia fruticosa,...,VGF138,VGF138_A09,2017-10-27,6.2,5.2,10.1,4.4,1.3,5.1,Breynia fruticosa|leaves
4,VGF138_A12,blank,blank,,,,,,,,...,VGF138,VGF138_A12,2017-10-27,,,,,,,


In [104]:
# Column containing a unique identifier for each sample (e.g. Species_plantpart, Species_solvent, or simply the filename).
# Alternative: 'organism_sppart'
col_id_unique = filename_header

# Start calculating the different components

# Feature Component (FC)

#### FC.1. Clean the quantitative tables and the ISDB / Sirius annotations of the individual files

In [168]:
#AC.X. Load quant information / add annotation status / filter 

def ind_quant_table(repository_path, quant_table_suffix, data_process_origin, use_ion_dentity,
                    file_extention=None):
    """Clean and normalise every individual quantitative table found under a folder.

    The folder tree rooted at ``repository_path`` is walked; each file whose name
    ends with ``quant_table_suffix`` is read as CSV, cleaned, normalised and written
    next to the original as ``treated_<original name>`` (comma separated).

    Parameters
    ----------
    repository_path : str
        Root folder that is searched recursively.
    quant_table_suffix : str
        File-name suffix identifying the quantitative tables.
    data_process_origin : str
        ``'MZMine3'`` triggers removal of the MZmine 3 specific columns; any other
        value leaves the table as exported.
    use_ion_dentity : bool
        Only read when ``data_process_origin == 'MZMine3'``. When true, features are
        collapsed per 'annotation network number' (column-wise maximum). Features
        without a network get the identifier ``'<row ID>x'``. The group key is then
        exposed as 'row ID' so the remaining steps can run.
    file_extention : str, optional
        Raw-file extension removed from the sample column names. When omitted, the
        notebook-level variable ``file_extention`` is used (empty string if absent).

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    Files already starting with ``treated_`` are skipped. Without this, a second run
    would pick up its own output (it still ends with the suffix) and fail.
    """
    prefix = 'treated_'
    feature_cols = ['row ID', 'row m/z', 'row retention time']

    if file_extention is None:
        # Backward compatibility: earlier versions read this value from the notebook globals.
        file_extention = globals().get('file_extention', '')

    for root, _dirs, filenames in os.walk(repository_path):
        for file in filenames:
            if not file.endswith(quant_table_suffix) or file.startswith(prefix):
                continue

            df = pd.read_csv(os.path.join(root, file))

            # Sample columns look like '<sample><extension> Peak area'; keep only '<sample>'.
            df = df.rename(columns=lambda name: name.replace(' Peak area', '').replace(file_extention, ''))
            df = df.drop(columns=list(df.filter(regex='Unnamed:')))
            df = df.sort_index(axis=1)

            if data_process_origin == 'MZMine3':
                if use_ion_dentity:
                    df = df.drop(columns=['row ion mobility',
                                          'row ion mobility unit', 'row CCS', 'best ion',
                                          'correlation group ID', 'auto MS2 verify',
                                          'identified by n=', 'partners', 'neutral M mass'])

                    # Complete the correlation groups: a feature without a network forms its own group.
                    df['annotation network number'] = df['annotation network number'].fillna(
                        df['row ID'].apply(str) + 'x')
                    df = df.drop(columns='row ID')
                    df = df.groupby('annotation network number', dropna=False).max()
                    # The group key becomes the feature identifier used from here on.
                    df = df.reset_index().rename(columns={'annotation network number': 'row ID'})
                else:
                    # 'row ID' must stay a column: it is selected again just below.
                    df = df.drop(columns=['row ion mobility', 'correlation group ID', 'best ion',
                                          'row ion mobility unit', 'row CCS', 'annotation network number',
                                          'auto MS2 verify', 'identified by n=', 'partners',
                                          'neutral M mass'])

            features = df[feature_cols]
            intensities = df.drop(columns=feature_cols)

            # Normalise: every sample column is divided by its own total.
            intensities = intensities.apply(lambda col: col / col.sum(), axis=0)

            # Intensities first, feature descriptors last.
            df = pd.concat([intensities, features], axis=1)

            df = df.rename(columns={'row m/z': 'm/z', 'row retention time': 'retention time (min)'})
            df['m/z'] = df['m/z'].round(decimals=6)
            df['retention time (min)'] = df['retention time (min)'].round(decimals=2)

            # TODO: add ISDB and Sirius annotations, then filter.

            df.to_csv(os.path.join(root, prefix + file), sep=',')

In [169]:
# Clean and normalize the individual quant tables (run it just once!)

ind_quant_table(repository_path, quant_table_suffix, data_process_origin, use_ion_identity)

In [193]:
def get_isdb_annotations_ind(repository_path, isdb_sample_suffix, isdb_annotations, min_score_final):
    """Flag the ISDB annotations of every individual sample as good (1) or bad (0).

    The folder tree rooted at ``repository_path`` is walked; each file whose name
    ends with ``isdb_sample_suffix`` is read (tab separated), reduced to the columns
    of interest, flagged, and written next to the original as
    ``treated_<original name>`` (tab separated).

    Parameters
    ----------
    repository_path : str
        Root folder that is searched recursively.
    isdb_sample_suffix : str
        File-name suffix identifying the ISDB annotation tables.
    isdb_annotations : bool
        When false, nothing is done.
    min_score_final : float
        An annotation is flagged 1 when its ``final_score`` is >= this value.

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    * The flag column is ``Annotated_ISDB``. The previous code wrote the score-based
      flag to ``Annotation_ISDB`` but applied the 'MS1_match' override to
      ``Annotated_ISDB``, so the override never reached the score-based flag.
    * Files already starting with ``treated_`` are skipped so the function can be
      re-run without producing ``treated_treated_*`` files.
    """
    if not isdb_annotations:
        return

    prefix = 'treated_'
    wanted_columns = ['feature_id', 'libname', 'structure_molecular_formula',
                      'structure_inchi', 'final_score']

    for root, _dirs, filenames in os.walk(repository_path):
        for file in filenames:
            if not file.endswith(isdb_sample_suffix) or file.startswith(prefix):
                continue

            df = pd.read_csv(os.path.join(root, file), sep='\t', usecols=wanted_columns,
                             low_memory=False)

            # Cells may hold several '|'-separated candidates; keep the last one.
            df['final_score'] = df['final_score'].astype(str).str.split('|').str[-1].astype(float)
            df['libname'] = df['libname'].str.split('|').str[-1].astype(str)
            df['structure_molecular_formula'] = (
                df['structure_molecular_formula'].str.split('|').str[-1].astype(str))

            # Quality filter: 1 = good annotation, 0 = bad annotation (NaN scores compare false -> 0).
            df['Annotated_ISDB'] = (df['final_score'] >= min_score_final).astype(int)
            # 'MS1_match' entries are never counted as annotated, whatever their score.
            df.loc[df['libname'] == 'MS1_match', 'Annotated_ISDB'] = 0

            df.to_csv(os.path.join(root, prefix + file), sep='\t')

In [194]:
# Treat the individual ISDB annotations (writes a 'treated_' copy next to each ISDB file)

get_isdb_annotations_ind(repository_path, isdb_sample_suffix, isdb_annotations, min_score_final)

In [203]:
def get_sirius_annotations_ind(repository_path, sirius_sample_suffix, min_ConfidenceScore, min_ZodiacScore,
                               sirius_annotations=None):
    """Flag the Sirius annotations of every individual sample as good (1) or bad (0).

    The folder tree rooted at ``repository_path`` is walked; each file whose name
    ends with ``sirius_sample_suffix`` is read (tab separated), reduced to the
    columns of interest, flagged, and written next to the original as
    ``treated_<original name>`` (tab separated).

    Parameters
    ----------
    repository_path : str
        Root folder that is searched recursively.
    sirius_sample_suffix : str
        File-name suffix identifying the Sirius annotation tables.
    min_ConfidenceScore : float
        Minimum ``ConfidenceScore`` for a good annotation.
    min_ZodiacScore : float
        Minimum ``ZodiacScore`` for a good annotation. Both thresholds must be met.
    sirius_annotations : bool, optional
        When false, nothing is done. When omitted, the notebook-level variable
        ``sirius_annotations`` is used (True if it is not defined). Earlier versions
        always read that global.

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    Files already starting with ``treated_`` are skipped. Without this, a second run
    would pick up its own output, which no longer has the ``id`` column, and fail.
    """
    if sirius_annotations is None:
        sirius_annotations = globals().get('sirius_annotations', True)
    if not sirius_annotations:
        return

    prefix = 'treated_'
    wanted_columns = ['id', 'molecularFormula', 'ConfidenceScore', 'ZodiacScore', 'adduct', 'name']

    for root, _dirs, filenames in os.walk(repository_path):
        for file in filenames:
            if not file.endswith(sirius_sample_suffix) or file.startswith(prefix):
                continue

            df = pd.read_csv(os.path.join(root, file), sep='\t', usecols=wanted_columns,
                             low_memory=False)

            # The feature number is the last '_'-separated token of the Sirius id.
            df['shared name'] = df['id'].str.split('_').str[-1].astype(int)
            # A missing score counts as 0, i.e. it never passes a positive threshold.
            df['ConfidenceScore'] = df['ConfidenceScore'].fillna(0)
            df['ZodiacScore'] = df['ZodiacScore'].fillna(0)
            df = df.drop(columns='id')

            # 1 = good annotation (both thresholds met), 0 = bad annotation.
            passes = (df['ConfidenceScore'] >= min_ConfidenceScore) & (df['ZodiacScore'] >= min_ZodiacScore)
            df['Annotated_Sirius'] = passes.astype(int)

            df.to_csv(os.path.join(root, prefix + file), sep='\t')

In [204]:
# Treat the individual Sirius annotations (writes a 'treated_' copy next to each Sirius file)

get_sirius_annotations_ind(repository_path, sirius_sample_suffix, min_ConfidenceScore, min_ZodiacScore)

# Literature Component (LC)


#### LC.1. LC computation

In [92]:
# Compute the literature component from the metadata taxonomy columns, the reported-compound
# limits (species / genus / family) and their weights, then display the resulting table.
LC = literature_component(LC_component, metadata_df, filename_header, species_column, genus_column, family_column, max_comp_reported_sp, max_comp_reported_g, max_comp_reported_f, ws, wg, wf)
LC

Unnamed: 0,ms_filename,organism_family,organism_genus,organism_species,Reported_comp_Species,Reported_comp_Genus,Reported_comp_Family,LC
0,LQ-01-61-01,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
1,LQ-01-61-02,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
2,LQ-01-61-03,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
3,LQ-01-61-04,Celastraceae,Catha,Catha edulis,126.0,126.0,6064.0,0.72752
4,LQ-01-61-05,Celastraceae,Celastrus,Celastrus orbiculatus,212.0,732.0,6064.0,0.52032
...,...,...,...,...,...,...,...,...
73,LQ-01-61-74,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,0.00000
74,LQ-01-61-75,Celastraceae,Tripterygium,Tripterygium wilfordii,1011.0,1353.0,6064.0,0.00000
75,LQ-01-61-76,0,0,0,0.0,0.0,0.0,1.00000
76,LQ-01-61-77,0,0,0,0.0,0.0,0.0,1.00000


# Similarity component (SC)

#### SC.1. SC computation

In [101]:
# Calculate the MEMO matrix from the individual spectra files.
# NOTE: the saved output reports about one minute for 78 samples.
metric_df = calculate_memo_matrix_ind_files(repository_path, spectra_suffix, filename_header)
metric_df.head(5)

100%|██████████| 78/78 [01:00<00:00,  1.29it/s]


Computing MEMO matrix from unaligned samples took: 62.578125 seconds



The default value of regex will change from True to False in a future version.



Unnamed: 0,ms_filename,peak@53.04,peak@57.07,peak@77.04,peak@78.03,peak@83.09,peak@85.07,peak@91.05,peak@93.07,peak@95.05,...,peak@662.27,peak@470.31,peak@655.05,peak@386.73,peak@411.57,peak@611.62,peak@340.64,peak@569.51,peak@704.01,peak@717.90
0,LQ-01-61-01,55.0,258.0,19.0,66.0,249.0,158.0,250.0,345.0,267.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02,31.0,310.0,5.0,16.0,347.0,133.0,229.0,411.0,134.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LQ-01-61-03,26.0,249.0,9.0,35.0,316.0,90.0,275.0,469.0,111.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LQ-01-61-04,49.0,291.0,15.0,66.0,331.0,135.0,284.0,450.0,234.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LQ-01-61-05,48.0,392.0,11.0,36.0,372.0,145.0,286.0,399.0,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# Preview the last rows of the MEMO matrix.
metric_df.tail()

Unnamed: 0,ms_filename,peak@53.04,peak@57.07,peak@77.04,peak@78.03,peak@83.09,peak@85.07,peak@91.05,peak@93.07,peak@95.05,...,peak@662.27,peak@470.31,peak@655.05,peak@386.73,peak@411.57,peak@611.62,peak@340.64,peak@569.51,peak@704.01,peak@717.90
73,LQ-01-61-74,38.0,317.0,10.0,23.0,378.0,120.0,308.0,481.0,158.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74,LQ-01-61-75,50.0,380.0,5.0,0.0,441.0,166.0,342.0,486.0,159.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,LQ-01-61-77,1.0,22.0,1.0,0.0,16.0,1.0,7.0,7.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,LQ-01-61-78,13.0,334.0,2.0,40.0,201.0,124.0,157.0,248.0,90.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
77,LQ-01-61-76,0.0,16.0,0.0,0.0,13.0,2.0,3.0,6.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [103]:
# Remove the experimental controls and blanks from the MEMO matrix.
list_of_strings_for_QC_Blank_filter = ['blank', 'qc'] # erase all the blanks and QCs - change the strings as needed
column_to_use_for_filtering = sampletype_header # this information should be included in the metadata table
# NOTE(review): sampletype_header is passed twice below (directly and as column_to_use_for_filtering).
metric_df = drop_samples_based_on_string_ind(metric_df,metadata_df, filename_header, sampletype_header,'metric_df', list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
#metric_df.head()


(78, 66836)
(76, 66836)


In [89]:
# Compute the similarity component from the filtered MEMO matrix and preview it.
# The saved output has one anomaly_* column per detector (IF, LOF, OCSVM) plus the final SC value.
SC = similarity_component(metric_df, SC_component, filename_header)
SC.head(10)

Unnamed: 0,ms_filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-02_pos,1,1,1,0
1,LQ-01-61-03_pos,1,1,1,0
2,LQ-01-61-04_pos,1,1,1,0
3,LQ-01-61-05_pos,1,1,1,0
4,LQ-01-61-06_pos,1,1,1,0
5,LQ-01-61-07_pos,1,-1,-1,1
6,LQ-01-61-08_pos,1,1,1,0
7,LQ-01-61-09_pos,1,-1,-1,1
8,LQ-01-61-10_pos,1,1,1,0
9,LQ-01-61-11_pos,1,1,1,0


# Class component (CC)

In [82]:
# Compute the class component from the individual Canopus files and display it.
# NOTE(review): the saved output of this cell is "KeyError: 'partial_filename'", so it did not
# complete in the last run. TODO: confirm which input is expected to carry that column.
CC= class_component_ind_files(CC_component, repository_path, canopus_sample_suffix, metadata_df, filename_header, species_column, genus_column, family_column)
CC

KeyError: 'partial_filename'

# Priority rank Results

In [None]:
# Combine the components into the priority rank, using the component switches and the weights w1..w4.
PR = priority_rank(LC_component, SC_component, CC_component, w1, w2, w3, w4)
#PR.head()

In [None]:
# Build the table returned by Cyt_format, keyed by the unique sample identifier column.
# NOTE(review): presumably a Cytoscape-ready export, confirm against Cyt_format.
Cyt_format_visualization = Cyt_format(col_id_unique)

### Display results

In [None]:
# Show the results in an interactive way
def selection_changed(selection):
    """Return the rows of the priority-rank table PR at the positions in `selection`."""
    return PR.iloc[selection]
# The LineUp widget supplies `selection`; the trailing ';' suppresses the cell's text output.
interact(selection_changed, selection=lineup_widget.LineUpWidget(PR));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…