# INVENTA - Prioritization of natural extracts for chemical originality discovery


In [1]:
import sys 
sys.path.append('../src')

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import plotly.express as px
import zipfile

import lineup_widget
from __future__ import print_function
from ipywidgets import interact, interactive, interact_manual

from sklearn.metrics import pairwise_distances
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing
from skbio.stats.ordination import pcoa
from skbio import OrdinationResults

from inventa import quant_table
from inventa import reduce_df
from inventa import full_data
from inventa import top_ions
from inventa import annotations
from inventa import feature_component
from inventa import literature_component
from inventa import similarity_component
from inventa import sirius_classes
from inventa import search_reported_class
from inventa import class_component
from inventa import process_gnps_results
from inventa import drop_samples_based_on_string
from inventa import process_gnps_results

In [2]:
from PIL import Image                                                                                
img = Image.open('../src/Priority_rank.png')
img.show()

# Parameters to define:

In [3]:
#Input filenames: drag them in the data folder

metadata_filename = '../data/Celastraceae_Set_metadata_pos.tsv'
quantitative_data_filename = '../data/Celastraceae_pos_quant.csv'
reponderation_taxo_results_filename = '../data/Celastraceae_pos_spectral_match_results_repond.tsv'
vectorized_data_filename = '../data/Celastraceae_memomatrix.csv'
sirius_results_filename = '../data/canopus_npc_summary.tsv'

## Download GNPS results

In [4]:
!del all_annotations.zip

# Replace the job ID in the line below <<< ====
!curl -d "" "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=4c919fcbc83d487493a487012afb920a&view=view_all_annotations_DB" -o ../data/all_annotations.zip

! unzip -q -d ../data/all_annotations/ -o ../data/all_annotations.zip
print('==================')
print('GNPS job results were succesfully downloaded')

clusterinfosummary = process_gnps_results('../data/all_annotations/')

zsh:1: command not found: del
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 38.9M    0 38.9M    0     0   564k      0 --:--:--  0:01:10 --:--:--  502k:--:--     00 --:--:--  0:00:19 --:--:--  541k10.9M    0 10.9M    0     0   484k      0 --:--:--  0:00:23 --:--:--  556k-:--:--  0:00:32 --:--:--  578k-:--:--  0:00:45 --:--:--  493k
GNPS job results were succesfully downloaded
FBMN job detected
   Number of spectral library annotations in job = 1836
   Number of network nodes in the job = 14970


## Prepare input files

### Metadata table

In [7]:
#metadata 
metadata_df = pd.read_csv(metadata_filename, sep='\t')

#if a column Species\part is NOT present run the following command, if IS PRESENT don't run.
metadata_df['ATTRIBUTE_Sppart'] = metadata_df["ATTRIBUTE_Species"]+ "|" + metadata_df["ATTRIBUTE_Organe"].map(str)
metadata_df.head(3)

Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem,ATTRIBUTE_Sppart
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground,Catha edulis|Leaves
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground,Catha edulis|Stems
2,LQ-01-61-03_pos.mzXML,V107696,Sample,Celastraceae,Catha,Catha edulis,Roots,roots,root tissue,belowground,Catha edulis|Roots


### Quantification table

In [8]:
#Quantitative table
quant_df = pd.read_csv(quantitative_data_filename, sep=',', index_col='row ID')
quant_df = quant_table(quant_df)
quant_df.head(3)

Unnamed: 0_level_0,LQ-01-61-01_pos.mzXML,LQ-01-61-02_pos.mzXML,LQ-01-61-03_pos.mzXML,LQ-01-61-04_pos.mzXML,LQ-01-61-05_pos.mzXML,LQ-01-61-06_pos.mzXML,LQ-01-61-07_pos.mzXML,LQ-01-61-08_pos.mzXML,LQ-01-61-09_pos.mzXML,LQ-01-61-10_pos.mzXML,...,LQ-01-61-69_pos.mzXML,LQ-01-61-70_pos.mzXML,LQ-01-61-71_pos.mzXML,LQ-01-61-72_pos.mzXML,LQ-01-61-73_pos.mzXML,LQ-01-61-74_pos.mzXML,LQ-01-61-75_pos.mzXML,LQ-01-61-76_pos.mzXML,LQ-01-61-77_pos.mzXML,LQ-01-61-78_pos.mzXML
row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2600773000.0,31347670.0,84137200.0,69254580.0,0.0,0.0,0.0,0.0,3256476.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,886019600.0,19647810.0,8688867.0,2214498000.0,0.0,0.0,0.0,0.0,1639854.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,854458300.0,25941630.0,0.0,461216300.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Merge tables

In [9]:
full_df = full_data(metadata_df,quant_df)
print(full_df.shape)
full_df.head(2)

(78, 14981)


Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Remove experimental controls

In [10]:
#erase all the blanks and QC's - Change the string as needed
list_of_strings_for_QC_Blank_filter = ['Blank', 'QC']
column_to_use_for_filtering = 'ATTRIBUTE_Type'

full_df = drop_samples_based_on_string(full_df,list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
metadata_df = drop_samples_based_on_string(metadata_df,list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)

(78, 14981)
(76, 14981)
(77, 11)
(76, 11)


In [11]:
# Make a minimal table for further processing
reduced_df = reduce_df(full_df, metadata_df, 'ATTRIBUTE_Sppart')
reduced_df.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
ATTRIBUTE_Sppart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Catha edulis|Leaves,2600773000.0,886019600.0,854458300.0,632016100.0,528280000.0,522718400.0,512211000.0,393613100.0,472246300.0,234727500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Catha edulis|Stems,31347670.0,19647810.0,25941630.0,0.0,7494122.0,210707100.0,0.0,50182830.0,18593770.0,322292700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature component (FC)

### FC. Feature Specifificty

In [12]:
specificity_df = top_ions(reduced_df, quant_df)
specificity_df.head(2)

Unnamed: 0,row ID,filename,ATTRIBUTE_Sppart,Feature_specificity
0,1,LQ-01-61-01_pos.mzXML,Catha edulis|Leaves,93.25882
1,2,LQ-01-61-04_pos.mzXML,Catha edulis|Aerial_parts,69.662011


### FC. Annotation Rate

In [14]:
annot_gnps_df = pd.read_csv(clusterinfosummary, 
                                sep='\t', 
                                usecols=['cluster index','componentindex', 'SpectrumID'], 
                                low_memory=False)
annot_is_df = pd.read_csv(reponderation_taxo_results_filename,
                                sep='\t', 
                                usecols =['libname','feature_id','short_inchikey'], 
                                low_memory=False)

annotation_df = annotations(annot_gnps_df, annot_is_df)
annotation_df.head(2)

Unnamed: 0,cluster index,componentindex,Annotated_GNPS,annotation
0,1,113,0,0
1,2,136,0,0


In [15]:
FC = feature_component(specificity_df, annotation_df, metadata_df)
#FC = FC.sort_values(by=['FC'], ascending=False)
FC.head(2)

Unnamed: 0,filename,ATTRIBUTE_Species,ATTRIBUTE_Sppart,Sample_specificity,FC
5,LQ-01-61-06_pos.mzXML,Celastrus orbiculatus,Celastrus orbiculatus|Roots,0.863281,0.824219
75,LQ-01-61-78_pos.mzXML,Pristimera indica,Pristimera indica|Roots,0.841667,0.813889


# Literature component (LC)


In [16]:
#erase all the blanks and QC's

LC = literature_component(metadata_df)
LC = LC.sort_values(by=['LC'], ascending=False)
LC.head(2)

Unnamed: 0,filename,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,Reported_comp_Family,Reported_comp_Genus,Reported_comp_Species,LC
44,LQ-01-61-36_pos.mzXML,Celastraceae,Euonymus,Euonymus myrianthus,0.0,0.0,0,1
54,LQ-01-61-46_pos.mzXML,Celastraceae,Maytenus,Maytenus inermis,0.0,0.0,0,1


# SIMILARITY COMPONENT (SC)

In [17]:
metric_df = pd.read_csv(vectorized_data_filename, sep=',', encoding= 'unicode_escape')
metric_df.head(3)

Unnamed: 0,filename,peak@53.04,peak@57.07,peak@77.04,peak@78.03,peak@83.09,peak@85.07,peak@91.05,peak@93.07,peak@95.05,...,peak@470.31,peak@185.87,peak@286.39,peak@655.05,peak@158.75,peak@386.73,peak@411.57,peak@611.62,peak@466.50,peak@123.63
0,LQ-01-61-01_pos.mzXML,45.0,246.0,23.0,66.0,239.0,147.0,230.0,293.0,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02_pos.mzXML,32.0,290.0,8.0,16.0,338.0,159.0,285.0,395.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LQ-01-61-03_pos.mzXML,31.0,259.0,8.0,35.0,317.0,131.0,301.0,468.0,131.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
SC = similarity_component(metric_df[:-1])
SC.head(2)



Unnamed: 0,filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-01_pos.mzXML,1,1,1,0
1,LQ-01-61-02_pos.mzXML,1,1,1,0


## Chemical classes from Sirius

### CC. Chemical class

In [19]:
#Upload sirius results 
sirius_df            = pd.read_csv(sirius_results_filename ,
                                sep='\t').dropna()
#SIRIUS.head(2)

sirus_classes_df = sirius_classes(specificity_df, metadata_df,sirius_df)
sirus_classes_df.head(2)

Unnamed: 0,filename,classe
0,LQ-01-61-01_pos.mzXML,"{Simple phenolic acids, Piperidine alkaloids, ..."
1,LQ-01-61-02_pos.mzXML,"{Cholane steroids, Depsipeptides, Tetraketide ..."


### CC. Search for reported chemical classes according to the species in the set

In [20]:
reported_classes_df = search_reported_class(metadata_df)
reported_classes_df.head(2)

Unnamed: 0,filename,ATTRIBUTE_Species,ATTRIBUTE_Genus,ATTRIBUTE_Family,ATTRIBUTE_Family.1,ATTRIBUTE_Sppart,Chemical_class_reported_in_species,Chemical_class_reported_in_genus
0,LQ-01-61-01_pos.mzXML,Catha edulis,Catha,Celastraceae,Celastraceae,Catha edulis|Leaves,"{Stigmastane steroids, Menthane monoterpenoids...","{Stigmastane steroids, Menthane monoterpenoids..."
1,LQ-01-61-02_pos.mzXML,Catha edulis,Catha,Celastraceae,Celastraceae,Catha edulis|Stems,"{Stigmastane steroids, Menthane monoterpenoids...","{Stigmastane steroids, Menthane monoterpenoids..."


### CC. Computation of CC

In [21]:
CC = class_component(reported_classes_df, sirus_classes_df)
CC.head(2)

Unnamed: 0,filename,ATTRIBUTE_Species,ATTRIBUTE_Genus,ATTRIBUTE_Family,ATTRIBUTE_Family.1,ATTRIBUTE_Sppart,Chemical_class_reported_in_species,Chemical_class_reported_in_genus,classe,New_in_species,New_in_genus,CC
0,LQ-01-61-01_pos.mzXML,Catha edulis,Catha,Celastraceae,Celastraceae,Catha edulis|Leaves,"{Stigmastane steroids, Menthane monoterpenoids...","{Stigmastane steroids, Menthane monoterpenoids...","{Simple phenolic acids, Piperidine alkaloids, ...","{Caryophyllane sesquiterpenoids, Polyether ion...","{Caryophyllane sesquiterpenoids, Polyether ion...",1
1,LQ-01-61-02_pos.mzXML,Catha edulis,Catha,Celastraceae,Celastraceae,Catha edulis|Stems,"{Stigmastane steroids, Menthane monoterpenoids...","{Stigmastane steroids, Menthane monoterpenoids...","{Cholane steroids, Depsipeptides, Tetraketide ...","{Tetraketide meroterpenoids, Lanostane, Tiruca...","{Cholane steroids, Depsipeptides, Tetraketide ...",1


# Priority rank Results

In [22]:
PR =pd.merge(
                left=FC,
                right=LC[['filename', 'Reported_comp_Species', 'Reported_comp_Genus', 'LC', 'ATTRIBUTE_Family']], 
                how='left', 
                left_on='filename', 
                right_on='filename')
PR =pd.merge(
                    left=PR,
                    right=SC[['filename', 'SC']], 
                    how='left', 
                    left_on='filename', 
                    right_on='filename')
PR =pd.merge(
                    left=PR,
                    right=CC[['filename', 'New_in_species', 'New_in_genus', 'CC']], 
                    how='left', 
                    left_on='filename', 
                    right_on='filename')
PR['CC'] = PR['CC'].fillna(0)

In [23]:
#specify the weight to modulate each component 
w1 = 1
w2 = 1
w3 = 1
w4 = 1

PR['PR'] = w1*PR['FC'] + w2*PR['LC'] + w3*PR['SC'] + w4*PR['CC']
PR = PR.sort_values(by=['PR'], ascending=False)
PR.head()

Unnamed: 0,filename,ATTRIBUTE_Species,ATTRIBUTE_Sppart,Sample_specificity,FC,Reported_comp_Species,Reported_comp_Genus,LC,ATTRIBUTE_Family,SC,New_in_species,New_in_genus,CC,PR
7,LQ-01-61-37_pos.mzXML,Euonymus sanguineus,Euonymus sanguineus|Roots,0.838532,0.73945,1,515.0,1,Celastraceae,1.0,"{Tetraketide meroterpenoids, Unsaturated fatty...","{Tetraketide meroterpenoids, Unsaturated fatty...",1.0,3.73945
15,LQ-01-61-07_pos.mzXML,Celastrus paniculatus,Celastrus paniculatus|Seeds,0.71345,0.670565,2,910.0,1,Celastraceae,1.0,"{Prenyl quinone meroterpenoids, Arteminisin, S...","{Prenyl quinone meroterpenoids, Arteminisin, D...",1.0,3.670565
0,LQ-01-61-06_pos.mzXML,Celastrus orbiculatus,Celastrus orbiculatus|Roots,0.863281,0.824219,253,910.0,0,Celastraceae,1.0,"{Zearalenones, Prenyl quinone meroterpenoids, ...","{Zearalenones, Prenyl quinone meroterpenoids, ...",1.0,2.824219
4,LQ-01-61-27_pos.mzXML,Euonymus fortunei,Euonymus fortunei|Aerial_parts,0.903226,0.790323,1,515.0,1,Celastraceae,0.0,"{Secoiridoid monoterpenoids, Picrotoxane sesqu...","{Secoiridoid monoterpenoids, Picrotoxane sesqu...",1.0,2.790323
6,LQ-01-61-23_pos.mzXML,Euonymus cochinchinensis,Euonymus cochinchinensis|Leaves,0.805031,0.764151,0,0.0,1,Celastraceae,1.0,,,0.0,2.764151


### Display results

In [27]:
#Show the results in an interactive way
def selection_changed(selection):
    return PR.iloc[selection]
interact(selection_changed, selection=lineup_widget.LineUpWidget(PR));

interactive(children=(LineUpWidget(value=[], description='selection', layout=Layout(align_self='stretch', heig…

In [28]:
#Save it as .csv 
PR.to_csv('../results/INVENTA_results.csv', sep=',')