# INVENTA - Prioritization of natural extracts for chemical originality discovery


In [26]:
from __future__ import print_function

import os
import sys
import zipfile

sys.path.append('../src')

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import plotly.express as px
import yaml

import lineup_widget
from ipywidgets import interact, interactive, interact_manual
!jupyter nbextension enable --py --sys-prefix lineup_widget

from sklearn.metrics import pairwise_distances
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing
from skbio.stats.ordination import pcoa
from skbio import OrdinationResults

from inventa import quant_table
from inventa import reduce_df
from inventa import full_data
from inventa import top_ions
from inventa import annotations
from inventa import feature_component
from inventa import literature_component
from inventa import similarity_component
from inventa import sirius_classes
from inventa import search_reported_class
from inventa import class_component
from inventa import process_gnps_results
from inventa import drop_samples_based_on_string
from inventa import process_gnps_results

from plot import pcoa_2d

Enabling notebook extension lineup_widget/extension...
      - Validating: [32mOK[0m


![title](../src/Priority_rank.png)

In [27]:
#loading parameters:

# Read the user-defined YAML configuration. Every later cell depends on
# params_list, so a missing file stops the run with instructions instead of
# printing a hint and then failing inside open().
config_path = r'../configs/user_defined/default.yaml'
try:
    with open(config_path) as file:
        params_list = yaml.load(file, Loader=yaml.FullLoader)
except FileNotFoundError as err:
    raise FileNotFoundError(
        'No configs/user_defined/default.yaml: copy from configs/default/default.yaml and modifiy according to your needs'
    ) from err

# Each top-level section of the config is a list of single-key mappings, so a
# setting is addressed by its position in the section and then by its key.

# Feature component (FC) settings
FC_component = params_list['Feature_component'][0]['calculate_FC']
min_specificity = params_list['Feature_component'][1]['min_specificity']
only_feature_specificity = params_list['Feature_component'][2]['only_feature_specificity']
only_gnps_annotations = params_list['Feature_component'][3]['only_gnps_annotations']
only_ms2_annotations = params_list['Feature_component'][4]['only_ms2_annotations']
annotation_preference = params_list['Feature_component'][5]['annotation_preference']

# Literature component (LC) settings
LC_component = params_list['Literature_component'][0]['calculate_LC']
max_comp_reported = params_list['Literature_component'][1]['max_comp_reported']
min_comp_reported = params_list['Literature_component'][2]['min_comp_reported']

# Class component (CC) settings
CC_component = params_list['Class_component'][0]['calculate_CC']

# Similarity component (SC) settings
SC_component = params_list['Similarity_component'][0]['calculate_SC']

# Input file locations
metadata_filename = params_list['paths'][0]['metadata_filename']
quantitative_data_filename = params_list['paths'][1]['quantitative_data_filename']
tima_results_filename = params_list['paths'][2]['tima_results_filename']
vectorized_data_filename = params_list['paths'][3]['vectorized_data_filename']
canopus_npc_summary_filename = params_list['paths'][4]['canopus_npc_summary_filename']
sirius_annotations_filename = params_list['paths'][5]['sirius_annotations_filename']

### Download GNPS results

In [8]:
!del all_annotations.zip

# Replace the job ID in the line below <<< ====
!curl -d "" "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=4c919fcbc83d487493a487012afb920a&view=view_all_annotations_DB" -o ../data/all_annotations.zip
!pwd
!ls ../data
!unzip -q -d ../data/all_annotations/ -o ../data/all_annotations.zip

print('==================')
print('GNPS job results were succesfully downloaded')

clusterinfosummary = process_gnps_results('../data/all_annotations')

/bin/bash: del: command not found
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 38.9M    0 38.9M    0     0  1563k      0 --:--:--  0:00:25 --:--:-- 3894k36k      0 --:--:--  0:00:09 --:--:--  235k     0 --:--:--  0:00:18 --:--:-- 1939k
/mnt/c/Users/quirosgu/Documents/GitHub/INVENTA/notebook
198a574e172443ad84f43e739ceea2c8.tsv
Celastraceae_Set_metadata_pos.tsv
Celastraceae_memomatrix.csv
Celastraceae_pos_quant.csv
Celastraceae_pos_spectral_match_results_repond.tsv
all_annotations
all_annotations.zip
canopus_npc_summary.tsv
GNPS job results were succesfully downloaded
FBMN job detected
   Number of spectral library annotations in job = 1836
   Number of network nodes in the job = 14970


## Prepare input files

### Metadata table

In [7]:
#metadata
metadata_df = pd.read_csv(metadata_filename, sep='\t')

# Build the species|organ key only when the metadata table does not already
# provide an ATTRIBUTE_Sppart column, so an existing column is never
# overwritten and the cell can always be run as part of "Run All".
if 'ATTRIBUTE_Sppart' not in metadata_df.columns:
    metadata_df['ATTRIBUTE_Sppart'] = metadata_df["ATTRIBUTE_Species"] + "|" + metadata_df["ATTRIBUTE_Organe"].map(str)
metadata_df.head(3)

Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem,ATTRIBUTE_Sppart
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground,Catha edulis|Leaves
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground,Catha edulis|Stems
2,LQ-01-61-03_pos.mzXML,V107696,Sample,Celastraceae,Catha,Catha edulis,Roots,roots,root tissue,belowground,Catha edulis|Roots


### Quantification table

In [9]:
# Quantitative table: read the feature table indexed by 'row ID' and hand it
# directly to the project's quant_table helper.
quant_df = quant_table(
    pd.read_csv(quantitative_data_filename, sep=',', index_col='row ID')
)
quant_df.head(3)

Unnamed: 0_level_0,LQ-01-61-01_pos.mzXML,LQ-01-61-02_pos.mzXML,LQ-01-61-03_pos.mzXML,LQ-01-61-04_pos.mzXML,LQ-01-61-05_pos.mzXML,LQ-01-61-06_pos.mzXML,LQ-01-61-07_pos.mzXML,LQ-01-61-08_pos.mzXML,LQ-01-61-09_pos.mzXML,LQ-01-61-10_pos.mzXML,...,LQ-01-61-69_pos.mzXML,LQ-01-61-70_pos.mzXML,LQ-01-61-71_pos.mzXML,LQ-01-61-72_pos.mzXML,LQ-01-61-73_pos.mzXML,LQ-01-61-74_pos.mzXML,LQ-01-61-75_pos.mzXML,LQ-01-61-76_pos.mzXML,LQ-01-61-77_pos.mzXML,LQ-01-61-78_pos.mzXML
row ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2600773000.0,31347670.0,84137200.0,69254580.0,0.0,0.0,0.0,0.0,3256476.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,886019600.0,19647810.0,8688867.0,2214498000.0,0.0,0.0,0.0,0.0,1639854.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,854458300.0,25941630.0,0.0,461216300.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Merge tables

In [10]:
# Combine the sample metadata and the quantification table into a single frame
# with the project helper full_data; the shape is printed as a quick check.
full_df = full_data(metadata_df,quant_df)
print(full_df.shape)
full_df.head(2)

(78, 14981)


Unnamed: 0,filename,ATTRIBUTE_Code,ATTRIBUTE_Type,ATTRIBUTE_Family,ATTRIBUTE_Genus,ATTRIBUTE_Species,ATTRIBUTE_Organe,ATTRIBUTE_Broad_organ,ATTRIBUTE_Tissue,ATTRIBUTE_Subsystem,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
0,LQ-01-61-01_pos.mzXML,V107694,Sample,Celastraceae,Catha,Catha edulis,Leaves,photosynthetic,green tissue,aboveground,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02_pos.mzXML,V107695,Sample,Celastraceae,Catha,Catha edulis,Stems,woody vegetative,woody tissue,aboveground,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Remove experimental controls

In [11]:
#erase all the blanks and QC's - Change the string as needed
list_of_strings_for_QC_Blank_filter = ['Blank', 'QC']
column_to_use_for_filtering = 'ATTRIBUTE_Type' #this information should be included in the metadata table

# The same filter is applied to the merged table and to the metadata table.
# NOTE(review): the shapes shown under this cell are presumably printed by the
# helper (before/after filtering) — confirm in src/inventa.py.
full_df = drop_samples_based_on_string(full_df,list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)
metadata_df = drop_samples_based_on_string(metadata_df,list_of_strings_for_QC_Blank_filter, column_to_use_for_filtering)

(78, 14981)
(76, 14981)
(77, 11)
(76, 11)


In [12]:
# Make a minimal table for further processing.
# In the saved preview the result is indexed by 'ATTRIBUTE_Sppart' and has one
# column per feature ID.
reduced_df = reduce_df(full_df, metadata_df, 'ATTRIBUTE_Sppart')
reduced_df.head(2)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,14961,14962,14963,14964,14965,14966,14967,14968,14969,14970
ATTRIBUTE_Sppart,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Catha edulis|Leaves,2600773000.0,886019600.0,854458300.0,632016100.0,528280000.0,522718400.0,512211000.0,393613100.0,472246300.0,234727500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Catha edulis|Stems,31347670.0,19647810.0,25941630.0,0.0,7494122.0,210707100.0,0.0,50182830.0,18593770.0,322292700.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature component (FC)

### FC. Feature Specificity

In [13]:
# Build the feature-specificity table from the reduced and quantification tables.
# NOTE(review): in the saved preview each row ID is paired with one filename and
# a Feature_specificity value, presumably a percentage — confirm in src/inventa.py.
specificity_df = top_ions(reduced_df, quant_df)
specificity_df.head(2)

Unnamed: 0,row ID,filename,ATTRIBUTE_Sppart,Feature_specificity
0,1,LQ-01-61-01_pos.mzXML,Catha edulis|Leaves,93.25882
1,2,LQ-01-61-04_pos.mzXML,Catha edulis|Aerial_parts,69.662011


### FC. Annotation Rate

In [14]:
# Both annotation tables are tab-separated and read with the same options;
# only the source file and the columns kept differ.
tsv_reader_options = dict(sep='\t', low_memory=False)
gnps_columns = ['cluster index','componentindex', 'SpectrumID']
tima_columns = ['libname','feature_id','short_inchikey']

annot_gnps_df = pd.read_csv(clusterinfosummary, usecols=gnps_columns, **tsv_reader_options)
annot_is_df = pd.read_csv(tima_results_filename, usecols=tima_columns, **tsv_reader_options)

# Merge the two annotation sources according to the flags from the config.
annotation_df = annotations(annot_gnps_df, annot_is_df, only_gnps_annotations, only_ms2_annotations)
annotation_df.head(2)

Unnamed: 0,cluster index,componentindex,Annotated_GNPS,annotation
0,1,113,0,0
1,2,136,0,0


In [25]:
# Compute the feature component (FC) from the specificity and annotation tables.
# NOTE(review): the saved run of this cell failed with
#   "TypeError: feature_component() takes 5 positional arguments but 6 were given",
# so this call and the signature in src/inventa.py are out of sync. Align them
# before relying on FC; the priority-rank cells further down need it.
FC = feature_component(specificity_df, annotation_df, metadata_df, FC_component, 
                        only_feature_specificity, min_specificity)
# Optional: uncomment to show the highest FC values first.
#FC = FC.sort_values(by=['FC'], ascending=False)
FC.head(2)

TypeError: feature_component() takes 5 positional arguments but 6 were given

# Literature component (LC)


In [28]:
# Compute the literature component (LC) and list the highest LC values first.
# NOTE(review): the saved run of this cell failed with
#   "TypeError: literature_component() takes 1 positional argument but 4 were given",
# so this call and the signature in src/inventa.py are out of sync. Align them
# before relying on LC; the priority-rank cells further down need it.
LC = literature_component(metadata_df, LC_component, min_comp_reported, max_comp_reported)
LC = LC.sort_values(by=['LC'], ascending=False)
LC.head(2)

TypeError: literature_component() takes 1 positional argument but 4 were given

# Similarity component (SC)

In [29]:
# Load the vectorized data used by the similarity component.
# In the saved preview the table has a 'filename' column followed by columns
# named 'peak@<value>'.
metric_df = pd.read_csv(vectorized_data_filename, sep=',', encoding= 'unicode_escape')
metric_df.head(3)

Unnamed: 0,filename,peak@53.04,peak@57.07,peak@77.04,peak@78.03,peak@83.09,peak@85.07,peak@91.05,peak@93.07,peak@95.05,...,peak@470.31,peak@185.87,peak@286.39,peak@655.05,peak@158.75,peak@386.73,peak@411.57,peak@611.62,peak@466.50,peak@123.63
0,LQ-01-61-01_pos.mzXML,45.0,246.0,23.0,66.0,239.0,147.0,230.0,293.0,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LQ-01-61-02_pos.mzXML,32.0,290.0,8.0,16.0,338.0,159.0,285.0,395.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LQ-01-61-03_pos.mzXML,31.0,259.0,8.0,35.0,317.0,131.0,301.0,468.0,131.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Compute the similarity component (SC).
# NOTE(review): metric_df[:-1] leaves out the last row of the table; the reason
# is not visible in this notebook — confirm that dropping it is intentional.
SC = similarity_component(metric_df[:-1], SC_component)
SC.head(2)



Unnamed: 0,filename,anomaly_IF,anomaly_LOF,anomaly_OCSVM,SC
0,LQ-01-61-01_pos.mzXML,1,1,1,0
1,LQ-01-61-02_pos.mzXML,1,1,1,0


##### Visualizing the similarity results

In [None]:
def _plot_anomaly_pcoa(group_col, title):
    """Draw the Bray-Curtis PCoA of the feature matrix, grouped by one anomaly column.

    Parameters
    ----------
    group_col : str
        Column of SC used to group the samples (e.g. 'anomaly_IF').
    title : str
        Title passed through to pcoa_2d.

    Returns
    -------
    Whatever pcoa_2d returns, so the last call in the cell still renders the
    same way as a direct pcoa_2d call would.
    """
    # The matrix is sliced on every call, exactly as the three original calls
    # did, so each plot receives its own fresh object.
    return pcoa_2d(
        matrix= metric_df[:-1],
        data = SC,
        metric= 'braycurtis',
        filename_col = 'filename',
        group_col=group_col,
        title=title
        )

# One plot per outlier-detection method; only the grouping column and the
# title change between them.
_plot_anomaly_pcoa('anomaly_IF', '"Isolation Forest & PCoA (bray-curtis) based on the feature matrix"')
_plot_anomaly_pcoa('anomaly_LOF', '"Local outlier factor & PCoA (bray-curtis) based on the feature matrix"')
_plot_anomaly_pcoa('anomaly_OCSVM', '"One-class support vector machine & PCoA (bray-curtis) based on the feature matrix"')

## Chemical classes from Sirius

### CC. Chemical class

In [None]:
# Load the CANOPUS NPC summary and discard every row with a missing value.
canopus_npc_df = pd.read_csv(canopus_npc_summary_filename, sep='\t')
canopus_npc_df = canopus_npc_df.dropna()

# Derive the chemical classes table with the project helper.
sirus_classes_df = sirius_classes(specificity_df, metadata_df,canopus_npc_df)
sirus_classes_df.head(2)

### CC. Search for reported chemical classes according to the species in the set

In [None]:
# Look up reported chemical classes using the metadata table.
# NOTE(review): the data source queried by search_reported_class is not visible
# in this notebook — see src/inventa.py.
reported_classes_df = search_reported_class(metadata_df)
reported_classes_df.head(2)

### CC. Computation of CC

In [None]:
# Compute the class component (CC) from the reported classes and the classes
# table produced in the "CC. Chemical class" cell.
CC = class_component(reported_classes_df, sirus_classes_df, CC_component)
CC.head(2)

# Priority rank Results

In [None]:
# Columns contributed by each component table; all joins use 'filename'.
lc_columns = ['filename', 'Reported_comp_Species', 'Reported_comp_Genus', 'LC', 'ATTRIBUTE_Family']
sc_columns = ['filename', 'SC']
cc_columns = ['filename', 'New_in_species', 'New_in_genus', 'CC']

# Start from FC and left-join the other components so every FC row is kept.
PR = (
    FC
    .merge(LC[lc_columns], how='left', on='filename')
    .merge(SC[sc_columns], how='left', on='filename')
    .merge(CC[cc_columns], how='left', on='filename')
)

# Rows without a CC value after the join count as 0.
PR['CC'] = PR['CC'].fillna(0)

In [None]:
#specify the weight to modulate each component 
w1 = 1
w2 = 1
w3 = 1
w4 = 1

PR['PR'] = w1*PR['FC'] + w2*PR['LC'] + w3*PR['SC'] + w4*PR['CC']
PR = PR.sort_values(by=['PR'], ascending=False)
PR.head()

### Display results

In [None]:
#Show the results in an interactive way
def selection_changed(selection):
    """Return the rows of PR at the positions given by the widget selection."""
    return PR.iloc[selection]
# The trailing semicolon suppresses the textual repr of the interact() result.
interact(selection_changed, selection=lineup_widget.LineUpWidget(PR));

In [None]:
# Save the ranked table as a tab-separated file (.tsv).
# The results folder is created first so the write does not fail on a fresh
# checkout where ../results does not exist yet.
os.makedirs('../results', exist_ok=True)
PR.to_csv('../results/INVENTA_results.tsv', sep='\t')