In [7]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import grequests
import urllib.parse
from tqdm import tqdm
import os 

In [2]:
df_meta = pd.read_csv(
    '/mnt/c/Users/quirosgu/Documents/GitHub/inventa/data_loc/LotusDB_inhouse_metadata.csv',
    sep=",")

In [3]:
df_meta.head(2)

Unnamed: 0.1,Unnamed: 0,structure_taxonomy_npclassifier_03class,organism_name,organism_taxonomy_06family,organism_taxonomy_08genus,organism_taxonomy_09species,Reported_comp_Family,Reported_comp_Genus,Reported_comp_Species
0,0,Cholestane steroids,Cerastoderma edule,Cardiidae,Cerastoderma,Cerastoderma edule,4,2,1
1,1,Cholestane steroids,Mya arenaria,Myidae,Mya,Mya arenaria,1,1,1


In [15]:
repository_path = '/mnt/c/Users/quirosgu/Desktop/Celastraceae/Scientific_data/'
canopus_table_path = '/mnt/c/Users/quirosgu/Desktop/Celastraceae/Scientific_data/POS_canopus_formula_summary_adducts.tsv' 
annotations_table_path = '/mnt/c/Users/quirosgu/Desktop/Celastraceae/Scientific_data/POS_compound_identifications_adducts.tsv'

min_ZodiacScore = 0.9             #cut-off filter for considering a sirius annotation valable. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.0         #cut-off filter for considering a sirius annotation valable. '0.0' as default.
min_class_confidence = 0.8       #cut-off filter for considering a sirius class valable. It is used in combination with min_recurrence.



In [9]:
def sunburst_count_plotter(repository_path, canopus_df, organism):
            
    canopus_df = canopus_df[['row ID','NPC#pathway', 'NPC#superclass', 'NPC#class']]
    canopus_df = canopus_df.replace({np.nan:'None'})

    #generate treemap        
    fig1 = px.treemap(canopus_df, path=['NPC#pathway', 'NPC#superclass', 'NPC#class'],
                    color='NPC#pathway',
                    color_discrete_map={
                        'Terpenoids':'#44AA99',
                        'Alkaloids': '#88CCEE',
                        'Amino acids and Peptides': '#DDCC77',
                        'Polyketides': '#CC6677',
                        'Shikimates and Phenylpropanoids': '#AA4499',
                        'Fatty acids': '#882255',
                        'Carbohydrates': '#F4A261',})
    fig1.update_layout(margin = dict(t=50, l=25, r=25, b=25),
    title_text=" ("  +  organism + ") " + "- metabolite annotation overview (size proportional to number of annotations)")
    fig1.update_annotations(font_size=18, font_family="sans-serif")
    

    #generate sunburst

    fig2 = px.sunburst(canopus_df, path=['NPC#pathway', 'NPC#superclass', 'NPC#class'],
                    color='NPC#pathway',
                    color_discrete_map={
                        'Terpenoids':'#44AA99',
                        'Alkaloids': '#88CCEE',
                        'Amino acids and Peptides': '#DDCC77',
                        'Polyketides': '#CC6677',
                        'Shikimates and Phenylpropanoids': '#AA4499',
                        'Fatty acids': '#882255',
                        'Carbohydrates': '#F4A261',})
    fig2.update_layout(margin = dict(t=50, l=25, r=25, b=25),
    title_text= " ("  +  organism + ") " + "- metabolite annotation overview (size proportional to number of annotations)")
    fig2.update_annotations(font_size=18, font_family="sans-serif")
    
    path = os.path.normpath(repository_path)
    pathout = os.path.join(path, 'results/')
    os.makedirs(pathout, exist_ok=True)
    pathout_treemap = os.path.join(pathout, 'treemap_pos.html')
    pathout_sunburst = os.path.join(pathout, 'sunburst_pos.html')
    fig1.write_html(pathout_treemap)
    fig2.write_html(pathout_sunburst)

# CANOPUS

In [16]:
canopus_df = pd.read_csv(canopus_table_path, sep='\t')
canopus_df = canopus_df[['id', 'molecularFormula', 'adduct', 'NPC#pathway',
       'NPC#pathway Probability', 'NPC#superclass',
       'NPC#superclass Probability', 'NPC#class', 'NPC#class Probability']]
canopus_df.rename(columns={'NPC#class Probability': 'classProbability'}, inplace=True) 
canopus_df['shared name'] = canopus_df['id'].str.split('_').str[-1].astype(int)
canopus_df.drop('id', axis=1, inplace=True)
canopus_df.rename(columns={'shared name': 'row ID', 'adduct': 'adduct (sirius)', 'molecularFormula': 'MF (sirius)', 'name': 'Compound name (sirius)'}, inplace=True) 
canopus_df.drop(canopus_df[canopus_df.classProbability > min_class_confidence].index, inplace=True)
canopus_df.drop(['classProbability', 'NPC#superclass Probability', 'NPC#pathway Probability'], axis=1, inplace=True)

#aggregate features 
agg_func = {'adduct (sirius)': set, 'MF (sirius)': 'first', 'NPC#pathway': 'first', 'NPC#superclass': 'first', 'NPC#class': 'first'}
canopus_df = canopus_df.groupby('row ID', as_index=False).agg(agg_func)

In [17]:
#total number of unique molecular formulas
canopus_df.shape[0]

9990

In [22]:
dfg = canopus_df.groupby(['NPC#pathway']).count()
dfg

Unnamed: 0_level_0,row ID,adduct (sirius),MF (sirius),NPC#superclass,NPC#class
NPC#pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alkaloids,1347,1347,1347,1347,1347
Amino acids and Peptides,482,482,482,482,482
Carbohydrates,97,97,97,97,97
Fatty acids,1269,1269,1269,1269,1269
Polyketides,761,761,761,761,761
Shikimates and Phenylpropanoids,726,726,726,726,726
Terpenoids,5305,5305,5305,5305,5305


In [18]:
sample_dir='Set Celastraceae'
organism='positive ionization mode'
sunburst_count_plotter(repository_path, canopus_df, organism)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



# CSI FIngerID annotations

In [32]:
dfa = pd.read_csv(annotations_table_path, sep='\t')
dfa['shared name'] = dfa['id'].str.split('_').str[-1].astype(int)
#drop duplicate annotations (for the same feature)
dfa = dfa.drop_duplicates(subset='shared name', keep="first")
dfa.head()

Unnamed: 0,rank,formulaRank,#adducts,#predictedFPs,ConfidenceScore,CSI:FingerIDScore,ZodiacScore,SiriusScore,molecularFormula,adduct,...,name,smiles,xlogp,pubchemids,links,dbflags,ionMass,retentionTimeInSeconds,id,shared name
0,1,1,1,1,0.999987,-121.665111,1.0,79.302958,C34H41NO17,[M + H]+,...,,CC(=O)OCC12C(C(C3C(C14C(C(C(=O)C2OC(=O)C)C(O4)...,-1.4,21602036;101335369,COCONUT:(CNP0194230);KNApSAcK:(43389);Natural ...,3178514,736.24436,154.510876,13643_2204_SIRIUS_5_5_3_ULT_v3_Celestraceae_PO...,14655
1,2,1,1,2,0.999986,-121.590303,0.964322,36.130446,C40H51NO19,[M + H]+,...,,CC1C(C(=O)OC2C(C(C3(C(C(C4C(C3(C2(C)O)OC4(COC(...,0.4,21586642;73806769;101290208,COCONUT:(CNP0266907);KNApSAcK:(13111);Natural ...,3178514,850.312727,236.3095,13857_2204_SIRIUS_5_5_3_ULT_v3_Celestraceae_PO...,14876
2,3,1,1,2,0.999985,-120.667814,0.956281,37.762325,C40H51NO19,[M + H]+,...,,CC1C(C(=O)OC2C(C(C3(C(C(C4C(C3(C2(C)O)OC4(COC(...,0.4,21586642;73806769;101290208,COCONUT:(CNP0266907);KNApSAcK:(13111);Natural ...,3178514,850.31259,210.474,13858_2204_SIRIUS_5_5_3_ULT_v3_Celestraceae_PO...,14877
3,4,1,2,2,0.99998,-66.263006,1.0,33.253204,C39H43NO16,[M + H]+,...,Pyridine alkaloids,CC(=O)OCC12C(OC(C)=O)C(=O)C3C(O)C14OC3(C)COC(=...,1.236607,,2204_NatProd_UNIGE_5_4_1_v2:(Pyridine alkaloids),0,782.262993,221.095998,13746_2204_SIRIUS_5_5_3_ULT_v3_Celestraceae_PO...,14762
5,6,1,1,2,0.99998,-128.933851,0.976884,16.41499,C40H51NO19,[M + Na]+,...,,CC1C(C(=O)OC2C(C(C3(C(C(C4C(C3(C2(C)O)OC4(COC(...,0.4,21586642;73806769;101290208,COCONUT:(CNP0266907);KNApSAcK:(13111);Natural ...,3178514,872.294447,236.598748,13891_2204_SIRIUS_5_5_3_ULT_v3_Celestraceae_PO...,14910


In [33]:
#total number of unique annotations
dfa.shape[0]

12478

In [22]:
#recover 1 candidate after repond
repond_path = '/mnt/c/Users/quirosgu/Desktop/Celastraceae/Scientific Data/Data to upload/Celast_pos_repond.tsv'
repond_df = pd.read_csv(repond_path, sep='\t')

df = repond_df[['feature_id', 'structure_name',
       'structure_molecular_formula', 'library',
       'structure_smiles_2D', 'structure_inchikey_2D','score_final', 'rank_final', 'best_candidate_organism', 'best_candidate']]

df['structure_name_1']=df['structure_name'].str.split('|').str[1].astype(str)
df['structure_molecular_formula_1']=df['structure_molecular_formula'].str.split('|').str[1].astype(str)
df['library_1']=df['library'].str.split('|').str[1].astype(str)
df['structure_smiles_2D_1']=df['structure_smiles_2D'].str.split('|').str[1].astype(str)
df['structure_inchikey_2D_1']=df['structure_inchikey_2D'].str.split('|').str[1].astype(str)
df['score_final_1']=df['score_final'].str.split('|').str[1]#.astype(int)
#df['rank_final_1']=df['rank_final'].str.split('|').str[1].astype(int)
df['best_candidate_organism_1']=df['best_candidate_organism'].str.split('|').str[1].astype(str)
df['best_candidate_1']=df['best_candidate'].str.split('|').str[1].astype(str)


#fill NAN for existent candidates
# df['column_A'].fillna(df['column_B'], inplace=True)
#df['structure_name_1']= df['structure_name_1'].fillna(repond_df['structure_name'])
#df['structure_molecular_formula_1']= df['structure_molecular_formula_1'].fillna(repond_df['structure_molecular_formula'])
#df['library_1']= df['library_1'].fillna(repond_df['library'])
#df['structure_smiles_2D_1']= df['structure_smiles_2D_1'].fillna(repond_df['structure_smiles_2D'])
#df['structure_inchikey_2D_1']= df['structure_inchikey_2D_1'].fillna(repond_df['structure_inchikey_2D'])
#df['best_candidate_organism_1']= df['best_candidate_organism_1'].fillna(repond_df['best_candidate_organism'])
#df['best_candidate_1']= df['best_candidate_1'].fillna(repond_df['best_candidate'])

df = df[['feature_id', 'structure_name_1','score_final_1',
       'structure_molecular_formula_1', 'library_1',
       'structure_smiles_2D_1', 'structure_inchikey_2D_1', 'best_candidate_organism_1', 'best_candidate_1']]

#merge with the original table
repond_df = pd.merge(repond_df, df, how='left', on= 'feature_id')
repond_df.to_csv('/mnt/c/Users/quirosgu/Desktop/Celastraceae/Scientific Data/Data to upload/Celast_pos_repond_top1.tsv', sep='\t')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['structure_name_1']=df['structure_name'].str.split('|').str[1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['structure_molecular_formula_1']=df['structure_molecular_formula'].str.split('|').str[1].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['library_1']=df['lib