## This notebook identifies characteristic peaks for all dihydroxy compounds

- Dihydroxy ("di") compounds are determined by substructure search
- The output is a box plot describing the characteristic peaks and their statistics

In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px

### get peak data
- from v_get_peaks_files.ipynb

In [2]:
# Peak table, part 1 of 5 (parquet data stored with a .gzip suffix)
all_file_peaks_part_1 = pd.read_parquet(
    path='/home/jovyan/work/notebooks/outputs/all_file_peaks_part_1.gzip'
)

In [3]:
# Peak table, part 2 of 5 (parquet data stored with a .gzip suffix)
all_file_peaks_part_2 = pd.read_parquet(
    path='/home/jovyan/work/notebooks/outputs/all_file_peaks_part_2.gzip'
)

In [4]:
# Peak table, part 3 of 5 (parquet data stored with a .gzip suffix)
all_file_peaks_part_3 = pd.read_parquet(
    path='/home/jovyan/work/notebooks/outputs/all_file_peaks_part_3.gzip'
)

In [5]:
# Peak table, part 4 of 5 (parquet data stored with a .gzip suffix)
all_file_peaks_part_4 = pd.read_parquet(
    path='/home/jovyan/work/notebooks/outputs/all_file_peaks_part_4.gzip'
)

In [6]:
# Peak table, part 5 of 5 (parquet data stored with a .gzip suffix)
all_file_peaks_part_5 = pd.read_parquet(
    path='/home/jovyan/work/notebooks/outputs/all_file_peaks_part_5.gzip'
)

### get di data
- from v_BA_M+H_substruct_df_for_manuscript.ipynb

In [7]:
# Table of dihydroxy library matches, indexed by its 'spectrum_id' column
di_table = pd.read_csv(
    '/home/jovyan/work/notebooks/outputs/library_df_dihydroxy_BA_only_m+h_matches.csv',
    sep=',',
    index_col='spectrum_id',
)

In [8]:
# Spectrum IDs of the dihydroxy table, as a plain Python list
di_table_ID = di_table.index.tolist()

### identify di peak data

In [9]:
# Keep only the rows of part 1 whose index value is one of the dihydroxy IDs
is_di_part_1 = all_file_peaks_part_1.index.isin(di_table_ID)
all_file_peaks_part_1_di = all_file_peaks_part_1.loc[is_di_part_1]

In [10]:
# Keep only the rows of part 2 whose index value is one of the dihydroxy IDs
is_di_part_2 = all_file_peaks_part_2.index.isin(di_table_ID)
all_file_peaks_part_2_di = all_file_peaks_part_2.loc[is_di_part_2]

In [11]:
# Keep only the rows of part 3 whose index value is one of the dihydroxy IDs
is_di_part_3 = all_file_peaks_part_3.index.isin(di_table_ID)
all_file_peaks_part_3_di = all_file_peaks_part_3.loc[is_di_part_3]

In [12]:
# Keep only the rows of part 4 whose index value is one of the dihydroxy IDs
is_di_part_4 = all_file_peaks_part_4.index.isin(di_table_ID)
all_file_peaks_part_4_di = all_file_peaks_part_4.loc[is_di_part_4]

In [13]:
# Keep only the rows of part 5 whose index value is one of the dihydroxy IDs
is_di_part_5 = all_file_peaks_part_5.index.isin(di_table_ID)
all_file_peaks_part_5_di = all_file_peaks_part_5.loc[is_di_part_5]

In [14]:
# Stack the five filtered parts row-wise into a single dihydroxy peak table
di_parts = [
    all_file_peaks_part_1_di,
    all_file_peaks_part_2_di,
    all_file_peaks_part_3_di,
    all_file_peaks_part_4_di,
    all_file_peaks_part_5_di,
]
all_file_peaks_di = pd.concat(di_parts, axis=0)

In [15]:
# Preview the combined dihydroxy peak table (one row per peak, indexed by spectrum ID)
all_file_peaks_di

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CCMSLIB00010012258,1927,0,292.0,0.064673,0.013038,145.099136,558.430864,703.530
CCMSLIB00010012258,1928,1,90.0,0.019934,0.004019,146.101257,557.428743,703.530
CCMSLIB00010012258,1929,2,341.0,0.075526,0.015226,147.116302,556.413698,703.530
CCMSLIB00010012258,1930,3,509.0,0.112735,0.022727,149.131912,554.398088,703.530
CCMSLIB00010012258,1931,4,116.0,0.025692,0.005179,150.134186,553.395814,703.530
...,...,...,...,...,...,...,...,...
CCMSLIB00000222745,41152,2,303.0,0.303303,0.110544,343.264099,48.020901,391.285
CCMSLIB00000222745,41153,3,726.0,0.726727,0.264867,345.280487,46.004513,391.285
CCMSLIB00000222745,41154,4,177.0,0.177177,0.064575,347.296692,43.988308,391.285
CCMSLIB00000222745,41155,5,156.0,0.156156,0.056914,355.264801,36.020199,391.285


In [16]:
# Number of distinct spectrum IDs present in the combined table
len(all_file_peaks_di.index.unique())

704

### begin investigating for characteristic peaks

In [17]:
# Work on an independent copy: columns assigned to peak_df afterwards (e.g.
# 'mz_binned_small') would otherwise be written into all_file_peaks_di too,
# because a plain assignment only creates a second name for the same frame.
peak_df = all_file_peaks_di.copy()

In [18]:
# Bin m/z values by rounding to two decimals, then collect the distinct bins
mz_rounded = peak_df['mz'].round(2)
peak_df['mz_binned_small'] = mz_rounded
unique_mz_binned = peak_df['mz_binned_small'].unique()

In [19]:
# First five rows, to confirm the new 'mz_binned_small' column
peak_df.head()

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00010012258,1927,0,292.0,0.064673,0.013038,145.099136,558.430864,703.53,145.1
CCMSLIB00010012258,1928,1,90.0,0.019934,0.004019,146.101257,557.428743,703.53,146.1
CCMSLIB00010012258,1929,2,341.0,0.075526,0.015226,147.116302,556.413698,703.53,147.12
CCMSLIB00010012258,1930,3,509.0,0.112735,0.022727,149.131912,554.398088,703.53,149.13
CCMSLIB00010012258,1931,4,116.0,0.025692,0.005179,150.134186,553.395814,703.53,150.13


In [20]:
# Thresholds used by the filters below
intensitynormmin = 0.05  # minimum value of 'i_norm' for a peak to be kept
percentoccurmin = 20     # minimum occurrence of a bin, in percent of all spectra

In [21]:
# Keep the peaks whose normalized intensity reaches the minimum threshold
meets_intensity_min = peak_df["i_norm"].ge(intensitynormmin)
filtered_peak_df_i_norm = peak_df.loc[meets_intensity_min]

In [22]:
# Preview the peaks that passed the minimum normalized-intensity filter
filtered_peak_df_i_norm

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00010012258,1927,0,292.0,0.064673,0.013038,145.099136,558.430864,703.530,145.10
CCMSLIB00010012258,1929,2,341.0,0.075526,0.015226,147.116302,556.413698,703.530,147.12
CCMSLIB00010012258,1930,3,509.0,0.112735,0.022727,149.131912,554.398088,703.530,149.13
CCMSLIB00010012258,1935,8,259.0,0.057364,0.011565,159.116196,544.413804,703.530,159.12
CCMSLIB00010012258,1938,11,507.0,0.112292,0.022638,161.130783,542.399217,703.530,161.13
...,...,...,...,...,...,...,...,...,...
CCMSLIB00000222745,41152,2,303.0,0.303303,0.110544,343.264099,48.020901,391.285,343.26
CCMSLIB00000222745,41153,3,726.0,0.726727,0.264867,345.280487,46.004513,391.285,345.28
CCMSLIB00000222745,41154,4,177.0,0.177177,0.064575,347.296692,43.988308,391.285,347.30
CCMSLIB00000222745,41155,5,156.0,0.156156,0.056914,355.264801,36.020199,391.285,355.26


In [23]:
# Number of distinct spectrum IDs that still have at least one peak after the intensity filter
filtered_peak_df_i_norm.index.unique().size

704

In [24]:
# Fraction of spectra in which each m/z bin occurs above the minimum intensity.
# Result: occurs_above_intensitynormmin maps bin -> fraction (0..1) and only
# contains bins whose fraction reaches percentoccurmin percent.
occurs_above_intensitynormmin = {}

# Total number of spectral IDs
total_ids = len(peak_df.index.unique())

# Number of DISTINCT spectra (index values) per bin among the peaks above the
# minimum intensity. Counting rows instead would count a spectrum twice when
# two of its peaks round into the same bin, inflating the occurrence.
# Computed once for all bins rather than re-filtering the frame per bin.
spectra_per_bin = (
    pd.Series(filtered_peak_df_i_norm.index.to_numpy())
    .groupby(filtered_peak_df_i_norm['mz_binned_small'].to_numpy())
    .nunique(dropna=False)
)

# Iterate in the order of unique_mz_binned so the dict keeps the same key order
for peak in unique_mz_binned:
    # Number of spectra where peak occurs above minimum intensity
    # (0 for bins that have no peak left after the intensity filter)
    peak_occurs_above_intensitynormmin = int(spectra_per_bin.get(peak, 0))

    if peak_occurs_above_intensitynormmin/total_ids >= (percentoccurmin/100):
        occurs_above_intensitynormmin[peak] = peak_occurs_above_intensitynormmin/total_ids

In [25]:
# Keep only the peaks whose bin passed the minimum-occurrence threshold
# (i.e. the bins recorded as keys of occurs_above_intensitynormmin)
retained_bins = list(occurs_above_intensitynormmin)
filtered_peak_df = filtered_peak_df_i_norm.loc[
    filtered_peak_df_i_norm["mz_binned_small"].isin(retained_bins)
]

In [26]:
# Preview the peaks that passed both the intensity and the occurrence filter
filtered_peak_df

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00010012258,1927,0,292.0,0.064673,0.013038,145.099136,558.430864,703.530,145.10
CCMSLIB00010012258,1929,2,341.0,0.075526,0.015226,147.116302,556.413698,703.530,147.12
CCMSLIB00010012258,1930,3,509.0,0.112735,0.022727,149.131912,554.398088,703.530,149.13
CCMSLIB00010012258,1935,8,259.0,0.057364,0.011565,159.116196,544.413804,703.530,159.12
CCMSLIB00010012258,1938,11,507.0,0.112292,0.022638,161.130783,542.399217,703.530,161.13
...,...,...,...,...,...,...,...,...,...
CCMSLIB00005738860,296223,21,50.0,0.342466,0.013144,187.146805,220.132195,407.279,187.15
CCMSLIB00005738860,296228,26,98.0,0.671233,0.025762,199.145798,208.133202,407.279,199.15
CCMSLIB00005738860,296229,27,68.0,0.465753,0.017876,201.162003,206.116997,407.279,201.16
CCMSLIB00005738860,296236,34,76.0,0.520548,0.019979,229.158707,178.120293,407.279,229.16


In [27]:
# Distinct spectrum IDs that retain at least one peak after both filters
filtered_peak_df.index.unique()

Index(['CCMSLIB00010012258', 'CCMSLIB00010012643', 'CCMSLIB00010012658',
       'CCMSLIB00010012738', 'CCMSLIB00003136511', 'CCMSLIB00003137882',
       'CCMSLIB00003138533', 'CCMSLIB00003138823', 'CCMSLIB00003138870',
       'CCMSLIB00003139149',
       ...
       'CCMSLIB00006542617', 'CCMSLIB00006542675', 'CCMSLIB00005738045',
       'CCMSLIB00005738052', 'CCMSLIB00005738117', 'CCMSLIB00005738164',
       'CCMSLIB00005738654', 'CCMSLIB00005738719', 'CCMSLIB00005738832',
       'CCMSLIB00005738860'],
      dtype='object', name='scan', length=458)

In [28]:
# Table of occurrence fractions for plotting: one row per retained bin,
# index named 'mz_binned_small', single column 'percent_occurrence'
peak_ratio_df = (
    pd.DataFrame.from_dict(occurs_above_intensitynormmin, orient='index')
    .rename(columns={0: "percent_occurrence"})
    .rename_axis('mz_binned_small')
)

In [29]:
# Preview the per-bin occurrence table (values are fractions of all spectra)
peak_ratio_df

Unnamed: 0_level_0,percent_occurrence
mz_binned_small,Unnamed: 1_level_1
145.1,0.254261
147.12,0.430398
149.13,0.411932
159.12,0.399148
161.13,0.535511
175.15,0.444602
177.13,0.330966
185.13,0.211648
187.15,0.203125
201.16,0.325284


In [30]:
# Five bins with the highest occurrence fraction
peak_ratio_df.sort_values(by='percent_occurrence', ascending=False).head()

Unnamed: 0_level_0,percent_occurrence
mz_binned_small,Unnamed: 1_level_1
339.27,0.59233
321.26,0.571023
215.18,0.544034
161.13,0.535511
175.15,0.444602


### percent occurrence check

In [31]:
# Fraction of all spectra with an above-threshold peak in the 339.27 bin
spectra_with_peak = filtered_peak_df_i_norm.loc[
    filtered_peak_df_i_norm['mz_binned_small'].eq(339.27)
].index.unique()
len(spectra_with_peak) / len(peak_df.index.unique())

0.5923295454545454

In [32]:
# Fraction of all spectra with an above-threshold peak in the 215.18 bin
spectra_with_peak = filtered_peak_df_i_norm.loc[
    filtered_peak_df_i_norm['mz_binned_small'].eq(215.18)
].index.unique()
len(spectra_with_peak) / len(peak_df.index.unique())

0.5440340909090909

In [37]:
# Fraction of all spectra with an above-threshold peak in the 321.26 bin
spectra_with_peak = filtered_peak_df_i_norm.loc[
    filtered_peak_df_i_norm['mz_binned_small'].eq(321.26)
].index.unique()
len(spectra_with_peak) / len(peak_df.index.unique())

0.6741935483870968

In [33]:
# Box plot of normalized intensity per retained m/z bin, with the occurrence
# fraction of each bin drawn as bars below the axis, exported as an HTML fragment.
#
# The figure is built BEFORE the output file is opened: opening with 'w'
# truncates the file immediately, so an error while plotting would otherwise
# wipe out a previously exported plot.
ax = px.box(filtered_peak_df, x='mz_binned_small', y='i_norm')

# Use hidden line below to graph ratio of MISSING peaks
# (needs a "ratio of missing peaks" column, which peak_ratio_df as built in this notebook does not have)
#ax.add_bar(x=peak_ratio_df.index, y=-peak_ratio_df["ratio of missing peaks"], name = "ratio of spectra with missing peaks")

# Occurrence values are negated so the bars extend downwards, away from the boxes
ax.add_bar(x=peak_ratio_df.index, y=-peak_ratio_df["percent_occurrence"],
           name = "ratio of spectra where at least " + str(percentoccurmin)+"% contain peak",
           width = 1)

ax.update_layout(title_text="MS/MS Peak Intensity Distribution for " +"All Dihydroxy BA")

# Explicit UTF-8 so the export does not depend on the platform's default encoding
with open('/home/jovyan/work/notebooks/outputs/output_box_peak_all_di_M+H_df.html', 'w', encoding='utf-8') as f:
    f.write(ax.to_html(full_html=False, include_plotlyjs='cdn'))