## notebook identifies characteristic peaks for carnitine compounds

- carnitines determined by annotation description
- output is a boxplot describing the characteristic peaks and their stats

In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px

### get peak data
- from v_get_peaks_files.ipynb

In [2]:
all_file_peaks_part_1 = pd.read_parquet('/home/jovyan/work/notebooks/outputs/all_file_peaks_part_1.gzip')

In [3]:
all_file_peaks_part_2 = pd.read_parquet('/home/jovyan/work/notebooks/outputs/all_file_peaks_part_2.gzip')

In [4]:
all_file_peaks_part_3 = pd.read_parquet('/home/jovyan/work/notebooks/outputs/all_file_peaks_part_3.gzip')

In [5]:
all_file_peaks_part_4 = pd.read_parquet('/home/jovyan/work/notebooks/outputs/all_file_peaks_part_4.gzip')

In [6]:
all_file_peaks_part_5 = pd.read_parquet('/home/jovyan/work/notebooks/outputs/all_file_peaks_part_5.gzip')

### get carnitine data
- from v_carnitines_M+H_name_df_for_manuscript.ipynb

In [7]:
carnitine_table = pd.read_csv('/home/jovyan/work/notebooks/outputs/library_df_carnitine_case_insen_M+H.csv',sep=',', index_col='spectrum_id')

In [8]:
carnitine_table_ID = carnitine_table.index.to_list()

### identify carnitine peak data

In [9]:
all_file_peaks_part_1_carnitine = all_file_peaks_part_1[all_file_peaks_part_1.index.isin(carnitine_table_ID)]

In [10]:
all_file_peaks_part_2_carnitine = all_file_peaks_part_2[all_file_peaks_part_2.index.isin(carnitine_table_ID)]

In [11]:
all_file_peaks_part_3_carnitine = all_file_peaks_part_3[all_file_peaks_part_3.index.isin(carnitine_table_ID)]

In [12]:
all_file_peaks_part_4_carnitine = all_file_peaks_part_4[all_file_peaks_part_4.index.isin(carnitine_table_ID)]

In [13]:
all_file_peaks_part_5_carnitine = all_file_peaks_part_5[all_file_peaks_part_5.index.isin(carnitine_table_ID)]

In [14]:
all_file_peaks_carnitine = pd.concat([all_file_peaks_part_1_carnitine, all_file_peaks_part_2_carnitine, all_file_peaks_part_3_carnitine,
                                      all_file_peaks_part_4_carnitine, all_file_peaks_part_5_carnitine], axis=0)

In [15]:
all_file_peaks_carnitine

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CCMSLIB00000216197,399442,0,2177.529053,0.056863,0.034625,73.961777,126.118223,200.080
CCMSLIB00000216197,399443,1,25.951593,0.000678,0.000413,74.977188,125.102812,200.080
CCMSLIB00000216197,399444,2,503.461060,0.013147,0.008006,84.048637,116.031363,200.080
CCMSLIB00000216197,399445,3,21725.027344,0.567317,0.345450,102.003937,98.076063,200.080
CCMSLIB00000216197,399446,4,37.872719,0.000989,0.000602,158.435135,41.644865,200.080
...,...,...,...,...,...,...,...,...
CCMSLIB00000221337,35834,3,473.000000,0.473473,0.229500,103.041199,59.071801,162.113
CCMSLIB00000221337,35835,4,999.000000,1.000000,0.484716,162.113007,-0.000007,162.113
CCMSLIB00000221529,36687,0,999.000000,1.000000,0.676371,85.029900,119.094100,204.124
CCMSLIB00000221529,36688,1,141.000000,0.141141,0.095464,145.053894,59.070106,204.124


In [16]:
len(all_file_peaks_carnitine.index.unique().to_list())

387

### begin investigating for characteristic peaks

In [17]:
peak_df = all_file_peaks_carnitine

In [18]:
# Identifying small bins of MZ values
peak_df['mz_binned_small'] = peak_df['mz'].round(decimals = 2)
unique_mz_binned = peak_df['mz_binned_small'].unique()

In [19]:
peak_df.head(5)

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00000216197,399442,0,2177.529053,0.056863,0.034625,73.961777,126.118223,200.08,73.96
CCMSLIB00000216197,399443,1,25.951593,0.000678,0.000413,74.977188,125.102812,200.08,74.98
CCMSLIB00000216197,399444,2,503.46106,0.013147,0.008006,84.048637,116.031363,200.08,84.05
CCMSLIB00000216197,399445,3,21725.027344,0.567317,0.34545,102.003937,98.076063,200.08,102.0
CCMSLIB00000216197,399446,4,37.872719,0.000989,0.000602,158.435135,41.644865,200.08,158.44


In [20]:
# Defining parameters

intensitynormmin  = 0.05
percentoccurmin = 20

In [21]:
# identifying peaks that satisfy minimum normalized intensity parameter

filtered_peak_df_i_norm = peak_df[peak_df["i_norm"] >= intensitynormmin]

In [22]:
filtered_peak_df_i_norm

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00000216197,399442,0,2177.529053,0.056863,0.034625,73.961777,126.118223,200.080,73.96
CCMSLIB00000216197,399445,3,21725.027344,0.567317,0.345450,102.003937,98.076063,200.080,102.00
CCMSLIB00000216197,399448,6,38294.347656,1.000000,0.608919,298.146057,-98.066057,200.080,298.15
CCMSLIB00000216199,399449,0,3840.893799,0.124821,0.110233,90.911133,136.038867,226.950,90.91
CCMSLIB00000216199,399452,3,30771.181641,1.000000,0.883131,158.885483,68.064517,226.950,158.89
...,...,...,...,...,...,...,...,...,...
CCMSLIB00000221337,35834,3,473.000000,0.473473,0.229500,103.041199,59.071801,162.113,103.04
CCMSLIB00000221337,35835,4,999.000000,1.000000,0.484716,162.113007,-0.000007,162.113,162.11
CCMSLIB00000221529,36687,0,999.000000,1.000000,0.676371,85.029900,119.094100,204.124,85.03
CCMSLIB00000221529,36688,1,141.000000,0.141141,0.095464,145.053894,59.070106,204.124,145.05


In [23]:
len(filtered_peak_df_i_norm.index.unique())

387

In [24]:
# For counting percent occurrence of peaks above miniumum intensity
occurs_above_intensitynormmin = {}

# Total number of spectral IDs
total_ids = len(peak_df.index.unique())

for peak in unique_mz_binned:
    mz_df_above_intensitynormmin = filtered_peak_df_i_norm.loc[(filtered_peak_df_i_norm['mz_binned_small'] == peak)]

    # Number of spectra where peak occurs above miniumum intensity
    peak_occurs_above_intensitynormmin = len(mz_df_above_intensitynormmin)

    if peak_occurs_above_intensitynormmin/total_ids >= (percentoccurmin/100):
        occurs_above_intensitynormmin[peak] = peak_occurs_above_intensitynormmin/total_ids

In [25]:
# Filtering to only include peaks that are present in at least 20% of the scans
filtered_peak_df = filtered_peak_df_i_norm[filtered_peak_df_i_norm["mz_binned_small"].isin(occurs_above_intensitynormmin.keys())]

In [26]:
filtered_peak_df

Unnamed: 0_level_0,level_0,index,i,i_norm,i_tic_norm,mz,mz_nl,precmz,mz_binned_small
scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CCMSLIB00000577974,8496,3,5.706804e+06,0.284907,0.151077,60.081528,86.038472,146.120,60.08
CCMSLIB00000577982,9113,6,5.514857e+06,0.107815,0.081476,60.081532,102.028468,162.110,60.08
CCMSLIB00000577990,9792,13,3.082949e+07,0.105989,0.077327,60.081486,284.198514,344.280,60.08
CCMSLIB00000577990,9806,27,2.908738e+08,1.000000,0.729578,85.028687,259.251313,344.280,85.03
CCMSLIB00003134527,5818,3,5.757238e+05,1.000000,0.561997,85.028221,259.250779,344.279,85.03
...,...,...,...,...,...,...,...,...,...
CCMSLIB00000221015,34821,3,3.950000e+02,0.395395,0.124606,60.081902,102.031098,162.113,60.08
CCMSLIB00000221015,34823,5,9.990000e+02,1.000000,0.315142,85.028702,77.084298,162.113,85.03
CCMSLIB00000221337,35831,0,1.320000e+02,0.132132,0.064047,60.082901,102.030099,162.113,60.08
CCMSLIB00000221337,35832,1,2.600000e+02,0.260260,0.126152,85.030602,77.082398,162.113,85.03


In [27]:
filtered_peak_df.index.unique()

Index(['CCMSLIB00000577974', 'CCMSLIB00000577982', 'CCMSLIB00000577990',
       'CCMSLIB00003134527', 'CCMSLIB00003134539', 'CCMSLIB00003134832',
       'CCMSLIB00003134907', 'CCMSLIB00003135237', 'CCMSLIB00003135497',
       'CCMSLIB00003135535',
       ...
       'CCMSLIB00005738761', 'CCMSLIB00005738771', 'CCMSLIB00005738777',
       'CCMSLIB00005738780', 'CCMSLIB00005720412', 'CCMSLIB00005720421',
       'CCMSLIB00000221013', 'CCMSLIB00000221015', 'CCMSLIB00000221337',
       'CCMSLIB00000221529'],
      dtype='object', name='scan', length=259)

In [28]:
# for plotting
peak_ratio_df = pd.DataFrame.from_dict(occurs_above_intensitynormmin, orient='index')
peak_ratio_df.index.name = 'mz_binned_small'
peak_ratio_df = peak_ratio_df.rename(columns={0: "percent_occurrence"})

In [29]:
peak_ratio_df

Unnamed: 0_level_0,percent_occurrence
mz_binned_small,Unnamed: 1_level_1
60.08,0.475452
85.03,0.581395


### percent occurrence check

In [30]:
len(filtered_peak_df_i_norm[(filtered_peak_df_i_norm['mz_binned_small'] == 85.03)].index.unique())/len(peak_df.index.unique())

0.5581395348837209

In [31]:
len(filtered_peak_df_i_norm[(filtered_peak_df_i_norm['mz_binned_small'] == 60.08)].index.unique())/len(peak_df.index.unique())

0.4754521963824289

In [32]:
with open('/home/jovyan/work/notebooks/outputs/output_box_peak_carnitine_M+H_df.html', 'w') as f:

        ax = px.box(filtered_peak_df, x='mz_binned_small',y = 'i_norm')

        # Use hidden line below to graph ratio of MISSING peaks
        #ax.add_bar(x=peak_ratio_df.index, y=-peak_ratio_df["ratio of missing peaks"], name = "ratio of spectra with missing peaks")
        
        ax.add_bar(x=peak_ratio_df.index, y=-peak_ratio_df["percent_occurrence"], name = "ratio of spectra where at least " + str(percentoccurmin)+"% contain peak")
        
        ax.update_layout(title_text="MS/MS Peak Intensity Distribution for " +"Carnitines")
        
        f.write(ax.to_html(full_html=False, include_plotlyjs='cdn'))