In [1]:
import pandas as pd
import pathlib
import numpy as np

## Load Data

In [2]:
# Directory that holds the raw input files, relative to the notebook's working directory.
raw_data_dir = pathlib.Path("../../data/raw/ms_work/")

# Read the transition-results CSV and lower-case its column names so that
# later cells can address them uniformly (e.g. 'precursor mz', 'area').
trans_res_path = raw_data_dir / 'BSA_Oval_2023_06_12_Transition Results.csv'
trans_res_df_raw = pd.read_csv(trans_res_path)
trans_res_df_raw.columns = [col_name.lower() for col_name in trans_res_df_raw.columns]
trans_res_df_raw.head()

Unnamed: 0,peptide,protein,replicate,precursor mz,precursor charge,product mz,product charge,fragment ion,retention time,area,background,peak rank
0,GGLEPINFQTAADQAR,sp|P01012|OVAL_CHICK,Soroush_C18_SDS_D2_column12_2uL_13,844.42355,2,844.42355,2,precursor,46.54,4396397568,35182616,1
1,GGLEPINFQTAADQAR,sp|P01012|OVAL_CHICK,Soroush_C18_SDC_column12_2uL_22,844.42355,2,844.42355,2,precursor,45.45,717893184,0,1
2,GGLEPINFQTAADQAR,sp|P01012|OVAL_CHICK,Soroush_C18_SDC_D2_column12_2uL_11,844.42355,2,844.42355,2,precursor,45.85,609600704,0,1
3,GGLEPINFQTAADQAR,sp|P01012|OVAL_CHICK,Soroush_C18_SDC_D3_column12_2uL_22,844.42355,2,844.42355,2,precursor,46.14,653753856,0,1
4,GGLEPINFQTAADQAR,sp|P01012|OVAL_CHICK,Soroush_C18_SDS_column12_2uL_21,844.42355,2,844.42355,2,precursor,45.79,4406924288,26830026,1


In [3]:

# List the distinct peptide sequences present in the transition results.
trans_res_df_raw['peptide'].unique()

array(['GGLEPINFQTAADQAR', 'AEFVEVTK', 'YICDNQDTISSK'], dtype=object)

## Compute Area ratio

In [4]:
def grp_area_ratio(grouped_df:pd.DataFrame):
    """Compute the area ratio with the higher precursor m/z row as numerator.

    The rows are ordered by 'precursor mz' (ascending) and the area of the
    second row is divided by the area of the first row. Only the two rows
    with the lowest precursor m/z take part in the ratio.

    Parameters
    ----------
    grouped_df : pd.DataFrame
        One group of rows; must contain the columns 'precursor mz' and 'area'.

    Returns
    -------
    float
        area(second row) / area(first row) after sorting, or NaN when the
        group holds fewer than two rows and no ratio can be formed.
    """
    areas_by_mz = grouped_df.sort_values(by='precursor mz', ascending=True)['area']

    # A ratio needs two rows. Returning NaN keeps a groupby/apply pipeline
    # running instead of aborting with an IndexError on an incomplete group.
    if len(areas_by_mz) < 2:
        return float('nan')

    return areas_by_mz.iloc[1] / areas_by_mz.iloc[0]
# One ratio is produced for every distinct combination of these columns.
cols_to_group_by = ['peptide', 'protein', 'replicate', 'product charge', 'fragment ion']
df_area_ratio = (
    trans_res_df_raw
    .groupby(cols_to_group_by)
    .apply(grp_area_ratio)
    .reset_index(name='area_ratio')
)

# To keep only precursor rows, filter on: df_area_ratio['fragment ion'] == 'precursor'
print(df_area_ratio.shape)
df_area_ratio.head()

(270, 6)


Unnamed: 0,peptide,protein,replicate,product charge,fragment ion,area_ratio
0,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,b2,4.785271
1,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y2,5.21429
2,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y4,4.992701
3,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y5,5.242521
4,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y6,5.069243


## Get Peptide Dilution Concentrations

In [5]:
# The concentration table is read with header=None, so the column names are
# assigned explicitly right after loading.
peptide_conc_path = raw_data_dir / 'peptide_dilution_conc.csv'
df_peptide_dilution_conc = pd.read_csv(peptide_conc_path, header=None)
df_peptide_dilution_conc.columns = ['peptide', 'replicate', 'fragment ion', 'area_ratio', 'heavy_conc']
# Discard every row that has a missing value in any column.
df_peptide_dilution_conc.dropna(axis=0, inplace=True)
df_peptide_dilution_conc

def get_dilution_from_replicate(rep_string:str):
    """Extract the dilution label from a replicate name.

    The name is split on '_' and its fourth token is inspected: when that
    token begins with 'col' the label 'D0' is returned, otherwise the token
    itself is returned.
    """
    fourth_token = rep_string.split('_')[3]
    return 'D0' if fourth_token.startswith('col') else fourth_token

def get_peptide_dilution_comb(row):
    """Build the '<peptide>_<dilution>' label for one row.

    Reads row['peptide'] and row['replicate']; the dilution part comes from
    get_dilution_from_replicate applied to the replicate name.
    """
    peptide_name = row['peptide']
    dilution_label = get_dilution_from_replicate(rep_string=row['replicate'])
    return '_'.join((peptide_name, dilution_label))

# Name of the column that stores the combined '<peptide>_<dilution>' label.
peptide_dilution_name_col = 'peptide_dilution_name'
df_peptide_dilution_conc[peptide_dilution_name_col] = df_peptide_dilution_conc.apply(
    get_peptide_dilution_comb,
    axis=1,
)
# Keep only the first row for each label so the label identifies a single row.
df_peptide_dilution_conc.drop_duplicates(subset=peptide_dilution_name_col, inplace=True)
df_peptide_dilution_conc.head(2)


Unnamed: 0,peptide,replicate,fragment ion,area_ratio,heavy_conc,peptide_dilution_name
0,GGLEPINFQTAADQAR,Soroush_C18_SDS_column12_2uL_21,precursor,0.004588,2.208,GGLEPINFQTAADQAR_D0
1,GGLEPINFQTAADQAR,Soroush_C18_SDS_D3_column12_2uL_21,precursor,0.253738,15.771428,GGLEPINFQTAADQAR_D3


### Merge data

In [6]:
# Tag every area-ratio row with its '<peptide>_<dilution>' label, which is the
# key shared with the concentration table.
df_area_ratio[peptide_dilution_name_col] = df_area_ratio.apply(get_peptide_dilution_comb, axis=1)

# Left join: every area-ratio row is kept. Columns that exist in both frames
# receive a '_y' suffix on the right-hand copy.
merged_with_conc = df_area_ratio.merge(
    df_peptide_dilution_conc,
    how='left',
    on=peptide_dilution_name_col,
    suffixes=('', '_y'),
)

# Remove the '_y' copies that came from the right-hand frame.
right_hand_duplicates = merged_with_conc.filter(regex='_y$').columns
df_area_ratio_conc = merged_with_conc.drop(right_hand_duplicates, axis=1)

def get_rep_plot_cat(row):
    """Build the plot-category label for one row.

    The label joins, with '_', the peptide, the fragment ion and the first
    three '_'-separated tokens of the replicate name, for example
    'AEFVEVTK_b2_Soroush_C18_SDC'.

    Parameters
    ----------
    row : mapping-like (e.g. pd.Series)
        Must provide the keys 'replicate', 'peptide' and 'fragment ion',
        each holding a string.

    Returns
    -------
    str
        '<peptide>_<fragment ion>_<first three replicate tokens>'.
    """
    rep_elements = row['replicate'].split('_')[0:3]
    return '_'.join([row['peptide'], row['fragment ion'], *rep_elements])

# Derive one category label per row and store it in the 'plot_cat' column.
plot_cat_labels = df_area_ratio_conc.apply(get_rep_plot_cat, axis=1)
df_area_ratio_conc['plot_cat'] = plot_cat_labels
df_area_ratio_conc.head()
# Missing values after the merge can be inspected with: df_area_ratio_conc.isna().sum()

Unnamed: 0,peptide,protein,replicate,product charge,fragment ion,area_ratio,peptide_dilution_name,heavy_conc,plot_cat
0,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,b2,4.785271,AEFVEVTK_D2,86.5,AEFVEVTK_b2_Soroush_C18_SDC
1,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y2,5.21429,AEFVEVTK_D2,86.5,AEFVEVTK_y2_Soroush_C18_SDC
2,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y4,4.992701,AEFVEVTK_D2,86.5,AEFVEVTK_y4_Soroush_C18_SDC
3,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y5,5.242521,AEFVEVTK_D2,86.5,AEFVEVTK_y5_Soroush_C18_SDC
4,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y6,5.069243,AEFVEVTK_D2,86.5,AEFVEVTK_y6_Soroush_C18_SDC


## Get Linear Fit parameters

In [11]:
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt

def get_linear_fit(data:pd.DataFrame, plot_cat:str, plot:bool=True):
    """Fit heavy_conc = gradient * area_ratio + intercept for one plot category.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns 'plot_cat', 'area_ratio' and 'heavy_conc'.
    plot_cat : str
        Category whose rows are used for the fit.
    plot : bool, default True
        When True, a new matplotlib figure with a scatter of the selected
        points is opened and left open. Pass False to skip the figure, e.g.
        when many categories are fitted in a loop.

    Returns
    -------
    tuple of (float, float, float)
        (r2, intercept, gradient). r2 is evaluated on the same points the
        model was fitted to.
    """
    cat_data = data[data['plot_cat'] == plot_cat]

    x = cat_data['area_ratio'].values
    y = cat_data['heavy_conc'].values

    if plot:
        _, ax = plt.subplots()
        ax.scatter(x, y)

    # Reshape to column vectors: one feature column and one target column.
    x = x[:, np.newaxis]
    y = y[:, np.newaxis]

    model = linear_model.LinearRegression()
    model.fit(x, y)
    y_fit = model.predict(x)

    r2 = metrics.r2_score(y, y_fit)

    # With a 2-D target, intercept_ has shape (1,) and coef_ has shape (1, 1).
    # Return plain Python floats rather than 0-d arrays so that columns built
    # from these values get a numeric dtype.
    intercept = float(model.intercept_[0])
    grad = float(model.coef_[0, 0])

    return (r2, intercept, grad)

plot_cats = df_area_ratio_conc['plot_cat'].unique()

# All rows are used for fitting; narrow plot_df here to fit a subset instead
# (for example a single peptide).
plot_df = df_area_ratio_conc

# Fit every category once, then unpack the (r2, intercept, gradient) tuples
# into one list per output column.
cats = list(plot_cats)
fit_results = [get_linear_fit(plot_df, cat_name) for cat_name in cats]
r2s = [fit[0] for fit in fit_results]
intercepts = [fit[1] for fit in fit_results]
grads = [fit[2] for fit in fit_results]

df_plot_cat_fit_params = pd.DataFrame({
    'plot_cat': cats,
    'R2': r2s,
    'intercept': intercepts,
    'gradient': grads,
})




In [9]:
# Attach the per-category fit parameters to every row through the shared
# 'plot_cat' key; the left join keeps all rows of df_area_ratio_conc.
df_area_ratio_conc_n_fit_params = df_area_ratio_conc.merge(
    df_plot_cat_fit_params,
    on='plot_cat',
    how='left',
)

df_area_ratio_conc_n_fit_params
# Write the combined table to a CSV file in the current working directory.
output_csv_name = 'area_ratio_conc_n_fit_params.csv'
df_area_ratio_conc_n_fit_params.to_csv(output_csv_name, index=False)
    

In [10]:
# Display the merged table of area ratios, concentrations and fit parameters
# (the rendered output elides the middle rows of this long frame).
df_area_ratio_conc_n_fit_params

Unnamed: 0,peptide,protein,replicate,product charge,fragment ion,area_ratio,peptide_dilution_name,heavy_conc,plot_cat,R2,intercept,gradient
0,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,b2,4.785271,AEFVEVTK_D2,86.5,AEFVEVTK_b2_Soroush_C18_SDC,0.999321,-1.1851182430159426,18.288988526639038
1,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y2,5.214290,AEFVEVTK_D2,86.5,AEFVEVTK_y2_Soroush_C18_SDC,0.999455,-1.0386297514165719,16.760133904180453
2,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y4,4.992701,AEFVEVTK_D2,86.5,AEFVEVTK_y4_Soroush_C18_SDC,0.999336,-1.4140921074358346,17.575471055966215
3,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y5,5.242521,AEFVEVTK_D2,86.5,AEFVEVTK_y5_Soroush_C18_SDC,0.999419,-1.0674445664184518,16.67428663220671
4,AEFVEVTK,sp|P02769|ALBU_BOVIN,Soroush_C18_SDC_D2_column12_2uL_11,1,y6,5.069243,AEFVEVTK_D2,86.5,AEFVEVTK_y6_Soroush_C18_SDC,0.999375,-1.1161402670400307,17.309569164464367
...,...,...,...,...,...,...,...,...,...,...,...,...
265,YICDNQDTISSK,sp|P02769|ALBU_BOVIN,Soroush_MCX_SDC_column12_2uL_23,1,y9,0.000000,YICDNQDTISSK_D0,0.0,YICDNQDTISSK_y9_Soroush_MCX_SDC,0.995006,-4.687844501641244,2.0857505103374714
266,YICDNQDTISSK,sp|P02769|ALBU_BOVIN,Soroush_MCX_SDC_column12_2uL_23,2,precursor,0.001054,YICDNQDTISSK_D0,0.0,YICDNQDTISSK_precursor_Soroush_MCX_SDC,0.997090,-3.6287598335339766,1.7686713047091316
267,YICDNQDTISSK,sp|P02769|ALBU_BOVIN,Soroush_MCX_SDC_column12_2uL_23,2,precursor [M+1],0.000000,YICDNQDTISSK_D0,0.0,YICDNQDTISSK_precursor [M+1]_Soroush_MCX_SDC,0.997094,-3.624696880648294,1.9872262295651868
268,YICDNQDTISSK,sp|P02769|ALBU_BOVIN,Soroush_MCX_SDC_column12_2uL_23,2,precursor [M+2],0.010409,YICDNQDTISSK_D0,0.0,YICDNQDTISSK_precursor [M+2]_Soroush_MCX_SDC,0.997215,-3.5729942042394924,2.0797923741269617
