In [63]:
import pandas as pd
import pathlib
import numpy as np

## Load Data

In [64]:
# Folder holding the raw input files for this analysis (relative to the notebook).
raw_data_dir = pathlib.Path("../../data/raw/soroush_3")

# Read the transition results table.
trans_res_path = raw_data_dir.joinpath('JPT4_HHT_Cytosolic_W.csv')
trans_res_df_raw = pd.read_csv(trans_res_path)
# Lower-case every column name so later cells can use names such as 'peptide' and 'area'.
trans_res_df_raw.columns = trans_res_df_raw.columns.map(str.lower)
trans_res_df_raw.head()

###

# trans_res_df_raw = trans_res_df_raw[trans_res_df_raw['peptide'] == 'GDLTHSGLWR']

Unnamed: 0,peptide,protein,replicate,precursor mz,precursor charge,product mz,product charge,fragment ion,area
0,GDLTHSGLWR,sp|009|JPT4,D0_2uL_48,571.291079,2,571.291079,2,precursor,3478692
1,GDLTHSGLWR,sp|009|JPT4,D0_2uL_6,571.291079,2,571.291079,2,precursor,9423027
2,GDLTHSGLWR,sp|009|JPT4,D0_2uL_90,571.291079,2,571.291079,2,precursor,30357
3,GDLTHSGLWR,sp|009|JPT4,D1_2uL_10,571.291079,2,571.291079,2,precursor,198451
4,GDLTHSGLWR,sp|009|JPT4,D1_2uL_52,571.291079,2,571.291079,2,precursor,577711


In [65]:

trans_res_df_raw.peptide.unique()

array(['GDLTHSGLWR', 'INHFPEDNDYDHDSSEYLLR', 'YQDLYTVEPNNAR',
       'VFTFSVGQHNYDR', 'FVVTDGGITR', 'EVESQAQQQLER', 'TSLAPIIVFVK',
       'ATHPPSSSLPNPLLSR', 'EDYSHDHVDHYASHR', 'HLNVQIAASEK',
       'TLQLVALDADTINHPAQLSK', 'ALFDFLK', 'GSELGVSPSESPAAER',
       'VAHAAATAAASLR', 'SSFDLPDTLQVPGLHR', 'HLDVVTLLR', 'TEGNLEQANEELR',
       'ADGANALGGK', 'AGSSQGDTESPSHEK', 'DVELYEHWK', 'ATIHENIGAAGFK'],
      dtype=object)

## Compute Area ratio

In [66]:

def grp_area_ratio_n_area_cols(grouped_df:pd.DataFrame, denominator_is_min=True):
    """Compute the area ratio between the two rows of one group.

    The rows are ordered by ascending 'precursor mz'. ``area_min`` is the area
    of the row with the lower precursor m/z and ``area_max`` the area of the
    row with the higher one (the names refer to the m/z order, not to the
    size of the area).

    Parameters
    ----------
    grouped_df : pd.DataFrame
        Rows of a single group; must have 'precursor mz' and 'area' columns
        and at least two rows (only the first two after sorting are used).
    denominator_is_min : bool, default True
        If True the ratio is ``area_max / area_min``, otherwise
        ``area_min / area_max``.

    Returns
    -------
    pd.Series
        Entries 'area_ratio', 'area_min' and 'area_max'. A denominator equal
        to 0 is replaced by 1.0 before dividing, and that replacement value is
        what gets reported for the corresponding area.
    """
    ordered = grouped_df.loc[:, ['precursor mz', 'area']].sort_values(by='precursor mz', ascending=True)

    area_min = ordered['area'].iloc[0]
    area_max = ordered['area'].iloc[1]

    if denominator_is_min:
        # avoid dividing by zero: a zero denominator is substituted with 1.0
        area_min = 1.0 if area_min == 0 else area_min
        area_ratio = area_max / area_min
    else:
        area_max = 1.0 if area_max == 0 else area_max
        area_ratio = area_min / area_max

    return pd.Series({'area_ratio': area_ratio, 'area_min': area_min, 'area_max': area_max})

# Columns that identify one transition within one replicate.
cols_to_group_by = ['peptide', 'protein', 'replicate', 'precursor charge', 'product charge', 'fragment ion']
# Keep only the groups that have exactly two non-null 'area' values (one pair of rows).
# Change the number to 1 and print the result to see the groups that have no pair.
temp_df = trans_res_df_raw.groupby(cols_to_group_by).filter(lambda g: g['area'].count() == 2)


# One output row per group with the columns 'area_ratio', 'area_min' and 'area_max'.
df_area_ratio = temp_df.groupby(cols_to_group_by).apply(grp_area_ratio_n_area_cols,).reset_index()


print(df_area_ratio.shape)
df_area_ratio.head()

(2895, 9)


Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max
0,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b2,0.088383,98062.0,8667.0
1,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b3,33248.0,1.0,33248.0
2,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y3,0.0,2121.0,0.0
3,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y5,0.0,1.0,0.0
4,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y6,0.0,1.0,0.0


In [67]:
np.isinf(df_area_ratio.area_ratio).sum()

0

## Get peptide Dilution Conc.

In [68]:



# Read the concentration table; every column after the first one is treated as a dilution column.
df_peptide_dilution_conc = pd.read_csv(raw_data_dir.joinpath('JPT4_peptide_conc.csv'))
raw_dilution_col_names = df_peptide_dilution_conc.columns[1:]

# Wide -> long: one row per (peptide, dilution) with the concentration in 'heavy_conc'.
df_peptide_dilution_conc = df_peptide_dilution_conc.melt(id_vars='Peptides', value_vars=raw_dilution_col_names, 
                                                        var_name='dilution', value_name='heavy_conc')
# Keep only the first two characters of the original column name as the dilution label (e.g. 'D1').
df_peptide_dilution_conc['dilution'] = df_peptide_dilution_conc['dilution'].apply(lambda x: x[:2])
# Merge key '<peptide>_<dilution>', e.g. 'GDLTHSGLWR_D1'.
peptide_dilution_name_col = 'peptide_dilution_name'
df_peptide_dilution_conc[peptide_dilution_name_col] = df_peptide_dilution_conc['Peptides'] + '_' + \
                                                    df_peptide_dilution_conc['dilution'] 
df_peptide_dilution_conc.head()


Unnamed: 0,Peptides,dilution,heavy_conc,peptide_dilution_name
0,GDLTHSGLWR,D1,0.043722,GDLTHSGLWR_D1
1,INHFPEDNDYDHDSSEYLLR,D1,0.094547,INHFPEDNDYDHDSSEYLLR_D1
2,YQDLYTVEPNNAR,D1,0.060487,YQDLYTVEPNNAR_D1
3,VFTFSVGQHNYDR,D1,0.059992,VFTFSVGQHNYDR_D1
4,FVVTDGGITR,D1,0.040796,FVVTDGGITR_D1


### Merge data

In [69]:
df_area_ratio

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max
0,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b2,8.838286e-02,98062.0,8667.0
1,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b3,3.324800e+04,1.0,33248.0
2,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y3,0.000000e+00,2121.0,0.0
3,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y5,0.000000e+00,1.0,0.0
4,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y6,0.000000e+00,1.0,0.0
...,...,...,...,...,...,...,...,...,...
2890,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y8,2.271160e+05,1.0,227116.0
2891,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y9,2.039140e+05,1.0,203914.0
2892,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,2,precursor,3.037390e+01,133514.0,4055341.0
2893,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,2,precursor [M+1],6.185941e+00,377958.0,2338026.0


In [70]:
def get_dilution_from_replicate(rep_string:str):
    """Return the dilution label encoded at the start of a replicate name.

    The label is the text before the first underscore. When that text starts
    with 'col' the dilution is reported as 'D0'.
    """
    prefix, _, _ = rep_string.partition('_')
    return 'D0' if prefix.startswith('col') else prefix

def get_peptide_dilution_comb(row):
    """Return '<peptide>_<dilution>' for a row, with the dilution taken from its replicate name."""
    peptide_name = row['peptide']
    dilution_label = get_dilution_from_replicate(rep_string=row['replicate'])
    return '_'.join((peptide_name, dilution_label))

# Merge key '<peptide>_<dilution>' built from each row's peptide and replicate name.
df_area_ratio[peptide_dilution_name_col] = df_area_ratio.apply(get_peptide_dilution_comb, axis=1)



# Left merge: every area-ratio row is kept and gets the matching concentration columns.
df_area_ratio_conc = pd.merge(df_area_ratio, right=df_peptide_dilution_conc, how='left', 
                              on=peptide_dilution_name_col, suffixes=('', '_y'))
# Drop the right-hand copies of any column names that clashed in the merge (suffix '_y').
df_area_ratio_conc.drop(df_area_ratio_conc.filter(regex='_y$').columns, axis=1, inplace=True)
df_area_ratio_conc = df_area_ratio_conc.dropna(axis=0) # drop rows with a NaN in any column
# Replace +/-inf with NaN. The rows are NOT dropped here: this runs after the dropna
# above and the dropna on 'area_ratio' below is commented out.
df_area_ratio_conc.replace([np.inf, -np.inf], np.nan, inplace=True)
# df_area_ratio_conc.dropna(subset=["area_ratio"], how="all", inplace=True)

# def get_rep_plot_cat(rep_val:str):
#     """Get the plot category (C18_SDS, C18_SDC, etc..)."""

#     select_elements = rep_val.split('_')[1:3]

#     cat = '_'.join(select_elements)
#     return cat

# TODO: - reverse x and y in plots: ratio:y, conc: x
#- TODO: further break replicate into 2 groups as shown in the excel, the greens are one group and the white is another.
def get_rep_plot_cat(row):
    """Build the plot category label '<peptide>_<fragment ion>' for one row.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide the string entries 'peptide' and 'fragment ion'.

    Returns
    -------
    str
        The peptide and the fragment ion joined with '_',
        e.g. 'ADGANALGGK_b2'.
    """
    # The replicate is not part of the label; the run order is appended to it
    # later in the notebook.
    return '_'.join([row['peptide'], row['fragment ion']])


# Create the plot category label '<peptide>_<fragment ion>' for every row.
df_area_ratio_conc['plot_cat'] = df_area_ratio_conc.apply(get_rep_plot_cat, axis=1)

# NOTE(review): dropping the last two '_'-separated parts of a two-part label such as
# 'ADGANALGGK_b2' leaves an empty string (see the displayed output). This column is
# reassigned in a later cell.
df_area_ratio_conc['plot_cat_grp'] = df_area_ratio_conc['plot_cat'].apply(lambda x: '_'.join(x.split('_')[:-2]))
# Last '_'-separated part of the replicate name (e.g. '48' in 'D0_2uL_48'); used later for ranking.
df_area_ratio_conc['order_comp'] = df_area_ratio_conc['replicate'].apply(lambda x: x.split('_')[-1])
df_area_ratio_conc.head()

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,Peptides,dilution,heavy_conc,plot_cat,plot_cat_grp,order_comp
0,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b2,0.088383,98062.0,8667.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_b2,,48
1,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b3,33248.0,1.0,33248.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_b3,,48
2,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y3,0.0,2121.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_y3,,48
3,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y5,0.0,1.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_y5,,48
4,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y6,0.0,1.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_y6,,48


In [71]:
from scipy.stats import rankdata

def get_order(grouped_df):
    """Rank the rows of one group by their integer 'order_comp' value.

    Parameters
    ----------
    grouped_df : pd.DataFrame
        Rows of a single group; must contain an 'order_comp' column whose
        values can be converted with ``int``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows and index as ``grouped_df`` plus an
        integer 'order' column holding the 1-based rank of each row (smallest
        'order_comp' gets 1). Tied values receive their average rank
        truncated to an integer. The input frame is left unchanged.
    """
    vals_to_order = [int(value) for value in grouped_df['order_comp'].values]
    # 64-bit integers: np.int8 would wrap around for groups with more than 127 rows.
    ranks = rankdata(vals_to_order).astype(np.int64)

    # assign() returns a new frame, so the group passed in by the caller is not mutated.
    return grouped_df.assign(order=ranks)
    
# Replicate name without its trailing run number, e.g. 'D3_2uL_8' -> 'D3_2uL'.
df_area_ratio_conc['rep_partial'] = df_area_ratio_conc['replicate'].apply(lambda x: '_'.join(x.split('_')[:-1]))
# Ranking group: plot category followed directly by the partial replicate name,
# e.g. 'ADGANALGGK_b2D3_2uL'.
df_area_ratio_conc['plot_cat_grp'] = df_area_ratio_conc['plot_cat'] + df_area_ratio_conc['rep_partial']

# df_area_ratio_conc['rep_partial_last'] = df_area_ratio_conc['replicate'].apply(lambda x: x.split('_')[-1])

# group_keys=False keeps the original row labels as the index of the result, so the
# ranks can be matched back to their rows by label. The order in which
# groupby.apply returns the rows depends on the pandas version and must not be
# relied on.
order = df_area_ratio_conc.groupby('plot_cat_grp', group_keys=False).apply(get_order)['order']

# Series + Series aligns on the index, so every row receives its own rank as suffix.
df_area_ratio_conc['plot_cat'] = df_area_ratio_conc['plot_cat'] + '_' + order.astype(str)
df_area_ratio_conc.sort_values(by=['plot_cat'])

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,Peptides,dilution,heavy_conc,plot_cat,plot_cat_grp,order_comp,rep_partial
90,ADGANALGGK,sp|009|JPT4,D3_2uL_8,2,1,b2,3.543956e+02,8551.0,3030437.0,ADGANALGGK_D3,ADGANALGGK,D3,0.001338,ADGANALGGK_b2_1,ADGANALGGK_b2D3_2uL,8,D3_2uL
63,ADGANALGGK,sp|009|JPT4,D2_2uL_9,2,1,b2,1.551142e+02,81797.0,12687879.0,ADGANALGGK_D2,ADGANALGGK,D2,0.006691,ADGANALGGK_b2_1,ADGANALGGK_b2D2_2uL,9,D2_2uL
117,ADGANALGGK,sp|009|JPT4,D4_2uL_7,2,1,b2,7.612440e+05,1.0,761244.0,ADGANALGGK_D4,ADGANALGGK,D4,0.000268,ADGANALGGK_b2_1,ADGANALGGK_b2D4_2uL,7,D4_2uL
27,ADGANALGGK,sp|009|JPT4,D1_2uL_10,2,1,b2,4.333859e+02,63207.0,27393024.0,ADGANALGGK_D1,ADGANALGGK,D1,0.033457,ADGANALGGK_b2_1,ADGANALGGK_b2D1_2uL,10,D1_2uL
9,ADGANALGGK,sp|009|JPT4,D0_2uL_6,2,1,b2,1.745670e-02,356253.0,6219.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_b2_1,ADGANALGGK_b2D0_2uL,6,D0_2uL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2783,YQDLYTVEPNNAR,sp|009|JPT4,D0_2uL_90,2,1,y9,0.000000e+00,1.0,0.0,YQDLYTVEPNNAR_D0,YQDLYTVEPNNAR,D0,0.000000,YQDLYTVEPNNAR_y9_3,YQDLYTVEPNNAR_y9D0_2uL,90,D0_2uL
2891,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y9,2.039140e+05,1.0,203914.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_y9_3,YQDLYTVEPNNAR_y9D4_2uL,91,D4_2uL
2864,YQDLYTVEPNNAR,sp|009|JPT4,D3_2uL_92,2,1,y9,1.835374e+06,1.0,1835374.0,YQDLYTVEPNNAR_D3,YQDLYTVEPNNAR,D3,0.002419,YQDLYTVEPNNAR_y9_3,YQDLYTVEPNNAR_y9D3_2uL,92,D3_2uL
2837,YQDLYTVEPNNAR,sp|009|JPT4,D2_2uL_93,2,1,y9,5.051863e+06,1.0,5051863.0,YQDLYTVEPNNAR_D2,YQDLYTVEPNNAR,D2,0.012097,YQDLYTVEPNNAR_y9_3,YQDLYTVEPNNAR_y9D2_2uL,93,D2_2uL


In [72]:
df_area_ratio_conc['plot_cat'].value_counts()

DVELYEHWK_y5_7                        15
DVELYEHWK_y7_3                        15
ATIHENIGAAGFK_y10_6                   15
ATIHENIGAAGFK_y10_3                   15
DVELYEHWK_y7_6                        15
                                      ..
YQDLYTVEPNNAR_y8_2                     5
TSLAPIIVFVK_b2_1                       5
VFTFSVGQHNYDR_y7_1                     5
GSELGVSPSESPAAER_precursor [M+1]_1     5
AGSSQGDTESPSHEK_precursor_2            5
Name: plot_cat, Length: 529, dtype: int64

In [73]:
# Work on a copy so the extra helper columns do not end up in df_area_ratio_conc.
temp_df = df_area_ratio_conc.copy()
# Replicate name without / only its last '_'-separated part (e.g. 'D3_2uL' and '8').
temp_df['rep_partial'] = temp_df['replicate'].apply(lambda x: '_'.join(x.split('_')[:-1]))
temp_df['rep_partial_last'] = temp_df['replicate'].apply(lambda x: x.split('_')[-1])

# NOTE(review): grp_df does not appear to be used by the following cells — confirm before removing.
grp_df = temp_df.groupby(['plot_cat', 'rep_partial'])

In [74]:
temp_df.sort_values(by='plot_cat').head()

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,Peptides,dilution,heavy_conc,plot_cat,plot_cat_grp,order_comp,rep_partial,rep_partial_last
90,ADGANALGGK,sp|009|JPT4,D3_2uL_8,2,1,b2,354.395626,8551.0,3030437.0,ADGANALGGK_D3,ADGANALGGK,D3,0.001338,ADGANALGGK_b2_1,ADGANALGGK_b2D3_2uL,8,D3_2uL,8
63,ADGANALGGK,sp|009|JPT4,D2_2uL_9,2,1,b2,155.114234,81797.0,12687879.0,ADGANALGGK_D2,ADGANALGGK,D2,0.006691,ADGANALGGK_b2_1,ADGANALGGK_b2D2_2uL,9,D2_2uL,9
117,ADGANALGGK,sp|009|JPT4,D4_2uL_7,2,1,b2,761244.0,1.0,761244.0,ADGANALGGK_D4,ADGANALGGK,D4,0.000268,ADGANALGGK_b2_1,ADGANALGGK_b2D4_2uL,7,D4_2uL,7
27,ADGANALGGK,sp|009|JPT4,D1_2uL_10,2,1,b2,433.385922,63207.0,27393024.0,ADGANALGGK_D1,ADGANALGGK,D1,0.033457,ADGANALGGK_b2_1,ADGANALGGK_b2D1_2uL,10,D1_2uL,10
9,ADGANALGGK,sp|009|JPT4,D0_2uL_6,2,1,b2,0.017457,356253.0,6219.0,ADGANALGGK_D0,ADGANALGGK,D0,0.0,ADGANALGGK_b2_1,ADGANALGGK_b2D0_2uL,6,D0_2uL,6


In [75]:

from scipy.stats import rankdata

def get_order(grouped_df):
    """Rank the rows of one group by their integer 'order_comp' value.

    Parameters
    ----------
    grouped_df : pd.DataFrame
        Rows of a single group; must contain an 'order_comp' column whose
        values can be converted with ``int``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows and index as ``grouped_df`` plus an
        integer 'order' column holding the 1-based rank of each row (smallest
        'order_comp' gets 1). Tied values receive their average rank
        truncated to an integer. The input frame is left unchanged.
    """
    vals_to_order = [int(value) for value in grouped_df['order_comp'].values]
    # 64-bit integers: np.int8 would wrap around for groups with more than 127 rows.
    ranks = rankdata(vals_to_order).astype(np.int64)

    # assign() returns a new frame, so the group passed in by the caller is not mutated.
    return grouped_df.assign(order=ranks)
    
    

# NOTE(review): this repeats the ranking of an earlier cell on the same frame, so a
# second order suffix is appended to 'plot_cat' (e.g. 'ADGANALGGK_b2_1' becomes
# 'ADGANALGGK_b2_1_1', as the displayed output shows) — confirm this is intended.
# group_keys=False keeps the original row labels as the index of the result, so the
# ranks can be matched back to their rows by label instead of by position.
order = df_area_ratio_conc.groupby('plot_cat_grp', group_keys=False).apply(get_order)['order']

# Series + Series aligns on the index, so every row receives its own rank as suffix.
df_area_ratio_conc['plot_cat'] = df_area_ratio_conc['plot_cat'] + '_' + order.astype(str)
df_area_ratio_conc.sort_values(by=['plot_cat'])

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,Peptides,dilution,heavy_conc,plot_cat,plot_cat_grp,order_comp,rep_partial
90,ADGANALGGK,sp|009|JPT4,D3_2uL_8,2,1,b2,3.543956e+02,8551.0,3030437.0,ADGANALGGK_D3,ADGANALGGK,D3,0.001338,ADGANALGGK_b2_1_1,ADGANALGGK_b2D3_2uL,8,D3_2uL
63,ADGANALGGK,sp|009|JPT4,D2_2uL_9,2,1,b2,1.551142e+02,81797.0,12687879.0,ADGANALGGK_D2,ADGANALGGK,D2,0.006691,ADGANALGGK_b2_1_1,ADGANALGGK_b2D2_2uL,9,D2_2uL
117,ADGANALGGK,sp|009|JPT4,D4_2uL_7,2,1,b2,7.612440e+05,1.0,761244.0,ADGANALGGK_D4,ADGANALGGK,D4,0.000268,ADGANALGGK_b2_1_1,ADGANALGGK_b2D4_2uL,7,D4_2uL
27,ADGANALGGK,sp|009|JPT4,D1_2uL_10,2,1,b2,4.333859e+02,63207.0,27393024.0,ADGANALGGK_D1,ADGANALGGK,D1,0.033457,ADGANALGGK_b2_1_1,ADGANALGGK_b2D1_2uL,10,D1_2uL
9,ADGANALGGK,sp|009|JPT4,D0_2uL_6,2,1,b2,1.745670e-02,356253.0,6219.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_b2_1_1,ADGANALGGK_b2D0_2uL,6,D0_2uL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2783,YQDLYTVEPNNAR,sp|009|JPT4,D0_2uL_90,2,1,y9,0.000000e+00,1.0,0.0,YQDLYTVEPNNAR_D0,YQDLYTVEPNNAR,D0,0.000000,YQDLYTVEPNNAR_y9_3_3,YQDLYTVEPNNAR_y9D0_2uL,90,D0_2uL
2891,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y9,2.039140e+05,1.0,203914.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_y9_3_3,YQDLYTVEPNNAR_y9D4_2uL,91,D4_2uL
2864,YQDLYTVEPNNAR,sp|009|JPT4,D3_2uL_92,2,1,y9,1.835374e+06,1.0,1835374.0,YQDLYTVEPNNAR_D3,YQDLYTVEPNNAR,D3,0.002419,YQDLYTVEPNNAR_y9_3_3,YQDLYTVEPNNAR_y9D3_2uL,92,D3_2uL
2837,YQDLYTVEPNNAR,sp|009|JPT4,D2_2uL_93,2,1,y9,5.051863e+06,1.0,5051863.0,YQDLYTVEPNNAR_D2,YQDLYTVEPNNAR,D2,0.012097,YQDLYTVEPNNAR_y9_3_3,YQDLYTVEPNNAR_y9D2_2uL,93,D2_2uL


In [76]:
df_area_ratio_conc['plot_cat'].value_counts()

ATIHENIGAAGFK_y10_3_3                       15
DVELYEHWK_y5_4_4                            15
DVELYEHWK_y5_7_7                            15
DVELYEHWK_y7_3_3                            15
ATIHENIGAAGFK_y10_6_6                       15
                                            ..
HLNVQIAASEK_precursor [M+2]_3_3              5
INHFPEDNDYDHDSSEYLLR_precursor [M+1]_2_2     5
TLQLVALDADTINHPAQLSK_y16_2_2                 5
AGSSQGDTESPSHEK_precursor_2_2                5
EVESQAQQQLER_y1_2_2                          5
Name: plot_cat, Length: 529, dtype: int64

In [77]:
df_area_ratio_conc['plot_cat'][:20]

0                  ADGANALGGK_b2_2_2
1                  ADGANALGGK_b3_2_2
2                  ADGANALGGK_y3_2_2
3                  ADGANALGGK_y5_2_2
4                  ADGANALGGK_y6_2_2
5                  ADGANALGGK_y8_2_2
6           ADGANALGGK_precursor_2_2
7     ADGANALGGK_precursor [M+1]_2_2
8     ADGANALGGK_precursor [M+2]_2_2
9                  ADGANALGGK_b2_1_1
10                 ADGANALGGK_b3_1_1
11                 ADGANALGGK_y3_1_1
12                 ADGANALGGK_y5_1_1
13                 ADGANALGGK_y6_1_1
14                 ADGANALGGK_y8_1_1
15          ADGANALGGK_precursor_1_1
16    ADGANALGGK_precursor [M+1]_1_1
17    ADGANALGGK_precursor [M+2]_1_1
18                 ADGANALGGK_b2_3_3
19                 ADGANALGGK_b3_3_3
Name: plot_cat, dtype: object

In [78]:
df_area_ratio_conc['plot_cat'].unique()

array(['ADGANALGGK_b2_2_2', 'ADGANALGGK_b3_2_2', 'ADGANALGGK_y3_2_2',
       'ADGANALGGK_y5_2_2', 'ADGANALGGK_y6_2_2', 'ADGANALGGK_y8_2_2',
       'ADGANALGGK_precursor_2_2', 'ADGANALGGK_precursor [M+1]_2_2',
       'ADGANALGGK_precursor [M+2]_2_2', 'ADGANALGGK_b2_1_1',
       'ADGANALGGK_b3_1_1', 'ADGANALGGK_y3_1_1', 'ADGANALGGK_y5_1_1',
       'ADGANALGGK_y6_1_1', 'ADGANALGGK_y8_1_1',
       'ADGANALGGK_precursor_1_1', 'ADGANALGGK_precursor [M+1]_1_1',
       'ADGANALGGK_precursor [M+2]_1_1', 'ADGANALGGK_b2_3_3',
       'ADGANALGGK_b3_3_3', 'ADGANALGGK_y3_3_3', 'ADGANALGGK_y5_3_3',
       'ADGANALGGK_y6_3_3', 'ADGANALGGK_y8_3_3',
       'ADGANALGGK_precursor_3_3', 'ADGANALGGK_precursor [M+1]_3_3',
       'ADGANALGGK_precursor [M+2]_3_3', 'AGSSQGDTESPSHEK_b2_2_2',
       'AGSSQGDTESPSHEK_y5_2_2', 'AGSSQGDTESPSHEK_y6_2_2',
       'AGSSQGDTESPSHEK_y10_2_2', 'AGSSQGDTESPSHEK_y13_2_2',
       'AGSSQGDTESPSHEK_y14_2_2', 'AGSSQGDTESPSHEK_precursor_2_2',
       'AGSSQGDTESPSHEK_precursor [M+

## Get Linear Fit parameters

In [79]:
np.isclose([1,2,3,0.001], 0.001, atol=1e-6)

array([False, False, False,  True])

In [80]:
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt

tol = 1e-6
def get_plot(data:pd.DataFrame, plot_cat:str, ax):
    """Draw the points of one plot category and their weighted linear fit on ``ax``.

    Rows of ``data`` whose 'plot_cat' equals ``plot_cat`` are selected and rows
    with a NaN in any column are discarded. Points whose 'heavy_conc' is NaN,
    infinite or within ``tol`` of zero are left out. 'area_ratio' is plotted
    against 'heavy_conc' and a linear regression weighted by 1/heavy_conc is
    drawn over the same x values.

    Returns the axes that was passed in, titled with ``plot_cat``.
    """
    rows = data.loc[data['plot_cat'] == plot_cat].dropna(axis=0)

    conc = rows['heavy_conc'].values
    ratio = rows['area_ratio'].values

    # drop unusable concentrations (NaN, inf, ~0) together with their ratios
    unusable = np.isnan(conc) | np.isinf(conc) | np.isclose(conc, 0, atol=tol)
    conc = conc[~unusable]
    ratio = ratio[~unusable]

    weights = 1/conc
    ax.scatter(conc, ratio)

    # sklearn expects 2-D arrays: one column, one row per point
    features = conc.reshape(-1, 1)
    targets = ratio.reshape(-1, 1)

    regressor = linear_model.LinearRegression()
    regressor.fit(features, targets, sample_weight=weights)
    fitted = regressor.predict(features)

    ax.plot(features.flatten(), fitted.flatten())
    ax.set_title(plot_cat)

    return ax

def get_linear_fit(data:pd.DataFrame, plot_cat:str):
    """Fit 'area_ratio' against 'heavy_conc' for one plot category.

    Rows of ``data`` whose 'plot_cat' equals ``plot_cat`` are selected and rows
    with a NaN in any column are discarded. Points whose 'heavy_conc' is NaN,
    infinite or within ``tol`` of zero are left out. The remaining points are
    fitted with a linear regression weighted by 1/heavy_conc.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns 'plot_cat', 'area_ratio' and 'heavy_conc'.
    plot_cat : str
        The plot category to fit.

    Returns
    -------
    tuple of float
        ``(r2, intercept, gradient)`` as plain Python floats, so that columns
        built from them are numeric rather than object dtype. All three are
        NaN when no usable point is left.
    """
    cat_data = data[data['plot_cat'] == plot_cat].dropna(axis=0)

    y = cat_data['area_ratio'].values
    x = cat_data['heavy_conc'].values

    valid_pts = np.where(~np.isnan(x) & ~np.isinf(x) & ~np.isclose(x, 0, atol=tol))[0]
    x = x[valid_pts]
    y = y[valid_pts]

    if x.size == 0:
        # nothing to fit (e.g. only zero-concentration points): report missing values
        return (float('nan'), float('nan'), float('nan'))

    sample_weights = 1/x

    # sklearn expects 2-D arrays: one column, one row per point
    x = x[:, np.newaxis]
    y = y[:, np.newaxis]

    model = linear_model.LinearRegression()
    model.fit(x, y, sample_weight=sample_weights)

    y_fit = model.predict(x)

    # r2_score is called without the sample weights used for the fit
    r2 = float(metrics.r2_score(y, y_fit))
    # intercept_ and coef_ are arrays because y is 2-D; reduce them to scalars
    intercept = float(model.intercept_.squeeze())
    grad = float(model.coef_[0].squeeze())

    return (r2, intercept, grad)

# Every distinct plot category gets its own linear fit.
plot_cats = df_area_ratio_conc['plot_cat'].unique()

# plot_df = df_area_ratio_conc[df_area_ratio_conc.peptide=='GGLEPINFQTAADQAR'] # get only one peptide
# plot_df = plot_df[[peptide_dilution_name_col, 'area_ratio', 'heavy_conc', 'plot_cat']]
# plot_df is the same object as df_area_ratio_conc (no copy is made).
plot_df = df_area_ratio_conc


# Parallel lists, one entry per plot category.
cats = []
r2s = []
intercepts = []
grads = []

for cat in plot_cats:
    # print(cat)
    r2, intercept, grad = get_linear_fit(plot_df, cat)
    cats.append(cat)
    r2s.append(r2)
    intercepts.append(intercept)
    grads.append(grad)




# One row per plot category with its fit parameters.
df_plot_cat_fit_params = pd.DataFrame({'plot_cat':cats,
              'R2':r2s,
              'intercept':intercepts,
              'gradient':grads})




## Plots

### plot by plot cat

In [81]:
result_data_folder = pathlib.Path("../../reports/ms_work/soroush_3/plots_1_over_x")
# Create the output folder up front so that savefig does not fail when it is missing.
result_data_folder.mkdir(parents=True, exist_ok=True)

# One figure per plot category, saved under the category name.
for cat in plot_cats:
    fig, ax = plt.subplots()
    ax = get_plot(plot_df, cat, ax)
    fig.savefig(fname=result_data_folder.joinpath(cat))
    # close this figure explicitly so open figures do not pile up over the loop
    plt.close(fig)

### Plots (by group)

In [82]:
result_data_folder = pathlib.Path("../../reports/ms_work/soroush_3/plots_by_grp_1_over_x")

def get_grp_plot(data:pd.DataFrame, plot_grp_cat:str, ax):
    """Draw the points of one plot group and their weighted linear fit on ``ax``.

    Rows of ``data`` whose 'plot_cat_grp' equals ``plot_grp_cat`` are selected
    and rows with a NaN in any column are discarded. Points whose 'heavy_conc'
    is NaN, infinite or within ``tol`` of zero are left out. 'area_ratio' is
    plotted against 'heavy_conc' and a linear regression weighted by
    1/heavy_conc is drawn over the same x values.

    Returns the axes that was passed in, titled with ``plot_grp_cat``.
    """
    rows = data.loc[data['plot_cat_grp'] == plot_grp_cat].dropna(axis=0)

    conc = rows['heavy_conc'].values
    ratio = rows['area_ratio'].values

    # drop unusable concentrations (NaN, inf, ~0) together with their ratios
    unusable = np.isnan(conc) | np.isinf(conc) | np.isclose(conc, 0, atol=tol)
    conc = conc[~unusable]
    ratio = ratio[~unusable]

    ax.scatter(conc, ratio)

    weights = 1/conc
    # sklearn expects 2-D arrays: one column, one row per point
    features = conc.reshape(-1, 1)
    targets = ratio.reshape(-1, 1)

    regressor = linear_model.LinearRegression()
    regressor.fit(features, targets, sample_weight=weights)
    fitted = regressor.predict(features)

    ax.plot(features.flatten(), fitted.flatten())
    ax.set_title(plot_grp_cat)

    return ax

def get_plot_grp_col(plot_cat:str):
    """Return ``plot_cat`` with its second '_'-separated part removed.

    Example: 'ADGANALGGK_b2_2_2' -> 'ADGANALGGK_2_2'. Raises IndexError when
    ``plot_cat`` contains no underscore.
    """
    parts = plot_cat.split("_")
    parts.pop(1)  # discard the second part
    return "_".join(parts)

    


# plot_df is the same object as df_area_ratio_conc (it was assigned without a copy),
# so this overwrites 'plot_cat_grp' on that frame as well.
plot_df['plot_cat_grp'] = plot_df['plot_cat'].apply(get_plot_grp_col)
plot_cat_grps = plot_df['plot_cat_grp'].unique()

# Create the output folder up front so that savefig does not fail when it is missing.
result_data_folder.mkdir(parents=True, exist_ok=True)

# One figure per plot group, saved under the group name.
for grp in plot_cat_grps:
    fig, ax = plt.subplots()
    ax = get_grp_plot(data=plot_df, plot_grp_cat=grp, ax=ax)
    fig.savefig(fname=result_data_folder.joinpath(grp))
    # close this figure explicitly so open figures do not pile up over the loop
    plt.close(fig)

In [83]:
# Left merge on 'plot_cat': every data row receives the R2, intercept and gradient
# that were fitted for its plot category.
df_area_ratio_conc_n_fit_params = pd.merge(df_area_ratio_conc,
                                           right=df_plot_cat_fit_params,
                                           on='plot_cat',
                                           how='left')

df_area_ratio_conc_n_fit_params
# save to csv file
# df_area_ratio_conc_n_fit_params.to_csv('area_ratio_conc_n_fit_params.csv', index=False)
    

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,Peptides,dilution,heavy_conc,plot_cat,plot_cat_grp,order_comp,rep_partial,R2,intercept,gradient
0,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b2,8.838286e-02,98062.0,8667.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_b2_2_2,ADGANALGGK_2_2,48,D0_2uL,0.983330,30.951483305792635,39358.48724197791
1,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b3,3.324800e+04,1.0,33248.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_b3_2_2,ADGANALGGK_2_2,48,D0_2uL,0.927448,-269277.63160037785,610498439.0228177
2,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y3,0.000000e+00,2121.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_y3_2_2,ADGANALGGK_2_2,48,D0_2uL,0.991188,-85677.11980995745,1169629479.5274482
3,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y5,0.000000e+00,1.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_y5_2_2,ADGANALGGK_2_2,48,D0_2uL,0.992173,-49531.235013845784,582169035.4960749
4,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y6,0.000000e+00,1.0,0.0,ADGANALGGK_D0,ADGANALGGK,D0,0.000000,ADGANALGGK_y6_2_2,ADGANALGGK_2_2,48,D0_2uL,0.990187,-170828.4022837358,1758162159.9833941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y8,2.271160e+05,1.0,227116.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_y8_3_3,YQDLYTVEPNNAR_3_3,91,D4_2uL,0.971447,-709682.3561905639,1278803310.3587697
2891,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,1,y9,2.039140e+05,1.0,203914.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_y9_3_3,YQDLYTVEPNNAR_3_3,91,D4_2uL,0.972202,-527297.1760737193,1019416730.2200168
2892,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,2,precursor,3.037390e+01,133514.0,4055341.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_precursor_3_3,YQDLYTVEPNNAR_3_3,91,D4_2uL,0.931340,-14419660.277935717,13167742673.973993
2893,YQDLYTVEPNNAR,sp|009|JPT4,D4_2uL_91,2,2,precursor [M+1],6.185941e+00,377958.0,2338026.0,YQDLYTVEPNNAR_D4,YQDLYTVEPNNAR,D4,0.000484,YQDLYTVEPNNAR_precursor [M+1]_3_3,YQDLYTVEPNNAR_3_3,91,D4_2uL,0.922005,6.225699473829457,3302.130604833058


In [84]:
# Critical Q values for Dixon's Q test at the 90 %, 95 % and 99 % confidence
# levels. The i-th entry of each list belongs to sample size N = i + 3, so the
# 28 entries cover N = 3 .. 30.
q90 = [0.941, 0.765, 0.642, 0.56, 0.507, 0.468, 0.437,
       0.412, 0.392, 0.376, 0.361, 0.349, 0.338, 0.329,
       0.32, 0.313, 0.306, 0.3, 0.295, 0.29, 0.285, 0.281,
       0.277, 0.273, 0.269, 0.266, 0.263, 0.26
      ]

# NOTE(review): the last entry (0.29) looks truncated next to its neighbours;
# published tables appear to list 0.298 for N = 30 — confirm against the source table.
q95 = [0.97, 0.829, 0.71, 0.625, 0.568, 0.526, 0.493, 0.466,
       0.444, 0.426, 0.41, 0.396, 0.384, 0.374, 0.365, 0.356,
       0.349, 0.342, 0.337, 0.331, 0.326, 0.321, 0.317, 0.312,
       0.308, 0.305, 0.301, 0.29
      ]

q99 = [0.994, 0.926, 0.821, 0.74, 0.68, 0.634, 0.598, 0.568,
       0.542, 0.522, 0.503, 0.488, 0.475, 0.463, 0.452, 0.442,
       0.433, 0.425, 0.418, 0.411, 0.404, 0.399, 0.393, 0.388,
       0.384, 0.38, 0.376, 0.372
       ]

# Map sample size N -> critical Q value. enumerate(start=3) pairs every entry
# with its N, so the values for N = 29 and N = 30 are no longer dropped.
Q90 = dict(enumerate(q90, start=3))
Q95 = dict(enumerate(q95, start=3))
Q99 = dict(enumerate(q99, start=3))

def dixon_test(data, left=True, right=True, q_dict=Q95):
    """
    Dixon's Q test for a single outlier at either end of a sample.

    Keyword arguments:
        data = A ordered or unordered list of data points (int or float).
        left = Q-test of minimum value in the ordered list if True.
        right = Q-test of maximum value in the ordered list if True.
        q_dict = A dictionary of Q-values for a given confidence level,
            where the dict. keys are sample sizes N, and the associated values
            are the corresponding critical Q values. E.g.,
            {3: 0.97, 4: 0.829, 5: 0.71, 6: 0.625, ...}

    Returns a list of 2 values for the outliers, or None.
    E.g.,
       for [1,1,1] -> [None, None]
       for [5,1,1] -> [None, 5]
       for [5,1,5] -> [1, None]

    When both ends exceed the critical value, only the end with the larger
    excess is reported; both are reported when the excess is identical.

    Raises AssertionError if neither `left` nor `right` is set, if fewer than
    3 data points are given, or if the sample is larger than the largest
    sample size in `q_dict`.
    """
    assert(left or right), 'At least one of the variables, `left` or `right`, must be True.'
    assert(len(data) >= 3), 'At least 3 data points are required'
    assert(len(data) <= max(q_dict.keys())), 'Sample size too large'

    sdata = sorted(data)
    q_crit = q_dict[len(data)]
    data_range = sdata[-1] - sdata[0]

    # (excess over the critical value, candidate outlier); (0, 0) means "not tested"
    Q_mindiff, Q_maxdiff = (0,0), (0,0)

    if left:
        Q_min = (sdata[1] - sdata[0])
        # a zero range means all points are equal: keep the raw gap (0) instead of dividing
        if data_range != 0:
            Q_min /= data_range
        Q_mindiff = (Q_min - q_crit, sdata[0])

    if right:
        Q_max = (sdata[-1] - sdata[-2])
        if data_range != 0:
            Q_max /= data_range
        Q_maxdiff = (Q_max - q_crit, sdata[-1])

    if not Q_mindiff[0] > 0 and not Q_maxdiff[0] > 0:
        outliers = [None, None]

    elif Q_mindiff[0] == Q_maxdiff[0]:
        outliers = [Q_mindiff[1], Q_maxdiff[1]]

    elif Q_mindiff[0] > Q_maxdiff[0]:
        outliers = [Q_mindiff[1], None]

    else:
        outliers = [None, Q_maxdiff[1]]

    return outliers


dixon_test([0.142, 0.153, 0.135, 0.002, 0.175], left=False)

[None, None]

# Fit metric

In [85]:
def get_fit_param_cat(row):
    """Return the fit-parameter group label for one row.

    The label is the peptide followed by the first three '_'-separated parts
    of the replicate name, all joined with '_'
    (e.g. 'ADGANALGGK_D0_2uL_48').
    """
    replicate_head = row['replicate'].split('_')[:3]
    return '_'.join([row['peptide'], *replicate_head])

def qtest(data, right=True):
    """Return the gap-to-range ratio of the largest or smallest value in ``data``.

    Parameters
    ----------
    data : sequence of numbers
        At least two values, in any order.
    right : bool, default True
        If True the gap between the two largest values is used, otherwise the
        gap between the two smallest values.

    Returns
    -------
    number
        ``gap / (max - min)``. When all values are equal the range is zero and
        the gap itself (0) is returned.
    """
    sorted_data = sorted(data)

    if right:
        gap = sorted_data[-1] - sorted_data[-2]
    else:
        gap = sorted_data[1] - sorted_data[0]

    value_range = sorted_data[-1] - sorted_data[0]
    # Explicit check instead of catching ZeroDivisionError: numpy floats do not
    # raise on division by zero, they silently produce NaN.
    if value_range == 0:
        return gap

    return gap / value_range

def get_grp_fit_agg(grouped_df:pd.DataFrame):
    """Summarise the fit parameters of one group.

    For each of the columns 'gradient', 'intercept' and 'R2' the mean, the
    standard deviation (numpy's ``std``), their quotient std / mean and the
    right-sided ``qtest`` value are computed.

    Returns a pd.Series with the entries 'mean_<x>', 'stdv_<x>', 'cov_<x>' and
    'qtest_<x>' for <x> in 'grad', 'intercept' and 'r2' (in that order).
    """
    summary = {}

    for column, label in (('gradient', 'grad'), ('intercept', 'intercept'), ('R2', 'r2')):
        values = grouped_df[column].values

        average = values.mean()
        spread = values.std()

        summary['mean_' + label] = average
        summary['stdv_' + label] = spread
        summary['cov_' + label] = spread / average
        summary['qtest_' + label] = qtest(values, right=True)

    return pd.Series(summary)


# Create the fit-parameter group label (peptide + replicate name parts) on a copy.
temp = df_area_ratio_conc_n_fit_params.copy()
temp['fit_param_grp'] = temp.apply(get_fit_param_cat, axis=1)

# Note: this rebinds cols_to_group_by, which an earlier cell used for the transition columns.
cols_to_group_by = ['fit_param_grp']
# One row per group with the mean / stdv / cov / qtest of gradient, intercept and R2.
df_fit_param_agg = temp.groupby(cols_to_group_by).apply(get_grp_fit_agg).reset_index()

# Attach the group aggregates to every individual row.
df_w_fit_param_agg = pd.merge(left=temp, right=df_fit_param_agg, how='left', on=cols_to_group_by)
df_w_fit_param_agg.head(3)

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,...,cov_grad,qtest_grad,mean_intercept,stdv_intercept,cov_intercept,qtest_intercept,mean_r2,stdv_r2,cov_r2,qtest_r2
0,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b2,0.088383,98062.0,8667.0,ADGANALGGK_D0,...,1.106207,0.15547,-87446.996173,98604.957879,-1.127597,2.5e-05,0.973599,0.02397,0.02462,0.015205
1,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,b3,33248.0,1.0,33248.0,ADGANALGGK_D0,...,1.106207,0.15547,-87446.996173,98604.957879,-1.127597,2.5e-05,0.973599,0.02397,0.02462,0.015205
2,ADGANALGGK,sp|009|JPT4,D0_2uL_48,2,1,y3,0.0,2121.0,0.0,ADGANALGGK_D0,...,1.106207,0.15547,-87446.996173,98604.957879,-1.127597,2.5e-05,0.973599,0.02397,0.02462,0.015205


In [86]:
# The rows whose area ratio is NaN (e.g. an infinite ratio from a zero-area denominator that was replaced by NaN)
df_w_fit_param_agg[df_w_fit_param_agg.area_ratio.isna()]

Unnamed: 0,peptide,protein,replicate,precursor charge,product charge,fragment ion,area_ratio,area_min,area_max,peptide_dilution_name,...,cov_grad,qtest_grad,mean_intercept,stdv_intercept,cov_intercept,qtest_intercept,mean_r2,stdv_r2,cov_r2,qtest_r2


In [87]:
# Folder for the exported results (relative to the notebook).
result_data_folder = pathlib.Path("../../reports/ms_work/soroush_3/")

# Write the per-row data together with the group aggregates, without the index column.
df_w_fit_param_agg.to_csv(result_data_folder.joinpath('data_with_fit_param_aggregate_soroush_3_1_over_x.csv'), index=False)