In [1]:
import pandas as pd
import re
import os
from functools import reduce
import numpy as np

In [2]:
data_dir = '../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/'
pep_files = [f for f in os.listdir(data_dir) if re.match('.*prot_list$', f)]

In [4]:
def fmt_data(file):
    # read in .prot_count file
    print(f'Processing {file} ...')
    df = pd.read_csv(file, sep='\t', header=None)

    # extract & format peptides 
    pep_list = []
    for i in df.iloc[:, 0]:
        pep = str.split(i, '.')[-1]
        pep_list.append(pep)

    # extract & format protein IDs
    id_list = []
    for i in df.iloc[:, 1]:
        ids = str.split(i, '(')[0]
        id_list.append(ids)
        
    # make new df
    df_fmt = pd.DataFrame()
    df_fmt['peptide'] = pep_list
    df_fmt['protein'] = id_list
    
    return(df_fmt)

def unique_peps(df):
    
    # get protein IDs with unique peptides
    df['num_matches'] = df['protein'].apply(lambda x: len(str.split(x, ',')))
    df = df[df['num_matches'] == 1]
    df_uniq = df.drop(['num_matches'], axis=1)
    
    return(df_uniq)

def count_peps(df, frac):
    # get peptide counts
    counts = df.groupby(['peptide']).size().sort_values(ascending=False)
    count_dict = dict()
    for i in counts.items():
        pep = i[0]
        count = i[1]
        count_dict[pep] = count
        
    # join count info back onto df
    count_col = 'frac_count'+str(frac)
    df[count_col] = [count_dict[i] for i in df['peptide']]
    df_counts = df.drop_duplicates()
    
    return(df_counts)

In [5]:
def process_fracs(data_dir, file_list)
    
    df_list = []
    frac_count = 0
    for f in file_list:
        frac_count += 1
        df_fmt = fmt_data(data_dir+f)
        df_uniq = unique_peps(df_fmt)
        df_counts = count_peps(df_uniq, frac_count)
        df_list.append(df_counts)

    if len(df_list) > 1:
        df_joined = reduce(lambda x, y: pd.merge(x, y, on=['peptide', 'protein'], how='outer'), df_list)
        df_joined.fillna(0, inplace=True)
    else:
        df_joined = df_list[0]

    count_cols = []
    for c in df_joined.columns:
        if df_joined[c].dtype == float:
            df_joined[c] = df_joined[c].astype(int)
            count_cols.append(c)

    tcol = 'total_counts'
    df_joined[tcol] = df_joined[count_cols].sum(axis=1)
    
    return(df_joined)

Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_36a_02202017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_49a_02172017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_01a_02172017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_31a_02192017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_22a_02202017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_27a_02182017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_58a_02202017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/Athaliana_sproutsWWC_15a_02182017.prot_list ...
Processing ../ppi_ml/data/cfms/msblender/leca_le

In [7]:
def clean_peps(df)
    # get experiment-wide unique peptide assignments
    counts = df.groupby(['peptide']).size().sort_values(ascending=False)
    bad_peps = []
    for p in counts.items():
        if p[1] > 1:
            bad_peps.append(p[0])
    df_out = df[df['peptide'].apply(lambda x: x not in bad_peps)]
    
    # check for errors
    bad_pep_sum = 0
    for c in counts.items():
        if c[1] > 1:
            bad_pep_sum += c[1]

    actual_sum = len(df_joined) - len(df_out)

    if actual_sum != bad_pep_sum:
        print("Something went wrong ...")
    
def write_result(df, outfile)
    df_write = df[['peptide', 'protein', 'total_counts']]
    df_write.to_csv(outfile, index=False)

In [8]:
counts

peptide
SVQVMKTEGSTTVSLPHSAMSPVQDEER               3
TEGSTTVSLPHSAMSPVQDEERDSGK                 3
IREWCEQQVPYMCPDYQSYFR                      3
MIGIWGPAGIGK                               3
IIVRVNRPFLIAVVLKDTQSIIFLGK                 3
                                          ..
IAREILKQQDALFASR                           1
IAREIGDAVIK                                1
IAREIFKQQDALFASRPLTYAQK                    1
IAREIEAETTRDIHVAEERGLQLNENFDFDEEARYSSVR    1
YYYYQYLSSTSEAAEEKIAMLQENESLKK              1
Length: 1427513, dtype: int64

In [9]:
# get experiment-wide unique peptide assignments
bad_peps = []
for p in counts.items():
    if p[1] > 1:
        bad_peps.append(p[0])

In [15]:
len(bad_peps)

2135

In [13]:
df_out = df_joined[df_joined['peptide'].apply(lambda x: x not in bad_peps)]

In [22]:
df_write = df_out[['peptide', 'protein', 'total_counts']]

I put the above code into an argparse script (`get_pep_assignments.py`); then ran the code below on the results to get aggregate totals for each species/experiment.

In [33]:
fs = '../ppi_ml/data/meta/euk_codes_ordered.txt'
species = [line.strip() for line in open(fs, 'r')]
data_dir = '../ppi_ml/data/cfms/pep_assign/'
outdir = '../ppi_ml/data/cfms/pep_assign_totals/'

In [36]:
for s in species:
    pep_assign_files = [f for f in os.listdir(data_dir) if re.match(s, f)]
    print(pep_assign_files)
    df_list = []
    for f in pep_assign_files:
        df = pd.read_csv(data_dir+f, sep=',')
        ncol = f.replace('.pep_assign', '')
        df.columns.values[2] = ncol+'_counts'
        df_list.append(df)
    
    if len(df_list) > 1:
        final_df = reduce(lambda x, y: pd.merge(x, y, on=['peptide', 'protein'], how='outer'), df_list)
        final_df.fillna(0, inplace=True)
    else:
        final_df = df
    
    count_cols = []
    for c in final_df.columns:
        if final_df[c].dtype == float:
            final_df[c] = final_df[c].astype(int)
            count_cols.append(c)
    
    counts = final_df.groupby(['peptide']).size().sort_values(ascending=False)
    bad_peps = []
    for p in counts.items():
        if p[1] > 1:
            bad_peps.append(p[0])
    print(f'{s}: {len(bad_peps)} non-unique peps across experiments; these will be removed.')
    df_out = final_df[final_df['peptide'].apply(lambda x: x not in bad_peps)]
    
    tcol = s+'_total'
    df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)
    
    outfile = outdir+s+'_pep_assign_totals.csv'
    df_out.to_csv(outfile, index=False)

['brart_sec_1.pep_assign']
brart: 0 non-unique peps across experiments; these will be removed.
['caeel_iex_1.pep_assign', 'caeel_beads_iex_7.pep_assign', 'caeel_beads_iex_9.pep_assign', 'caeel_beads_iex_2.pep_assign', 'caeel_iex_4.pep_assign', 'caeel_iex_3.pep_assign', 'caeel_beads_iex_5.pep_assign', 'caeel_iex_2.pep_assign', 'caeel_beads_iex_4.pep_assign', 'caeel_beads_iex_8.pep_assign', 'caeel_beads_iex_3.pep_assign', 'caeel_beads_iex_6.pep_assign', 'caeel_beads_iex_1.pep_assign']
caeel: 5533 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['dicdi_iex_1.pep_assign', 'dicdi_iex_3.pep_assign', 'dicdi_iex_2.pep_assign']
dicdi: 922 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['drome_iex_4.pep_assign', 'drome_iex_3.pep_assign', 'drome_iex_1.pep_assign', 'drome_iex_2.pep_assign']
drome: 1417 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['human_iex_12.pep_assign', 'human_iex_9.pep_assign', 'human_iex_2.pep_assign', 'human_iex_19.pep_assign', 'human_iex_23.pep_assign', 'human_ief_1.pep_assign', 'human_iex_5.pep_assign', 'human_iex_24.pep_assign', 'human_iex_15.pep_assign', 'human_iex_10.pep_assign', 'human_iex_21.pep_assign', 'human_ief_4.pep_assign', 'human_iex_7.pep_assign', 'human_ief_3.pep_assign', 'human_sucrose_1.pep_assign', 'human_ief_2.pep_assign', 'human_iex_6.pep_assign', 'human_iex_16.pep_assign', 'human_iex_11.pep_assign', 'human_sec_1.pep_assign', 'human_ief_5.pep_assign', 'human_iex_20.pep_assign', 'human_iex_1.pep_assign', 'human_iex_4.pep_assign', 'human_iex_14.pep_assign', 'human_sucrose_2.pep_assign', 'human_iex_13.pep_assign', 'human_iex_8.pep_assign', 'human_iex_22.pep_assign', 'human_iex_3.pep_assign', 'human_iex_18.pep_assign']
human: 9298 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['mouse_iex_2.pep_assign', 'mouse_iex_3.pep_assign', 'mouse_sec_1.pep_assign', 'mouse_iex_1.pep_assign']
mouse: 1225 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['nemve_iex_5.pep_assign', 'nemve_iex_2.pep_assign', 'nemve_iex_1.pep_assign', 'nemve_iex_6.pep_assign', 'nemve_iex_3.pep_assign', 'nemve_iex_4.pep_assign']
nemve: 2458 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['pig_iex_1.pep_assign']
pig: 0 non-unique peps across experiments; these will be removed.
['strpu_iex_1.pep_assign', 'strpu_iex_6.pep_assign', 'strpu_iex_3.pep_assign', 'strpu_iex_8.pep_assign', 'strpu_wwc_1.pep_assign', 'strpu_iex_4.pep_assign', 'strpu_iex_5.pep_assign', 'strpu_iex_2.pep_assign', 'strpu_iex_9.pep_assign', 'strpu_iex_7.pep_assign', 'strpu_iex_10.pep_assign']
strpu: 3591 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['xenla_sucrose_4.pep_assign', 'xenla_sec_1.pep_assign', 'xenla_iex_1.pep_assign', 'xenla_sucrose_3.pep_assign', 'xenla_sec_3.pep_assign', 'xenla_sucrose_1.pep_assign', 'xenla_sec_2.pep_assign', 'xenla_sucrose_2.pep_assign']
xenla: 3740 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['yeast_iex_1.pep_assign']
yeast: 0 non-unique peps across experiments; these will be removed.
['euggr_sec_1.pep_assign']
euggr: 0 non-unique peps across experiments; these will be removed.
['tryb2_sec_1.pep_assign', 'tryb2_sax_1.pep_assign', 'tryb2_sec_2.pep_assign']
tryb2: 1591 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['phatc_iex_1.pep_assign', 'phatc_sec_1.pep_assign']
phatc: 797 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['plaba_bng_2.pep_assign', 'plaba_bng_1.pep_assign', 'plaba_bng_3.pep_assign']
plaba: 1177 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['plaf7_bng_9.pep_assign', 'plaf7_bng_5.pep_assign', 'plaf7_iex_2.pep_assign', 'plaf7_bng_7.pep_assign', 'plaf7_bng_6.pep_assign', 'plaf7_iex_1.pep_assign', 'plaf7_bng_4.pep_assign', 'plaf7_bng_8.pep_assign', 'plaf7_wwc_1.pep_assign']
plaf7: 6119 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['plakh_bng_13.pep_assign', 'plakh_bng_14.pep_assign', 'plakh_bng_11.pep_assign', 'plakh_bng_10.pep_assign', 'plakh_bng_15.pep_assign', 'plakh_bng_12.pep_assign']
plakh: 2745 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['tetts_iex_1.pep_assign', 'tetts_sec_1.pep_assign', 'tetts_sec_3.pep_assign', 'tetts_iex_xlink_1.pep_assign', 'tetts_iex_2.pep_assign', 'tetts_sec_2.pep_assign']
tetts: 4980 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['arath_iex_5.pep_assign', 'arath_iex_2.pep_assign', 'arath_iex_1.pep_assign', 'arath_sec_1.pep_assign', 'arath_iex_3.pep_assign', 'arath_wwc_1.pep_assign', 'arath_iex_4.pep_assign']
arath: 4484 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['braol_iex_1.pep_assign', 'braol_sec_1.pep_assign', 'braol_wwc_1.pep_assign', 'braol_ief_1.pep_assign']
braol: 2666 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['cansa_iex_1.pep_assign']
cansa: 0 non-unique peps across experiments; these will be removed.
['cerri_wwc_1.pep_assign', 'cerri_sec_1.pep_assign']
cerri: 15772 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['cheqi_iex_1.pep_assign']
cheqi: 0 non-unique peps across experiments; these will be removed.
['chlre_sec_2.pep_assign', 'chlre_sec_xlink_2.pep_assign', 'chlre_sec_xlink_1.pep_assign', 'chlre_sec_1.pep_assign', 'chlre_wwc_1.pep_assign']
chlre: 2920 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['cocnu_sec_1.pep_assign']
cocnu: 0 non-unique peps across experiments; these will be removed.
['maize_sec_1.pep_assign']
maize: 0 non-unique peps across experiments; these will be removed.
['orysj_sec_1.pep_assign', 'orysj_iex_1.pep_assign', 'orysj_iex_2.pep_assign']
orysj: 1024 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['selml_wwc_1.pep_assign', 'selml_sec_1.pep_assign']
selml: 817 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['sollc_iex_1.pep_assign']
sollc: 0 non-unique peps across experiments; these will be removed.
['soybn_sec_2.pep_assign', 'soybn_sec_xlink_1.pep_assign', 'soybn_sec_1.pep_assign']
soybn: 1568 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


['wheat_sec_3.pep_assign', 'wheat_iex_1.pep_assign', 'wheat_sec_2.pep_assign', 'wheat_ief_1.pep_assign']
wheat: 1068 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)
