In [1]:
import pandas as pd
import re
import os
from functools import reduce
import numpy as np

In [2]:
data_dir = '../ppi_ml/data/cfms/msblender/leca_level/arath/wwc_1/output/'
pep_files = [f for f in os.listdir(data_dir) if re.match('.*prot_list$', f)]

In [3]:
def fmt_data(file):
    # read in .prot_count file
    print(f'Processing {file} ...')
    df = pd.read_csv(file, sep='\t', header=None)

    # extract & format peptides 
    pep_list = []
    for i in df.iloc[:, 0]:
        pep = str.split(i, '.')[-1]
        pep_list.append(pep)

    # extract & format protein IDs
    id_list = []
    for i in df.iloc[:, 1]:
        ids = str.split(i, '(')[0]
        id_list.append(ids)
        
    # make new df
    df_fmt = pd.DataFrame()
    df_fmt['peptide'] = pep_list
    df_fmt['protein'] = id_list
    
    return(df_fmt)

def unique_peps(df):
    
    # get protein IDs with unique peptides
    df['num_matches'] = df['protein'].apply(lambda x: len(str.split(x, ',')))
    df = df[df['num_matches'] == 1]
    df_uniq = df.drop(['num_matches'], axis=1)
    
    return(df_uniq)

def count_peps(df, frac):
    # get peptide counts
    counts = df.groupby(['peptide']).size().sort_values(ascending=False)
    count_dict = dict()
    for i in counts.items():
        pep = i[0]
        count = i[1]
        count_dict[pep] = count
        
    # join count info back onto df
    count_col = 'frac_count'+str(frac)
    df[count_col] = [count_dict[i] for i in df['peptide']]
    df_counts = df.drop_duplicates()
    
    return(df_counts)

In [4]:
def process_fracs(data_dir, file_list):
    
    df_list = []
    frac_count = 0
    for f in file_list:
        frac_count += 1
        df_fmt = fmt_data(data_dir+f)
        df_uniq = unique_peps(df_fmt)
        df_counts = count_peps(df_uniq, frac_count)
        df_list.append(df_counts)

    if len(df_list) > 1:
        df_joined = reduce(lambda x, y: pd.merge(x, y, on=['peptide', 'protein'], how='outer'), df_list)
        df_joined.fillna(0, inplace=True)
    else:
        df_joined = df_list[0]

    count_cols = []
    for c in df_joined.columns:
        if df_joined[c].dtype == float:
            df_joined[c] = df_joined[c].astype(int)
            count_cols.append(c)

    tcol = 'total_counts'
    df_joined[tcol] = df_joined[count_cols].sum(axis=1)
    
    return(df_joined)

In [5]:
def clean_peps(df):
    # get experiment-wide unique peptide assignments
    counts = df.groupby(['peptide']).size().sort_values(ascending=False)
    bad_peps = []
    for p in counts.items():
        if p[1] > 1:
            bad_peps.append(p[0])
    df_out = df[df['peptide'].apply(lambda x: x not in bad_peps)]
    
    # check for errors
    bad_pep_sum = 0
    for c in counts.items():
        if c[1] > 1:
            bad_pep_sum += c[1]

    actual_sum = len(df_joined) - len(df_out)

    if actual_sum != bad_pep_sum:
        print("Something went wrong ...")
    
def write_result(df, outfile):
    df_write = df[['peptide', 'protein', 'total_counts']]
    df_write.to_csv(outfile, index=False)

In [None]:
# get experiment-wide unique peptide assignments
bad_peps = []
for p in counts.items():
    if p[1] > 1:
        bad_peps.append(p[0])

In [None]:
len(bad_peps)

In [None]:
df_out = df_joined[df_joined['peptide'].apply(lambda x: x not in bad_peps)]

In [None]:
df_write = df_out[['peptide', 'protein', 'total_counts']]

I put the above code into an argparse script (`get_pep_assignments.py`); then ran the code below on the results to get aggregate totals for each species/experiment.

In [7]:
import pandas as pd
import re
import os
from functools import reduce
import numpy as np

In [8]:
fs = '../ppi_ml/data/meta/euk_codes_ordered.txt'
species = [line.strip() for line in open(fs, 'r')]
data_dir = '../ppi_ml/data/cfms/pep_assign_ogs/'
outdir = '../ppi_ml/data/cfms/pep_assign_totals/'

In [9]:
species

['brart',
 'caeel',
 'dicdi',
 'drome',
 'human',
 'mouse',
 'nemve',
 'pig',
 'strpu',
 'xenla',
 'yeast',
 'euggr',
 'tryb2',
 'phatc',
 'plaba',
 'plaf7',
 'plakh',
 'tetts',
 'arath',
 'braol',
 'cansa',
 'cerri',
 'cheqi',
 'chlre',
 'cocnu',
 'maize',
 'orysj',
 'selml',
 'sollc',
 'soybn',
 'wheat']

In [12]:
fix_list = []
for s in species:
    pep_assign_files = [f for f in os.listdir(data_dir) if re.match(s, f)]
    df_list = []
    for f in pep_assign_files:
        df = pd.read_csv(data_dir+f, sep=',')
        ncol = f.replace('.pep_assign', '')
        df.columns.values[2] = ncol+'_counts'
        df_list.append(df)
    
    if len(df_list) > 1:
        final_df = reduce(lambda x, y: pd.merge(x, y, on=['peptide', 'protein'], how='outer'), df_list)
        final_df.fillna(0, inplace=True)
    else:
        final_df = df
        fix_list.append(s)
    
    count_cols = []
    for c in final_df.columns:
        if final_df[c].dtype.kind in 'iufc':
            final_df[c] = final_df[c].astype(int)
            count_cols.append(c)
    
    counts = final_df.groupby(['peptide']).size().sort_values(ascending=False)
    bad_peps = []
    for p in counts.items():
        if p[1] > 1:
            bad_peps.append(p[0])
    print(f'{s}: {len(bad_peps)} non-unique peps across experiments; these will be removed.')
    df_out = final_df[final_df['peptide'].apply(lambda x: x not in bad_peps)]
    
    tcol = s+'_total'
    df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)
    
    # if len(count_cols) > 1:
    #     df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)
    # else:
    #     df_out[tcol] = df[count_cols[0]]
    
    outfile = outdir+s+'_pep_assign_totals.csv'
    df_out.to_csv(outfile, index=False)

brart: 0 non-unique peps across experiments; these will be removed.
caeel: 5533 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


dicdi: 922 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


drome: 1417 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


human: 9298 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


mouse: 1225 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


nemve: 2458 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


pig: 0 non-unique peps across experiments; these will be removed.
strpu: 3591 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


xenla: 3740 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


yeast: 0 non-unique peps across experiments; these will be removed.
euggr: 0 non-unique peps across experiments; these will be removed.
tryb2: 1591 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


phatc: 797 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


plaba: 1177 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


plaf7: 6119 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


plakh: 2745 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


tetts: 4980 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


arath: 4484 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


braol: 2666 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


cansa: 0 non-unique peps across experiments; these will be removed.
cerri: 15772 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


cheqi: 0 non-unique peps across experiments; these will be removed.
chlre: 2920 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


cocnu: 0 non-unique peps across experiments; these will be removed.
maize: 0 non-unique peps across experiments; these will be removed.
orysj: 1024 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


selml: 817 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


sollc: 0 non-unique peps across experiments; these will be removed.
soybn: 1568 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


wheat: 1068 non-unique peps across experiments; these will be removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out[tcol] = df_out[count_cols].sum(axis=1).astype(int)


In [13]:
fix_list

['brart', 'pig', 'yeast', 'euggr', 'cansa', 'cheqi', 'cocnu', 'maize', 'sollc']