In [1]:
import pandas as pd
import numpy as np
import os
import re
from functools import reduce

In [2]:
# Directory holding one back-assigned peptide table per species, named
# '<species>_ogs.back_assign_peps'.
data_dir = '../ppi_ml/data/cfms/pep_assign_posthoc/'
# Keep only the peptide tables so stray directory entries (hidden files,
# checkpoint folders, ...) are never handed to the CSV parser.
# NOTE: os.listdir returns names in arbitrary order, so positions in this
# list are not guaranteed to be stable across machines or runs.
pep_files = [f for f in os.listdir(data_dir) if f.endswith('.back_assign_peps')]
uniq = True
pep_files

['yeast_ogs.back_assign_peps',
 'wheat_ogs.back_assign_peps',
 'maize_ogs.back_assign_peps',
 'caeel_ogs.back_assign_peps',
 'drome_ogs.back_assign_peps',
 'mouse_ogs.back_assign_peps',
 'selml_ogs.back_assign_peps',
 'tetts_ogs.back_assign_peps',
 'sollc_ogs.back_assign_peps',
 'cerri_ogs.back_assign_peps',
 'dicdi_ogs.back_assign_peps',
 'cheqi_ogs.back_assign_peps',
 'xenla_ogs.back_assign_peps',
 'human_ogs.back_assign_peps',
 'euggr_ogs.back_assign_peps',
 'pig_ogs.back_assign_peps',
 'chlre_ogs.back_assign_peps',
 'cansa_ogs.back_assign_peps',
 'cocnu_ogs.back_assign_peps',
 'plakh_ogs.back_assign_peps',
 'arath_ogs.back_assign_peps',
 'plaf7_ogs.back_assign_peps',
 'tryb2_ogs.back_assign_peps',
 'plaba_ogs.back_assign_peps',
 'soybn_ogs.back_assign_peps',
 'nemve_ogs.back_assign_peps',
 'orysj_ogs.back_assign_peps',
 'braol_ogs.back_assign_peps',
 'phatc_ogs.back_assign_peps',
 'brart_ogs.back_assign_peps',
 'strpu_ogs.back_assign_peps']

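## TODO: fix fern (cerri) IDs — they do not split into accession / gene / species tokens, so `split_ids` currently keeps the whole raw ID as both `uniprot_id` and `gene_name`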

In [3]:
def get_uniq_peps(df):
    """Return the peptides that occur exactly once within an orthogroup.

    Rows of ``df`` are counted per (``orthogroup``, ``peptide``) pair, and the
    ``peptide`` value of every pair backed by a single row is collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``orthogroup`` and ``peptide`` columns.

    Returns
    -------
    list
        Peptide values in the sorted (orthogroup, peptide) order produced by
        the groupby. A peptide appears more than once if it is a singleton in
        more than one orthogroup.
    """
    pair_counts = df.groupby(['orthogroup', 'peptide']).size().reset_index(name='n')
    singletons = pair_counts.loc[pair_counts['n'] == 1, 'peptide']
    return singletons.tolist()

def label_peps(df, uniq_list):
    """Add a ``status`` column marking each row 'unique' or 'non-unique'.

    A row is 'unique' when its ``peptide`` value is contained in
    ``uniq_list``; every other row is 'non-unique'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``peptide`` column.
    uniq_list : list-like
        Peptide values to be labelled 'unique'.

    Returns
    -------
    pandas.DataFrame
        All 'unique' rows followed by all 'non-unique' rows (original relative
        order kept within each group), with a fresh 0..n-1 index.
    """
    is_uniq = df['peptide'].isin(uniq_list)
    labelled_parts = [
        df[is_uniq].assign(status='unique'),
        df[~is_uniq].assign(status='non-unique'),
    ]
    return pd.concat(labelled_parts).reset_index(drop=True)

def split_ids(x, sid):
    """Split a protein identifier into (accession, gene name, species).

    The identifier is tokenised on ``|`` and ``_``. Tokens 1, 2 and 3 are
    returned as the accession, the gene name and the lower-cased species,
    e.g. ``'sp|P69852|HSK3_YEAST'`` -> ``('P69852', 'HSK3', 'yeast')``.

    The raw identifier is kept for both the accession and the gene name, and
    ``sid`` is used as the species, when either
      * the identifier yields fewer than four tokens, or
      * token 1 contains ``'Fern'`` (fern IDs do not carry those fields).

    Parameters
    ----------
    x : str
        Protein identifier to split.
    sid : str
        Species code used when it cannot be read from ``x``.

    Returns
    -------
    tuple
        ``(uniprot_id, gene_name, species)``.
    """
    fallback = (x, x, sid)
    # Raw string: '\|' inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+); the compiled pattern is unchanged.
    id_lst = re.split(r'\||_', x)
    if len(id_lst) < 4:
        return fallback
    up, gene, species = id_lst[1], id_lst[2], id_lst[3].lower()
    if 'Fern' in up:
        return fallback
    return (up, gene, species)

def summarize_peps(data_dir, file, sid, uniq=True):
    """Summarise peptide counts per protein for one peptide table.

    Reads ``data_dir/file`` as CSV, labels every row 'unique' or 'non-unique'
    (via ``get_uniq_peps`` / ``label_peps``), sums ``count`` per
    (``orthogroup``, ``protein_match``, ``status``) and replaces
    ``protein_match`` with the three fields returned by ``split_ids``.

    Parameters
    ----------
    data_dir : str
        Directory containing ``file``; a trailing '/' is added if missing.
    file : str
        File name of the peptide table. The code requires the columns
        ``orthogroup``, ``peptide``, ``protein_match`` and ``count``.
    sid : str
        Species code passed to ``split_ids`` as the fallback species.
    uniq : bool, default True
        Accepted for interface compatibility; NOTE(review): not used by the
        body -- both unique and non-unique rows are always returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``orthogroup``, ``status``, ``count``, ``uniprot_id``,
        ``gene_name``, ``species``; sorted by orthogroup then count, both
        descending, with a fresh 0..n-1 index.
    """
    if not data_dir.endswith('/'):
        data_dir = data_dir + '/'
    df = pd.read_csv(data_dir + file)
    df_lab = label_peps(df, get_uniq_peps(df))
    df_out = (
        df_lab
        .groupby(['orthogroup', 'protein_match', 'status'])['count']
        .sum()
        .reset_index()
        .sort_values(['orthogroup', 'count'], ascending=[False, False])
        .reset_index(drop=True)
    )
    # Build the ID columns as an explicitly shaped frame. Assigning a bare
    # list of tuples to three new columns raises "Columns must be same length
    # as key" when the summary is empty (e.g. a header-only input file).
    id_parts = pd.DataFrame(
        [split_ids(i, sid) for i in df_out['protein_match']],
        columns=['uniprot_id', 'gene_name', 'species'],
        index=df_out.index,
    )
    df_out = pd.concat([df_out.drop('protein_match', axis=1), id_parts], axis=1)
    return df_out

In [4]:
# Summarise every peptide table. The species code is the part of the file
# name before the first underscore (e.g. 'yeast' for 'yeast_ogs.back_assign_peps').
df_lst = []
for f in pep_files:
    sid = f.partition('_')[0]
    print(f'Parsing {f} [species="{sid}"] ...')
    df = summarize_peps(data_dir, f, sid, uniq=True)
    df_lst.append(df)
# Stack the per-species tables; each one keeps its own 0-based row labels.
final_df = pd.concat(df_lst)

Parsing yeast_ogs.back_assign_peps [species="yeast"] ...
Parsing wheat_ogs.back_assign_peps [species="wheat"] ...
Parsing maize_ogs.back_assign_peps [species="maize"] ...
Parsing caeel_ogs.back_assign_peps [species="caeel"] ...
Parsing drome_ogs.back_assign_peps [species="drome"] ...
Parsing mouse_ogs.back_assign_peps [species="mouse"] ...
Parsing selml_ogs.back_assign_peps [species="selml"] ...
Parsing tetts_ogs.back_assign_peps [species="tetts"] ...
Parsing sollc_ogs.back_assign_peps [species="sollc"] ...
Parsing cerri_ogs.back_assign_peps [species="cerri"] ...
Parsing dicdi_ogs.back_assign_peps [species="dicdi"] ...
Parsing cheqi_ogs.back_assign_peps [species="cheqi"] ...
Parsing xenla_ogs.back_assign_peps [species="xenla"] ...
Parsing human_ogs.back_assign_peps [species="human"] ...
Parsing euggr_ogs.back_assign_peps [species="euggr"] ...
Parsing pig_ogs.back_assign_peps [species="pig"] ...
Parsing chlre_ogs.back_assign_peps [species="chlre"] ...
Parsing cansa_ogs.back_assign_peps 

In [5]:
# Persist the combined per-species summary (row labels omitted), then show it.
summary_csv = '../ppi_ml/data/cfms/pep_assign_posthoc_summarized.csv'
final_df.to_csv(summary_csv, index=False)
final_df

Unnamed: 0,orthogroup,status,count,uniprot_id,gene_name,species
0,KOG4853,unique,14,P69852,HSK3,yeast
1,KOG4853,unique,14,Q8TGM6,TAR1,yeast
2,KOG4852,unique,35,Q07532,IWR1,yeast
3,KOG4851,unique,112,Q12247,FYV7,yeast
4,KOG4844,unique,53,P53305,RT27,yeast
...,...,...,...,...,...,...
34704,ENOG502QPJD,unique,793,W4Y9C4,W4Y9C4,strpu
34705,ENOG502QPJD,unique,165,W4XEJ2,W4XEJ2,strpu
34706,ENOG502QPJA,unique,179,W4YYE0,W4YYE0,strpu
34707,ENOG502QPJ1,unique,3838,W4XZA9,W4XZA9,strpu


In [6]:
# Inspect the fern (cerri) summary. Look the table up by file name instead of
# a fixed position: pep_files comes from os.listdir, whose order is arbitrary,
# and df_lst is built in that same order.
df_lst[pep_files.index('cerri_ogs.back_assign_peps')]

Unnamed: 0,orthogroup,status,count,uniprot_id,gene_name,species
0,KOG4853,unique,45,p.McWhite201602_FernFrond.K55ms_294_16/16_0.21...,p.McWhite201602_FernFrond.K55ms_294_16/16_0.21...,cerri
1,KOG4845,non-unique,57,p.McWhite201602_FernGamet.K35ms_11860_3/5_0.30...,p.McWhite201602_FernGamet.K35ms_11860_3/5_0.30...,cerri
2,KOG4845,non-unique,55,p.McWhite201602_FernFrond.K45ms_6157_1/5_0.273...,p.McWhite201602_FernFrond.K45ms_6157_1/5_0.273...,cerri
3,KOG4845,unique,37,p.McWhite201602_FernGamet.K55ms_19350_1/1_1.00...,p.McWhite201602_FernGamet.K55ms_19350_1/1_1.00...,cerri
4,KOG4845,unique,3,p.McWhite201602_FernFrond.K45ms_6157_1/5_0.273...,p.McWhite201602_FernFrond.K45ms_6157_1/5_0.273...,cerri
...,...,...,...,...,...,...
126511,ENOG502QPHU,non-unique,48,p.McWhite201602_FernSpore.K45p_12868_1/1_1.000...,p.McWhite201602_FernSpore.K45p_12868_1/1_1.000...,cerri
126512,ENOG502QPHU,non-unique,28,p.McWhite201602_FernFrond.K35ms_5_1059/17452_1...,p.McWhite201602_FernFrond.K35ms_5_1059/17452_1...,cerri
126513,ENOG502QPHU,unique,21,p.McWhite201602_FernFrond.K55ms_7819_1/1_1.000...,p.McWhite201602_FernFrond.K55ms_7819_1/1_1.000...,cerri
126514,ENOG502QPHU,unique,5,p.McWhite201602_FernFrond.K45p_8836_1/1_1.000_...,p.McWhite201602_FernFrond.K45p_8836_1/1_1.000_...,cerri
