In [1]:
import pandas as pd
from Bio import SeqIO

In [2]:
data_dir = '../ppi_ml/data/'
pep_file = data_dir+'cfms/pep_assign_totals/dicdi_pep_assign_totals.csv'
fasta_file = data_dir+'proteomes/dicdi.fasta'
mapping_file = data_dir+'og_proteomes/nog_mapping/dicdi.euNOG.diamond.mapping.2759'
outfile = data_dir+'cfms/pep_assign_posthoc/dicdi_ogs.back_assign_peps

In [19]:
# read in peptide assignments
pep_df = pd.read_csv(pep_file)
count_col = pep_df.columns[-1]
pep_df.head()

Unnamed: 0,peptide,protein,dicdi_iex_1_counts,dicdi_iex_3_counts,dicdi_iex_2_counts,dicdi_total
0,KLSTGEELYR,ENOG502RFSM,3,5,1,9
1,KPLKNYDYR,KOG3904,2,4,4,10
2,KKCYHGLLR,ENOG502RDNB,22,2,13,37
3,VVRSETVLDR,KOG1042,36,1,29,66
4,KNMPYWIVK,KOG1542,3,0,1,4


In [4]:
# read in eggnog mapping file
map_df = pd.read_csv(mapping_file, sep='\t')
map_df.head()

Unnamed: 0,ProteinID,ID
0,tr|A0A2C9SN78|A0A2C9SN78_DICDI,KOG0293
1,sp|A1XDC0|TFP11_DICDI,KOG2184
2,tr|B0G0Y4|B0G0Y4_DICDI,KOG0351
3,sp|B0G0Y5|MYBAA_DICDI,KOG0048
4,tr|B0G0Y7|B0G0Y7_DICDI,ENOG502RIEY


In [98]:
og_lookup = [o for o in pep_df['protein'].tolist() if o.startswith(('KOG', 'ENOG'))]
family = map_df[map_df['ID'] == og]['ProteinID'].tolist()
pep_lookup = pep_df[pep_df['protein'] == og]['peptide'].tolist()

In [102]:
df_list = []
for og in set(og_lookup[0:3]):
    family = map_df[map_df['ID'] == og]['ProteinID'].tolist()
    pep_lookup = pep_df[pep_df['protein'] == og]['peptide'].tolist()
    lookup_res = []
    for record in SeqIO.parse(open(fasta_file ,"r"), "fasta"):
        prot_id = record.id
        seq = str(record.seq.upper())
        if prot_id in family:
            for pep in pep_lookup:
                if pep in seq:
                    count = pep_df.loc[pep_df['peptide'] == pep][count_col].tolist()
                    lookup_res.append([test_og, pep, prot_id, count[0]])
    df = pd.DataFrame(lookup_res, columns=['orthogroup', 'peptide', 'protein_match', 'count'])
    print(og, ': ', len(df))
    df_list.append(df)
final_df = pd.concat(df_list, ignore_index=True, sort=False)
final_df

KOG3904 :  62
ENOG502RDNB :  538
ENOG502RFSM :  198


Unnamed: 0,orthogroup,peptide,protein_match,count
0,ENOG502RFSM,KPLKNYDYR,tr|Q54GC2|Q54GC2_DICDI,10
1,ENOG502RFSM,NRIITRSR,tr|Q54GC2|Q54GC2_DICDI,9
2,ENOG502RFSM,WGVDKLPK,tr|Q54GC2|Q54GC2_DICDI,7
3,ENOG502RFSM,LSSTVIVLLK,tr|Q54GC2|Q54GC2_DICDI,16
4,ENOG502RFSM,WGVDKLPKSINK,tr|Q54GC2|Q54GC2_DICDI,4
...,...,...,...,...
793,ENOG502RFSM,LEESILYCEKCDFK,tr|Q54CJ2|Q54CJ2_DICDI,1
794,ENOG502RFSM,SANIDPSR,tr|Q54CJ2|Q54CJ2_DICDI,1
795,ENOG502RFSM,SAISSSSTTKVLINNGK,tr|Q54CJ2|Q54CJ2_DICDI,1
796,ENOG502RFSM,WKSANIDPSRK,tr|Q54CJ2|Q54CJ2_DICDI,1


In [86]:
df

Unnamed: 0,orthogroup,peptide,protein_match,count
0,ENOG502RFSM,NPSTANLLK,tr|Q54CJ1|Q54CJ1_DICDI,5
1,ENOG502RFSM,QNEEIKNKTTVDK,tr|Q54CJ1|Q54CJ1_DICDI,2
2,ENOG502RFSM,QQVLQQQQQR,tr|Q54CJ1|Q54CJ1_DICDI,9
3,ENOG502RFSM,SDMSSSESDNDQLK,tr|Q54CJ1|Q54CJ1_DICDI,23
4,ENOG502RFSM,LPRDERLEHSYRCRIK,tr|Q54CJ1|Q54CJ1_DICDI,1
...,...,...,...,...
193,ENOG502RFSM,LEESILYCEKCDFK,tr|Q54CJ2|Q54CJ2_DICDI,1
194,ENOG502RFSM,SANIDPSR,tr|Q54CJ2|Q54CJ2_DICDI,1
195,ENOG502RFSM,SAISSSSTTKVLINNGK,tr|Q54CJ2|Q54CJ2_DICDI,1
196,ENOG502RFSM,WKSANIDPSRK,tr|Q54CJ2|Q54CJ2_DICDI,1


In [87]:
pep_df[pep_df['peptide'] == 'QQVLQQQQQR']

Unnamed: 0,peptide,protein,dicdi_iex_1_counts,dicdi_iex_3_counts,dicdi_iex_2_counts,dicdi_total
50318,QQVLQQQQQR,ENOG502RFSM,2,6,1,9


In [89]:
!grep 'ENOG502RFSM' ../ppi_ml/data/og_proteomes/nog_mapping/dicdi.euNOG.diamond.mapping.2759

tr|Q54CJ1|Q54CJ1_DICDI	ENOG502RFSM
tr|Q54CJ2|Q54CJ2_DICDI	ENOG502RFSM
