In [35]:
from Bio import SeqIO
import pandas as pd
from pathlib import Path

from ppi_utils.general import to_fasta

In [36]:
# Data root: two directory levels above the resolved working directory,
# then ppi/ppi_data/v2.
ppi_dir = Path('.').resolve().parents[1].joinpath('ppi', 'ppi_data', 'v2')
# Show the resolved path together with whether it exists as a directory.
(ppi_dir, ppi_dir.is_dir())

(PosixPath('/home/quirin/PYTHON/ppi/ppi_data/v2'), True)

In [37]:
# Read the tab-separated pair table `huri_test.tsv` from the `1:1_small` folder.
pairs = pd.read_csv(ppi_dir.joinpath('1:1_small', 'huri_test.tsv'), sep='\t')
# Display the first rows as a sanity check.
pairs.head()

Unnamed: 0,hash_A,hash_B,species,label,cclass
0,CRC-001576EA78C61C85,CRC-045C9C9CAE0FD639,9606,1,3
1,CRC-001576EA78C61C85,CRC-12E45FF6696080F8,9606,1,3
2,CRC-001576EA78C61C85,CRC-2743716544288776,9606,1,3
3,CRC-001576EA78C61C85,CRC-2EDBC7981868EB12,9606,1,3
4,CRC-001576EA78C61C85,CRC-40897D5AF9B186A9,9606,1,3


In [38]:
# Map every record ID in `huri_test.fasta` to its sequence as a plain string.
fasta = {
    record.id: str(record.seq)
    for record in SeqIO.parse(ppi_dir.joinpath('1:1_small', 'huri_test.fasta'), 'fasta')
}
# Peek at the first (id, sequence) entry.
next(iter(fasta.items()))

('CRC-001576EA78C61C85',
 'MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD')

In [45]:
def to_ppi_line(_ida, _idb, seqs=None) -> str:
    """Format one protein pair as a two-line FASTA record without line wrapping.

    The header line is ``>`` followed by both IDs joined with ``_``; the
    sequence line holds both sequences joined with ``:``.

    Args:
        _ida: ID of the first protein; must be a key of the sequence mapping.
        _idb: ID of the second protein; must be a key of the sequence mapping.
        seqs: Mapping from protein ID to sequence string. When omitted, the
            notebook-level ``fasta`` dict is used.

    Returns:
        The header line and the sequence line, each terminated by a newline.

    Raises:
        KeyError: If either ID is missing from the sequence mapping.
    """
    # Look up the notebook global lazily so an explicit mapping never needs it.
    if seqs is None:
        seqs = fasta
    # Use the function arguments; the previous body read the notebook globals
    # `ida`/`idb` and therefore ignored whatever the caller passed in.
    return f'>{_ida}_{_idb}\n{seqs[_ida]}:{seqs[_idb]}\n'

In [39]:
# Write one FASTA record per pair to `huri_1332.fasta`:
# header `<idA>_<idB>`, sequence `<seqA>:<seqB>`.
# `to_fasta` is a project helper; judging by the file preview it wraps the
# sequence at 60 characters per line -- TODO confirm against ppi_utils.
with (ppi_dir / '1:1_small' / 'huri_1332.fasta').open('w') as out_fasta:
    # Select the ID columns by name rather than by position, and iterate rows as
    # plain tuples instead of transposing the whole frame. This also avoids
    # rebinding `_`, which IPython uses for the last cell output.
    for ida, idb in pairs[['hash_A', 'hash_B']].itertuples(index=False, name=None):
        to_fasta(f'{ida}_{idb}', f'{fasta[ida]}:{fasta[idb]}', out_fasta)

In [41]:
# Shell preview of the first 40 lines of the FASTA file written above;
# IPython expands `$ppi_dir` from the Python variable before running the command.
!head -n 40 $ppi_dir/1:1_small/huri_1332.fasta

>CRC-001576EA78C61C85_CRC-045C9C9CAE0FD639
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLT
AHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHR
APHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPA
PMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD:MAPRSRRRR
HKKPPSSVAPIIMAPTTIVTPVPLTPSKPGPSIDTLGFFSLDDNVPGLSQLILQKLNMKS
YEEYKLVVDGGTPVSGFGFRCPQEMFQRMEDTFRFCAHCRALPSGLSDSKVLRHCKRCRN
VYYCGPECQKSDWPAHRRVCQELRLVAVDRLMEWLLVTGDFVLPSGPWPWPPEAVQDWDS
WFSMKGLHLDATLDAVLVSHAVTTLWASVGRPRPDPDVLQGSLKRLLTDVLSRPLTLGLG
LRALGIDVRRTGGSTVHVVGASHVETFLTRPGDYDELGYMFPGHLGLRVVMVGVDVATGF
SQSTSTSPLEPGTIQLSAHRGLYHDFWEEQVETGQTHHPDLVAAFHPGFHSSPDLMEAWL
PTLLLLRDYKIPTLITVYSHQELVSSLQILVELDTHITAFGSNPFMSLKPEQVYSSPNKQ
PVYCSAYYIMFLGSSCQLDNRQLEEKVDGGI
>CRC-001576EA78C61C85_CRC-12E45FF6696080F8
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLT
AHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHR
APHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPA
PMSPTRRRALD

In [46]:
# Write the same pairs to `huri_1332_nolinebreaks.fasta`, one header line and
# one unwrapped sequence line per pair (the format produced by `to_ppi_line`).
with (ppi_dir / '1:1_small' / 'huri_1332_nolinebreaks.fasta').open('w') as out_fasta:
    # Select the ID columns by name rather than by position, and iterate rows as
    # plain tuples instead of transposing the whole frame. This also avoids
    # rebinding `_`, which IPython uses for the last cell output.
    for ida, idb in pairs[['hash_A', 'hash_B']].itertuples(index=False, name=None):
        out_fasta.write(to_ppi_line(ida, idb))

In [47]:
# Shell preview of the first 4 lines (two records) of the unwrapped FASTA file;
# IPython expands `$ppi_dir` from the Python variable before running the command.
!head -n 4 $ppi_dir/1:1_small/huri_1332_nolinebreaks.fasta

>CRC-001576EA78C61C85_CRC-045C9C9CAE0FD639
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD:MAPRSRRRRHKKPPSSVAPIIMAPTTIVTPVPLTPSKPGPSIDTLGFFSLDDNVPGLSQLILQKLNMKSYEEYKLVVDGGTPVSGFGFRCPQEMFQRMEDTFRFCAHCRALPSGLSDSKVLRHCKRCRNVYYCGPECQKSDWPAHRRVCQELRLVAVDRLMEWLLVTGDFVLPSGPWPWPPEAVQDWDSWFSMKGLHLDATLDAVLVSHAVTTLWASVGRPRPDPDVLQGSLKRLLTDVLSRPLTLGLGLRALGIDVRRTGGSTVHVVGASHVETFLTRPGDYDELGYMFPGHLGLRVVMVGVDVATGFSQSTSTSPLEPGTIQLSAHRGLYHDFWEEQVETGQTHHPDLVAAFHPGFHSSPDLMEAWLPTLLLLRDYKIPTLITVYSHQELVSSLQILVELDTHITAFGSNPFMSLKPEQVYSSPNKQPVYCSAYYIMFLGSSCQLDNRQLEEKVDGGI
>CRC-001576EA78C61C85_CRC-12E45FF6696080F8
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDS