In [23]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from pathlib import Path

import json

from ppi_utils.general import to_fasta, to_lines

In [2]:
ppi_dir = Path('.').resolve().parents[1] / 'ppi' / 'ppi_data' / 'v2.1'
ppi_dir, ppi_dir.is_dir()

(PosixPath('/home/quirin/PYTHON/ppi/ppi_data/v2.1'), True)

In [3]:
pairs = pd.read_csv(ppi_dir / '1:1_small' / 'huri_test.tsv', sep='\t')
pairs.head()

Unnamed: 0,hash_A,hash_B,species,label,cclass
0,CRC-001576EA78C61C85,CRC-045C9C9CAE0FD639,9606,1,3
1,CRC-001576EA78C61C85,CRC-12E45FF6696080F8,9606,1,3
2,CRC-001576EA78C61C85,CRC-2743716544288776,9606,1,3
3,CRC-001576EA78C61C85,CRC-2EDBC7981868EB12,9606,1,3
4,CRC-001576EA78C61C85,CRC-40897D5AF9B186A9,9606,1,3


In [12]:
fasta = {r.id: str(r.seq) for r in SeqIO.parse(
    ppi_dir / '1:1_small' / 'huri_test.fasta', 'fasta')}
next(iter(fasta.items()))

('CRC-001576EA78C61C85',
 'MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD')

## ColabFold concatenation

In [13]:
def to_ppi_line(_ida, _idb) -> str:
    return f'>{_ida}_{_idb}\n{fasta[_ida]}:{fasta[_idb]}\n'

In [6]:
with (ppi_dir / '1:1_small' / 'huri_1332.fasta').open('w') as out_fasta:
    for _, (ida, idb) in pairs.iloc[:, [0, 1]].T.items():
        to_fasta(f'{ida}_{idb}', f'{fasta[ida]}:{fasta[idb]}', out_fasta)

In [7]:
!head -n 40 "$ppi_dir/1:1_small/huri_1332.fasta"

>CRC-001576EA78C61C85_CRC-045C9C9CAE0FD639
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLT
AHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHR
APHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPA
PMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD:MAPRSRRRR
HKKPPSSVAPIIMAPTTIVTPVPLTPSKPGPSIDTLGFFSLDDNVPGLSQLILQKLNMKS
YEEYKLVVDGGTPVSGFGFRCPQEMFQRMEDTFRFCAHCRALPSGLSDSKVLRHCKRCRN
VYYCGPECQKSDWPAHRRVCQELRLVAVDRLMEWLLVTGDFVLPSGPWPWPPEAVQDWDS
WFSMKGLHLDATLDAVLVSHAVTTLWASVGRPRPDPDVLQGSLKRLLTDVLSRPLTLGLG
LRALGIDVRRTGGSTVHVVGASHVETFLTRPGDYDELGYMFPGHLGLRVVMVGVDVATGF
SQSTSTSPLEPGTIQLSAHRGLYHDFWEEQVETGQTHHPDLVAAFHPGFHSSPDLMEAWL
PTLLLLRDYKIPTLITVYSHQELVSSLQILVELDTHITAFGSNPFMSLKPEQVYSSPNKQ
PVYCSAYYIMFLGSSCQLDNRQLEEKVDGGI
>CRC-001576EA78C61C85_CRC-12E45FF6696080F8
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLT
AHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHR
APHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPA
PMSPTRRRALD

In [22]:
with (ppi_dir / '1:1_small' / 'huri_1332_nolinebreaks'
                              '.fasta').open('w') as out_fasta:
    for _, (ida, idb) in pairs.iloc[:, [0, 1]].T.items():
        out_fasta.write(to_ppi_line(ida, idb))

In [8]:
!head -n 4 "$ppi_dir/1:1_small/huri_1332_nolinebreaks.fasta"

>CRC-001576EA78C61C85_CRC-045C9C9CAE0FD639
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD:MAPRSRRRRHKKPPSSVAPIIMAPTTIVTPVPLTPSKPGPSIDTLGFFSLDDNVPGLSQLILQKLNMKSYEEYKLVVDGGTPVSGFGFRCPQEMFQRMEDTFRFCAHCRALPSGLSDSKVLRHCKRCRNVYYCGPECQKSDWPAHRRVCQELRLVAVDRLMEWLLVTGDFVLPSGPWPWPPEAVQDWDSWFSMKGLHLDATLDAVLVSHAVTTLWASVGRPRPDPDVLQGSLKRLLTDVLSRPLTLGLGLRALGIDVRRTGGSTVHVVGASHVETFLTRPGDYDELGYMFPGHLGLRVVMVGVDVATGFSQSTSTSPLEPGTIQLSAHRGLYHDFWEEQVETGQTHHPDLVAAFHPGFHSSPDLMEAWLPTLLLLRDYKIPTLITVYSHQELVSSLQILVELDTHITAFGSNPFMSLKPEQVYSSPNKQPVYCSAYYIMFLGSSCQLDNRQLEEKVDGGI
>CRC-001576EA78C61C85_CRC-12E45FF6696080F8
MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDS

## EMBER3D concatenation

In [8]:
ember_dir = Path.home() / 'PYTHON/EMBER3D'
ember_dir, ember_dir.is_dir()

(PosixPath('/home/quirin/PYTHON/EMBER3D'), True)

In [9]:
sample_size = 4

In [10]:
def to_ember_line(_ida, _idb, linker='N' * 20) -> str:
    return f'>{_ida}_{_idb}\n{fasta[_ida]}{linker}{fasta[_idb]}\n'

In [11]:
it = iter(fasta.items())
next(it), next(it)

(('CRC-001576EA78C61C85',
  'MSFKREGDDWSQLNVLKKRRVGDLLASYIPEDEALMLRDGRFACAICPHRPVLDTLAMLTAHRAGKKHLSSLQLFYGKKQPGKERKQNPKHQNELRREETKAEAPLLTQTRLITQSALHRAPHYNSCCRRKYRPEAPGPSVSLSPMPPSEVKLQSGKISREPEPAAGPQAEESATVSAPAPMSPTRRRALDHYLTLRSSGWIPDGRGRWVKDENVEFDSDEEEPPDLPLD'),
 ('CRC-0026DBDE1FEAD2B3',
  'MENQPVRWRALPGLPRPPGLPAAPWLLLGVLLLPGTLRLAGGQSVTHTGLPIMASLANTAISFSCRITYPYTPQFKVFTVSYFHEDLQGQRSPKKPTNCHPGLGTENQSHTLDCQVTLVLPGASATGTYYCSVHWPHSTVRGSGTFILVRDAGYREPPQSPQKLLLFGFTGLLSVLSVVGTALLLWNKKRMRGPGKDPTRKCPDPRSASSPKQHPSESVYTALQRRETEVYACIENEDGSSPTAKQSPLSQERPHRFEDDGELNLVYENL'))

In [13]:
pairs['len'] = pairs.hash_A.apply(lambda ha: len(fasta[ha])) + pairs.hash_B.apply(lambda ha: len(fasta[ha]))
ppis = pairs.loc[pairs.label == 1].sort_values(by='len').head(sample_size)
ppis

Unnamed: 0,hash_A,hash_B,species,label,cclass,len
561,CRC-9DE436A7FF533443,CRC-B2ED164C9F379EDD,9606,1,3,241
580,CRC-AB9BC8BFE2850E92,CRC-AC24810FC35B6F09,9606,1,3,243
198,CRC-2B890673EA35BF5A,CRC-D0AE8929DEB2AC06,9606,1,2,245
489,CRC-817752F20E262FD3,CRC-B1163143D1292259,9606,1,2,245


In [14]:
negs = pairs.loc[pairs.label == 0].sort_values(by='len').head(sample_size)
negs

Unnamed: 0,hash_A,hash_B,species,label,cclass,len
919,CRC-306C2057359CD049,CRC-44AD1817D6C536D6,9606,0,3,200
859,CRC-2AF8FDD248A86C41,CRC-AC24810FC35B6F09,9606,0,3,216
1262,CRC-B1163143D1292259,CRC-C3CF7EDEF95D2241,9606,0,1,245
858,CRC-2AF8FDD248A86C41,CRC-9DE436A7FF533443,9606,0,3,254


Also load the homodimer set

In [14]:
fasta |= {r.id: str(r.seq) for r in SeqIO.parse(
    ppi_dir / 'homodimer.fasta', 'fasta')}

In [16]:
hds = pd.read_csv(ppi_dir / 'homodimer.tsv', sep='\t')
hds['len'] = 2 * hds.hash_A.apply(lambda ha: len(fasta[ha]))
hd_ppis = hds.loc[hds.label == 1].sort_values(by='len').head(sample_size)
hd_ppis

Unnamed: 0,hash_A,hash_B,species,source,label,cclass,len
1428,CRC-893BD610A349A08D,CRC-893BD610A349A08D,3702,apid,1,1,104
1287,CRC-7BD48340D6D2966C,CRC-7BD48340D6D2966C,83333,apid,1,1,114
1946,CRC-B82DFCCB29A611A4,CRC-B82DFCCB29A611A4,559292,apid,1,1,118
3171,CRC-FEE7DA7A6E02B08E,CRC-FEE7DA7A6E02B08E,9606,huri,1,3,120


In [17]:
hd_negs = hds.loc[hds.label == 0].sort_values(by='len').head(sample_size)
hd_negs

Unnamed: 0,hash_A,hash_B,species,source,label,cclass,len
10711,CRC-7298618F8CCE8CCD,CRC-7298618F8CCE8CCD,192222,apid,0,1,100
14610,CRC-E1419ED5FEC45E14,CRC-E1419ED5FEC45E14,559292,apid,0,1,100
15382,CRC-574B060DBAA0414F,CRC-574B060DBAA0414F,7955,swissprot,0,3,100
5584,CRC-B6D641A991D520FB,CRC-B6D641A991D520FB,6239,apid,0,1,100


In [93]:
with (ember_dir / 'experi' / 'five.fasta').open('w') as out_fasta:
    for df in [ppis, negs, hd_ppis, hd_negs]:
        for _, (ida, idb, label) in df[[
            'hash_A', 'hash_B', 'label']].T.items():
            for spacer_len in [0, 10, 20, 60, 200]:
                for rev in [False, True]:
                    to_fasta(
                        f'{ida}_{idb}_{spacer_len}'
                        f'{"_i" if label else "_ni"}'
                        f'{"_rev" if rev else "_fw"}',
                        f'{fasta[ida]}{"N" * spacer_len}{fasta[idb][::(-1 if rev else 1)]}',
                        out_fasta)

## Negatome

In [15]:
negatome = pd.read_csv(ppi_dir.parent / 'negatome' /
                       'manual_stringent.txt', sep='\t', header=None)
negatome.head()

Unnamed: 0,0,1,2,3
0,Q6ZNK6,Q9Y4K3,15047173,MI:0019 - coimmunoprecipitation
1,Q9NR31,Q15797,17356069,MI:0019 - coimmunoprecipitation
2,P11627,P53986,20155396,MI:0411- enzyme linked immunosorbent assay
3,P33176,Q96EK5,16225668,MI:0059 - gst pull down
4,Q9NPY3,P02745,11994479,MI:0411- enzyme linked immunosorbent assay


In [16]:
negatome = negatome.iloc[:, [0, 1]]
negatome.columns = ['hash_A', 'hash_B']
negatome.head()

Unnamed: 0,hash_A,hash_B
0,Q6ZNK6,Q9Y4K3
1,Q9NR31,Q15797
2,P11627,P53986
3,P33176,Q96EK5
4,Q9NPY3,P02745


In [17]:
negatome_ids = sorted(set(np.unique(negatome)))
negatome_ids[:4], len(negatome_ids)

(['A1E347', 'A1X4Q1', 'A2AM29', 'A2ASQ1'], 1733)

In [18]:
from ppi_utils import api
from ppi_utils import extract
from ppi_utils import reduce

In [19]:
npath = Path('negatome')

In [None]:
hlookup = api.uniprot_api_fetch(negatome_ids, npath / 'manual_stringent')

In [24]:
with open(npath / 'manual_stringent.json', 'r') as json_file:
    hlookup = json.load(json_file)

In [25]:
hlookup

{'A1E347': 'CRC-13E28B257556496D',
 'A1X4Q1': 'CRC-05BE58D8A6FDC2A2',
 'A2AM29': 'CRC-120A0604B13675EC',
 'A2ASQ1': 'CRC-0679A3F6D8BD1286',
 'A4IFA9': 'CRC-88B7D038C61C8791',
 'A5JSJ9': 'CRC-07269000E6C214F0',
 'A6NIX2': 'CRC-B95E162B3009B886',
 'B1P6E5': 'CRC-5BB61D5D0E309EBB',
 'B2RWS6': 'CRC-0E9B2D9508237671',
 'B2ZZS9': 'CRC-57E722C76602443E',
 'B3VK31': 'CRC-29F02F9C09B3C032',
 'B7SGZ7': 'CRC-2A6C43E1A4B724A1',
 'B7Z018': 'CRC-F1C8FD23A67609A8',
 'B9EKI3': 'CRC-D208A815FF740CF1',
 'C4ZQ83': 'CRC-195147734CDF8B04',
 'C6EGH1': 'CRC-12A14B75A3CAEA19',
 'C7GKY3': 'CRC-8370B24806510774',
 'C7GP96': 'CRC-3D2162EEF5E52D69',
 'C7GX20': 'CRC-36395E15D32F33BD',
 'C9WF17': 'CRC-089A7E14F77333CC',
 'D3ZUY8': 'CRC-77D1C8AC17B00EA4',
 'D3ZWB1': 'CRC-CA3E6E67454E7465',
 'E5LBX1': 'CRC-EA80817CB9BD0100',
 'F1NVD2': 'CRC-8A8D713031B70F8E',
 'F6IB71': 'CRC-FF0167AC03BFCB97',
 'G0TQZ9': 'CRC-18C0D7E01B851150',
 'G3V608': 'CRC-ADAC63664E97A520',
 'G5DGD4': 'CRC-159FE70E5CD32ED5',
 'G8JZS0': 'CRC-A568

In [20]:
hash_negatome = extract.ppis_to_hashes(
    negatome, npath / 'manual_stringent.json')
len(hash_negatome)

1932

In [21]:
hash_negatome.head()

Unnamed: 0,hash_A,hash_B
1730,CRC-002B96F5A998B5CD,CRC-530D84854E943CD4
1546,CRC-003B3F654F0C60DF,CRC-1AF2ABBBC79191DD
1255,CRC-003B3F654F0C60DF,CRC-46DF5E8B371321AC
1524,CRC-003B3F654F0C60DF,CRC-723D63962FB29BD9
1395,CRC-003B3F654F0C60DF,CRC-95406DB5FC0AA4C9


In [22]:
len(reduce.dedup_pairs(hash_negatome))

1932

In [26]:
fasta |= {r.id: str(r.seq) for r in SeqIO.parse(
    npath / 'manual_stringent.hash.fasta', 'fasta')}

In [27]:
hash_negatome.head()

Unnamed: 0,hash_A,hash_B
1730,CRC-002B96F5A998B5CD,CRC-530D84854E943CD4
1546,CRC-003B3F654F0C60DF,CRC-1AF2ABBBC79191DD
1255,CRC-003B3F654F0C60DF,CRC-46DF5E8B371321AC
1524,CRC-003B3F654F0C60DF,CRC-723D63962FB29BD9
1395,CRC-003B3F654F0C60DF,CRC-95406DB5FC0AA4C9


## cut in: write a combined TSV for Horia

In [44]:
unn = negatome.copy()
unn.columns = ['ida', 'idb']
unn.head()

Unnamed: 0,ida,idb
0,Q6ZNK6,Q9Y4K3
1,Q9NR31,Q15797
2,P11627,P53986
3,P33176,Q96EK5
4,Q9NPY3,P02745


In [45]:
unn['crca'] = unn.ida.apply(hlookup.get)
unn['crcb'] = unn.idb.apply(hlookup.get)
unn[['pdockq', 'label', 'dataset']] = pd.NA, 0, 'negatome'
unn.head()

Unnamed: 0,ida,idb,crca,crcb,pdockq,label,dataset
0,Q6ZNK6,Q9Y4K3,CRC-FE37A8C33F60BB42,CRC-5AB9C255CCFEE749,,0,negatome
1,Q9NR31,Q15797,CRC-38A869175CBA54F3,CRC-2DD34B7F434DBC7E,,0,negatome
2,P11627,P53986,CRC-22BE57001CB2A538,CRC-8B6DAB7741340DD7,,0,negatome
3,P33176,Q96EK5,CRC-A1FE5760C3250C8B,CRC-DA86308364D31335,,0,negatome
4,Q9NPY3,P02745,CRC-EECA0FEAC55FCAC2,CRC-8FF6B6AE02D49C4C,,0,negatome


In [46]:
# now sort horizontally by the crc64 hash
unn.loc[unn.ida > unn.idb, ['ida', 'idb', 'crca', 'crcb']] = unn.loc[unn.ida > unn.idb, ['idb', 'ida', 'crcb', 'crca']].values
unn = unn.sort_values(by=['dataset', 'crca', 'crcb', 'ida', 'idb'])

In [47]:
unn.head()

Unnamed: 0,ida,idb,crca,crcb,pdockq,label,dataset
1730,P39447,Q8VHG2,CRC-002B96F5A998B5CD,CRC-530D84854E943CD4,,0,negatome
1546,Q01094,Q06546,CRC-003B3F654F0C60DF,CRC-1AF2ABBBC79191DD,,0,negatome
1524,Q01094,Q06547,CRC-003B3F654F0C60DF,CRC-723D63962FB29BD9,,0,negatome
1395,Q01094,Q15796,CRC-003B3F654F0C60DF,CRC-95406DB5FC0AA4C9,,0,negatome
806,P25311,P61769,CRC-006A153A8E32A0B1,CRC-AFD2DBEF07DCEF27,,0,negatome


In [49]:
unn.to_csv('negatome_scores.tsv', sep='\t', header=True, index=False)

### continue as if nothing happened

In [None]:
hash_negatome['len'] = hash_negatome.hash_A.apply(lambda ha: len(fasta[ha])) + hash_negatome.hash_B.apply(lambda ha: len(fasta[ha]))
hash_negatome

In [61]:
nega = hash_negatome.sort_values(by='len').head(sample_size).copy()
nega['label'] = 0
nega

Unnamed: 0,hash_A,hash_B,len,label
1877,CRC-21167891FDE804F1,CRC-21167891FDE804F1,160,0
1695,CRC-21167891FDE804F1,CRC-623591A09A6ABACE,175,0
1468,CRC-82B5C8830E64C992,CRC-82B5C8830E64C992,182,0
1797,CRC-860CBB1416ACBCA1,CRC-AD5E53AF326B25D2,184,0


In [29]:
# 100 not-too-long ones for colabfold
n_negatome_pairs = 100

crop = hash_negatome.loc[hash_negatome.len < 3000]

rng = np.random.default_rng(42)
chc = rng.choice(len(crop), size=100,
                 replace=True, shuffle=True)
crop = crop.iloc[sorted(chc)].sort_values(by='len')
crop

Unnamed: 0,hash_A,hash_B,len
246,CRC-3AB8269966FDB513,CRC-D93C88E3C26D64B9,288
1970,CRC-408C59F0CCAA6781,CRC-F073C2BA91FE0059,353
1246,CRC-6B4BC3FCDE10727B,CRC-9FD56ACB878FBCCA,381
365,CRC-4874D6CC0B1646F5,CRC-E660843361C28EBA,425
547,CRC-44AE51967C58DDFF,CRC-C3F2A3E71D972E2D,434
...,...,...,...
57,CRC-6DDD0F49C4E490D3,CRC-CE69BBB287D35AB5,2323
1676,CRC-8D082AA2E768C065,CRC-CA838BCD2AF3CA68,2468
446,CRC-6C436A7A5FEE6DEB,CRC-A2E181ED28DD6082,2879
313,CRC-3B66BCC464B393FB,CRC-3BEA9B8558BA1A5E,2883


In [167]:
with (npath / f'negatome_{n_negatome_pairs}'
           f'_colabfold.fasta').open('w') as out_fasta:
    for _, (ida, idb) in crop.iloc[:, [0, 1]].T.items():
        to_fasta(f'{ida}_{idb}', f'{fasta[ida]}:{fasta[idb]}', out_fasta)

In [48]:
# the 10 shortest for EMBER3D
n_negatome_pairs = 20

ngts = hash_negatome.sort_values(by='len').head(n_negatome_pairs)
ngts

Unnamed: 0,hash_A,hash_B,len
1877,CRC-21167891FDE804F1,CRC-21167891FDE804F1,160
1695,CRC-21167891FDE804F1,CRC-623591A09A6ABACE,175
1468,CRC-82B5C8830E64C992,CRC-82B5C8830E64C992,182
1797,CRC-860CBB1416ACBCA1,CRC-AD5E53AF326B25D2,184
961,CRC-623591A09A6ABACE,CRC-82B5C8830E64C992,186
1539,CRC-15D26E6B75AA2EE1,CRC-82B5C8830E64C992,187
1359,CRC-623591A09A6ABACE,CRC-623591A09A6ABACE,190
1379,CRC-15D26E6B75AA2EE1,CRC-15D26E6B75AA2EE1,192
1156,CRC-1729ED680290CFE4,CRC-AD5E53AF326B25D2,199
1754,CRC-206B0DF0D95CD4D2,CRC-206B0DF0D95CD4D2,204


In [49]:
from Bio.Seq import reverse_complement

In [50]:
reverse_complement('SDADLL')

'LLHTHS'

In [51]:
def orient(seq: str, direction: str) -> str:
    if direction == 'fw':
        return seq
    elif direction == 'rev':
        return seq[::-1]
    elif direction == 'rc':
        return reverse_complement(seq)
    assert False, 'illegal orientation'

In [47]:
with (ember_dir / 'experi' / f'negatome_{n_negatome_pairs}_ember'
                             '.fasta').open('w') as out_fasta:
    for _, (ida, idb) in ngts[[
        'hash_A', 'hash_B']].T.items():
        for spacer_len in [0, 10, 20, 60, 200]:
            for direction in ['fw', 'rev']:
                to_fasta(
                    f'{ida}_{idb}_{spacer_len}_ni_{direction}',
                    f'{fasta[ida]}{"N" * spacer_len}{orient(fasta[idb], direction)}',
                    out_fasta)

In [40]:
!python predict.py --output-2d --no-pdb --save-distance-array --t5-dir "/mnt/project/kaindl/ppi/embed_data/t5_xl_weights" -i "experi/five.fasta" -o "experi/five"

python: can't open file '/home/quirin/PYTHON/ppi/colabfold/predict.py': [Errno 2] No such file or directory


## re-try without idiotic reversing, but literature linkers

In [65]:
with (ember_dir / 'experi' / 'six.fasta').open('w') as out_fasta:
    for df in [ppis, negs, hd_ppis, hd_negs, nega]:
        for _, (ida, idb, label) in df[[
            'hash_A', 'hash_B', 'label']].T.items():
            for spacer_name, spacer in zip(
                    ['20n', '200n', '50g', 'ggs'],
                    ['N' * 20, 'N' * 200, 'G' * 50, 'GGS' * 7]):
                to_fasta(
                    f'{ida}_{idb}_{spacer_name}'
                    f'{"_i" if label else "_ni"}',
                    f'{fasta[ida]}{spacer}{fasta[idb]}',
                    out_fasta)

In [83]:
with (ember_dir / 'experi' / 'seven.fasta').open('w') as out_fasta, (
        ember_dir / 'experi' / 'seven.tsv').open('w') as out_tsv:
    written = set()
    out_tsv.write('hash_A\thash_B\tlabel\n')
    for df in [ppis, negs, hd_ppis, hd_negs, nega]:
        for _, (ida, idb, label) in df[[
            'hash_A', 'hash_B', 'label']].T.items():
            out_tsv.write(f'{ida}\t{idb}\t{label}\n')
            for i in [ida, idb]:
                if i not in written:
                    to_fasta(i, fasta[i], out_fasta)
                    written.add(i)