## Read the file that maps descriptors to sequence segments

In [1]:
import os

# Tokenise every row of the descriptor listing on whitespace.
with open("all-output.txt", "r") as descriptor_file:
    lines = [row.split() for row in descriptor_file]

# Keep only rows with at least 7 tokens whose second token (the key used
# below) is at least 12 characters long.
lines = [
    tokens
    for tokens in lines
    if not (len(tokens) < 7 or len(tokens[1]) < 12)
]

# pairs: second token of a row -> integer parsed from its third token.
pairs = {tokens[1]: int(tokens[2]) for tokens in lines}

# seqs: second token of a row -> the last `pairs[key]` tokens of that row.
# NOTE: a count of 0 makes `tokens[-0:]` return the whole row, not an empty list.
seqs = {tokens[1]: tokens[-pairs[tokens[1]]:] for tokens in lines}

# len_seqs: total number of characters across all tokens stored in seqs[key].
len_seqs = {
    key: sum(len(part) for part in parts)
    for key, parts in seqs.items()
}
pairs

{'1A9N_1_Q_Q_2_U': 2,
 '1A9N_1_Q_Q_3_G': 2,
 '1A9N_1_Q_Q_4_G': 2,
 '1A9N_1_Q_Q_5_U': 2,
 '1A9N_1_Q_Q_6_A': 2,
 '1A9N_1_Q_Q_7_U': 2,
 '1A9N_1_Q_Q_8_U': 2,
 '1A9N_1_Q_Q_9_G': 1,
 '1A9N_1_Q_Q_10_C': 1,
 '1A9N_1_Q_Q_11_A': 1,
 '1A9N_1_Q_Q_12_G': 1,
 '1A9N_1_Q_Q_13_U': 1,
 '1A9N_1_Q_Q_14_A': 1,
 '1A9N_1_Q_Q_15_C': 1,
 '1A9N_1_Q_Q_16_C': 1,
 '1A9N_1_Q_Q_17_U': 2,
 '1A9N_1_Q_Q_18_C': 2,
 '1A9N_1_Q_Q_19_C': 2,
 '1A9N_1_Q_Q_20_A': 2,
 '1A9N_1_Q_Q_21_G': 2,
 '1B23_1_R_R_3_C': 3,
 '1B23_1_R_R_4_G': 3,
 '1B23_1_R_R_5_C': 2,
 '1B23_1_R_R_11_C': 3,
 '1B23_1_R_R_12_A': 3,
 '1B23_1_R_R_13_A': 2,
 '1B23_1_R_R_14_A': 3,
 '1B23_1_R_R_15_G': 2,
 '1B23_1_R_R_16_C': 2,
 '1B23_1_R_R_24_G': 2,
 '1B23_1_R_R_25_U': 2,
 '1B23_1_R_R_26_A': 2,
 '1B23_1_R_R_27_G': 3,
 '1B23_1_R_R_28_C': 2,
 '1B23_1_R_R_29_G': 2,
 '1B23_1_R_R_42_G': 2,
 '1B23_1_R_R_43_U': 2,
 '1B23_1_R_R_44_C': 3,
 '1B23_1_R_R_45_U': 3,
 '1B23_1_R_R_46_A': 3,
 '1B23_1_R_R_48_G': 3,
 '1B23_1_R_R_49_U': 4,
 '1B23_1_R_R_50_C': 2,
 '1B23_1_R_R_51_C': 2,

## Read test set files and find mapping to descriptor segments

In [2]:
from pathlib import Path
# Directory listing of the test set, with every ".pkl" occurrence removed
# from each entry name so the names can be looked up in `pairs`.
test_pkls = Path("../data/eval-pdb-all/all")
test_files = [entry.replace(".pkl", "") for entry in os.listdir(test_pkls)]
print(len(test_files), "test files found")

# Bucket the names by their value in `pairs` in a single pass. Names missing
# from `pairs` count as 0 and, like any count other than 1-3, land nowhere.
by_count = {1: [], 2: [], 3: []}
for name in test_files:
    bucket = by_count.get(pairs.get(name, 0))
    if bucket is not None:
        bucket.append(name)

ones, twos, threes = by_count[1], by_count[2], by_count[3]
print(len(ones), "files with 1 segment")
print(len(twos), "files with 2 segments")
print(len(threes), "files with 3 segments")

10583 test files found
1423 files with 1 segment
4836 files with 2 segments
4324 files with 3 segments


##### Sequences for Boltz

In [8]:
out1_path = "/home/mjustyna/boltz/all_1seg_fastas"
out2_path = "/home/mjustyna/boltz/all_2seg_fastas"
out3_path = "/home/mjustyna/boltz/all_3seg_fastas"


def boltz_fasta(name, segments):
    """Build the FASTA text for one descriptor, one chain record per segment.

    Args:
        name: Descriptor name, embedded in every record header.
        segments: Sequence strings; the i-th one becomes chain "A", "B" or "C".

    Returns:
        Records of the form ``>{chain}|rna|{name}|empty`` followed by the
        segment with all commas removed, joined by newlines (no trailing
        newline).
    """
    records = []
    for chain_id, segment in zip("ABC", segments):
        # Strip the commas outside the f-string: reusing the enclosing quote
        # character inside a replacement field is a SyntaxError before
        # Python 3.12.
        sequence = segment.replace(",", "")
        records.append(f">{chain_id}|rna|{name}|empty\n{sequence}")
    return "\n".join(records)


# (descriptor names, output directory, number of chains written per file)
boltz_jobs = (
    (ones, out1_path, 1),
    (twos, out2_path, 2),
    (threes, out3_path, 3),
)

for _, out_dir, _ in boltz_jobs:
    os.makedirs(out_dir, exist_ok=True)

for names, out_dir, n_chains in boltz_jobs:
    for name in names:
        fasta = boltz_fasta(name, seqs[name][:n_chains])
        with open(os.path.join(out_dir, f"{name}.fasta"), "w") as f:
            f.write(fasta)
        print(fasta)

>A|rna|1UN6_1_F_F_18_C|empty
ACCUACGGGGC
>A|rna|8D29_1_J_A_17_C|empty
AUACCAGCGAAACACGCC
>A|rna|8D29_1_F_A_16_A|empty
ACCAGCGAAACACGCC
>A|rna|5B2P_1_B_B_83_U|empty
UCUGUUUGACACG
>A|rna|4RUM_1_A_A_31_G|empty
ACCAGAGCGGUC
>A|rna|5F5H_1_C_C_15_G|empty
CCGUUCUAGGUGC
>A|rna|2BH2_1_D_D_1955_U|empty
CUUGUCGGGUAAGUUCCGA
>A|rna|7OS0_1_D_D_23_A|empty
GCCAAGACGACGGCGG
>A|rna|7QQP_1_A_A_70_C|empty
UCAACUUGAAAAAGUGC
>A|rna|1S03_1_A_A_23_G|empty
AUGAGGCAAUUCAUG
>A|rna|4ILM_1_C_C_14_A|empty
UACUAUAGA
>A|rna|7QR8_1_A_A_57_A|empty
AGGCUAGUCCGUU
>A|rna|6TW1_1_V-M_V_6_U|empty
AGUAGUAACAA
>A|rna|5VW1_1_C_C_79_G|empty
ACUUGAAAAAGUGUC
>A|rna|4R4V_1_A_A_624_G|empty
CGAAGGGCGUCGUCGGCCCAAGC
>A|rna|2XDB_1_G_G_21_G|empty
AGGUGAUUUGCUACCUUUAAGUGCA
>A|rna|5NFV_1_B_B_-10_C|empty
AAUUUCUACUGUUGUAGA
>A|rna|3NVI_1_F_F_20_A|empty
UCUGACCGAAAGGCGUGAUGAGC
>A|rna|7YGA_1_N_A_384_G|empty
UAUGCGAAAGUAU
>A|rna|7DCO_1_H-F_F_12_A|empty
GAAGUAACCCUUC
>A|rna|6DU4_1_B_B_15_A|empty
CGUAGGCUACAGAGAAGC
>A|rna|6FF4_1_6_6_15_A|empty
CU

##### Sequences for RhoFold

In [3]:
out_path = "/home/mjustyna/RhoFold/seqs"
os.makedirs(out_path, exist_ok=True)

# One FASTA file per single-segment descriptor: a ">name" header line
# followed by the descriptor's segment tokens joined with ";".
for name in ones:
    header = ">" + name
    sequence = ";".join(seqs[name])
    record = header + "\n" + sequence
    target = os.path.join(out_path, name + ".fasta")
    with open(target, "w") as fasta_file:
        fasta_file.write(record)
    print(record)

>1UN6_1_F_F_18_C
ACCUACGGGGC
>8D29_1_J_A_17_C
AUACCAGCGAAACACGCC
>8D29_1_F_A_16_A
ACCAGCGAAACACGCC
>5B2P_1_B_B_83_U
UCUGUUUGACACG
>4RUM_1_A_A_31_G
ACCAGAGCGGUC
>5F5H_1_C_C_15_G
CCGUUCUAGGUGC
>2BH2_1_D_D_1955_U
CUUGUCGGGUAAGUUCCGA
>7OS0_1_D_D_23_A
GCCAAGACGACGGCGG
>7QQP_1_A_A_70_C
UCAACUUGAAAAAGUGC
>1S03_1_A_A_23_G
AUGAGGCAAUUCAUG
>4ILM_1_C_C_14_A
UACUAUAGA
>7QR8_1_A_A_57_A
AGGCUAGUCCGUU
>6TW1_1_V-M_V_6_U
AGUAGUAACAA
>5VW1_1_C_C_79_G
ACUUGAAAAAGUGUC
>4R4V_1_A_A_624_G
CGAAGGGCGUCGUCGGCCCAAGC
>2XDB_1_G_G_21_G
AGGUGAUUUGCUACCUUUAAGUGCA
>5NFV_1_B_B_-10_C
AAUUUCUACUGUUGUAGA
>3NVI_1_F_F_20_A
UCUGACCGAAAGGCGUGAUGAGC
>7YGA_1_N_A_384_G
UAUGCGAAAGUAU
>7DCO_1_H-F_F_12_A
GAAGUAACCCUUC
>6DU4_1_B_B_15_A
CGUAGGCUACAGAGAAGC
>6FF4_1_6_6_15_A
CUCGCUUCGGCAGCAC
>5MGA_1_B_B_-6_G
AAUUUCUACUGUUGUAGA
>5NFV_1_B_B_-9_U
CUACUGUUGUAGA
>6AGB_1_A_A_135_C
GUCGCUUUUGGC
>3SNP_1_D_D_15_A
CAACAGUGUUUGAA
>3TS2_1_U_U_9_U
GUCUAUGAUACCACCC
>5W1H_1_B_B_-12_G
CCAAGAAAGAGGG
>3SIU_1_F_F_45_A
GCGCAUAGUGAG
>1KH6_1_A_A_30_A
CCGAAAG

##### Sequences for DRfold

In [5]:
out_path = "/home/mjustyna/software/DRfold/seqs"
os.makedirs(out_path, exist_ok=True)

# One FASTA file per single-segment descriptor. The ">name" / sequence record
# is written three times in a row, separated by newlines.
for name in ones:
    record = ">" + name + "\n" + ";".join(seqs[name])
    tripled = "\n".join((record, record, record))
    target = os.path.join(out_path, name + ".fasta")
    with open(target, "w") as fasta_file:
        fasta_file.write(tripled)
    print(tripled)

>1UN6_1_F_F_18_C
ACCUACGGGGC
>1UN6_1_F_F_18_C
ACCUACGGGGC
>1UN6_1_F_F_18_C
ACCUACGGGGC
>8D29_1_J_A_17_C
AUACCAGCGAAACACGCC
>8D29_1_J_A_17_C
AUACCAGCGAAACACGCC
>8D29_1_J_A_17_C
AUACCAGCGAAACACGCC
>8D29_1_F_A_16_A
ACCAGCGAAACACGCC
>8D29_1_F_A_16_A
ACCAGCGAAACACGCC
>8D29_1_F_A_16_A
ACCAGCGAAACACGCC
>5B2P_1_B_B_83_U
UCUGUUUGACACG
>5B2P_1_B_B_83_U
UCUGUUUGACACG
>5B2P_1_B_B_83_U
UCUGUUUGACACG
>4RUM_1_A_A_31_G
ACCAGAGCGGUC
>4RUM_1_A_A_31_G
ACCAGAGCGGUC
>4RUM_1_A_A_31_G
ACCAGAGCGGUC
>5F5H_1_C_C_15_G
CCGUUCUAGGUGC
>5F5H_1_C_C_15_G
CCGUUCUAGGUGC
>5F5H_1_C_C_15_G
CCGUUCUAGGUGC
>2BH2_1_D_D_1955_U
CUUGUCGGGUAAGUUCCGA
>2BH2_1_D_D_1955_U
CUUGUCGGGUAAGUUCCGA
>2BH2_1_D_D_1955_U
CUUGUCGGGUAAGUUCCGA
>7OS0_1_D_D_23_A
GCCAAGACGACGGCGG
>7OS0_1_D_D_23_A
GCCAAGACGACGGCGG
>7OS0_1_D_D_23_A
GCCAAGACGACGGCGG
>7QQP_1_A_A_70_C
UCAACUUGAAAAAGUGC
>7QQP_1_A_A_70_C
UCAACUUGAAAAAGUGC
>7QQP_1_A_A_70_C
UCAACUUGAAAAAGUGC
>1S03_1_A_A_23_G
AUGAGGCAAUUCAUG
>1S03_1_A_A_23_G
AUGAGGCAAUUCAUG
>1S03_1_A_A_23_G
AUGAGGCAAUUCAUG
>4IL