In [1]:
import pandas as pd
from Bio import PDB
import csv
import os
from Bio.PDB.MMCIFParser import MMCIFParser

In [3]:
data = pd.read_csv('../HL_43074.18.csv', sep=',')

dimension = data.shape
motifs_ids = []
motifs_nucs = []
discared = []

for i in range(dimension[0]):                #rows
    nucs, nucs_str = [], ''
    for j in range(dimension[1]):            #columns
        cell = data.iloc[i,j].split('|')
        if cell[-1] in ['A', 'B', 'C', 'D', 'E', 'F']:
            discared.append(cell[0])
            break
        nucs.append(int(cell[-1]))
        nucs_str += cell[-2]
    if cell[0] not in discared:
        motifs_ids.append([cell[0], cell[-3], nucs])
        motifs_nucs.append([cell[0], cell[-3], nucs_str])

motifs_ids.sort(key=lambda x: x[0])
motifs_nucs.sort(key=lambda x: x[0])
print(motifs_ids)
print(motifs_nucs)
print(discared)

[['1G1X', 'D', [590, 591, 592, 593, 594, 649]], ['1G1X', 'J', [726, 727, 728, 729, 730, 731]], ['1G1X', 'E', [726, 727, 728, 729, 730, 731]], ['1G1X', 'I', [590, 591, 592, 593, 594, 649]], ['1JID', 'B', [146, 147, 148, 149, 150, 151]], ['1KH6', 'A', [28, 29, 30, 31, 32, 33]], ['1KXK', 'A', [33, 34, 35, 36, 37, 38]], ['1L9A', 'B', [197, 198, 199, 200, 201, 202]], ['1L9A', 'B', [146, 147, 148, 149, 150, 151]], ['1M5K', 'B', [74, 75, 76, 77, 78, 79]], ['1MFQ', 'A', [197, 198, 199, 200, 201, 202]], ['1MFQ', 'A', [146, 147, 148, 149, 150, 151]], ['1MZP', 'B', [25, 26, 27, 28, 29, 30]], ['1NBS', 'B', [204, 205, 206, 207, 208, 209]], ['1Q96', 'A', [13, 14, 15, 16, 17, 18]], ['1RLG', 'D', [9, 10, 11, 12, 13, 14]], ['1U6B', 'B', [23, 24, 25, 26, 27, 28]], ['1U6B', 'B', [153, 154, 155, 156, 157, 158]], ['1U9S', 'A', [204, 205, 206, 207, 208, 209]], ['1U9S', 'A', [99, 100, 101, 102, 103, 104]], ['1UN6', 'F', [82, 83, 84, 85, 86, 94]], ['2OIU', 'P', [10, 11, 12, 13, 14, 15]], ['2OIU', 'P', [59, 60

In [4]:
df = pd.DataFrame(index = range(len(motifs_ids)), columns=['N1', 'N2', 'N3', 'N4', 'N5', 'N6'])
df

Unnamed: 0,N1,N2,N3,N4,N5,N6
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
244,,,,,,
245,,,,,,
246,,,,,,
247,,,,,,


In [7]:
parser = MMCIFParser(QUIET=True)

def get_atom_coordinates(df, seqs, motifs_ids, atom, ref):
    for i, motif in enumerate(motifs_ids):
        ref_seq = ref[i][2]
        motif_cords = []
        file_name = motif[0].lower() + '.cif'
        structure = parser.get_structure(file_name.rstrip('.cif'), f"../mmCIF_files/{file_name}")
        #if len(structure.get_list()) > 1: raise Exception('More than one model')
        model = structure[0]
        chain = model[motif[1]]
        for j, motif_nuc in enumerate(motif[2]):
            res = chain[motif_nuc]
            if res.resname != ref_seq[j]: 
                print(f'Resname in pdb: {res.resname}    my resanme: {ref_seq[j]}  in {motif[0]}')
                #raise Exception('WRONG RESIDUE')
            if atom not in res: raise Exception('Given atom not found in a residue') #at=res['C'] 
            else: at = res[atom]
            motif_cords.append(at.get_coord())
        seqs.append(ref_seq)
        df.iloc[i] = motif_cords
    return df, seqs

In [9]:
database, sequences = get_atom_coordinates(df, [], motifs_ids, "P", motifs_nucs)
print(database)
print(sequences)

                              N1                           N2  \
0       [71.679, 27.885, 13.362]      [75.126, 29.191, 8.911]   
1       [11.903, 62.561, 20.255]      [12.89, 67.843, 20.548]   
2       [52.497, 35.902, 45.669]      [57.25, 38.924, 45.457]   
3        [-1.686, 76.373, 54.02]     [-1.723, 78.977, 59.081]   
4         [8.277, 2.841, 35.275]      [12.066, 1.936, 39.164]   
..                           ...                          ...   
244   [90.763, -22.623, -11.049]   [92.213, -24.133, -16.157]   
245  [104.388, -28.521, -84.394]  [102.175, -29.458, -89.851]   
246      [7.841, -42.465, 50.44]      [9.21, -37.347, 49.237]   
247  [87.375, -117.074, -32.511]  [90.545, -113.433, -35.177]   
248   [52.339, -23.045, -58.864]   [56.951, -26.375, -57.946]   

                              N3                           N4  \
0        [80.361, 30.047, 8.043]     [82.139, 32.527, 13.272]   
1       [11.485, 73.132, 20.547]      [7.281, 72.722, 16.628]   
2       [62.475, 39.792,

In [10]:
for column in database.columns:
    database[column] = database[column].apply(lambda x: ','.join(map(str, x)))

database['class'] = 1
database

Unnamed: 0,N1,N2,N3,N4,N5,N6,class
0,"71.679,27.885,13.362","75.126,29.191,8.911","80.361,30.047,8.043","82.139,32.527,13.272","83.259,38.334,15.381","81.294,43.869,14.935",1
1,"11.903,62.561,20.255","12.89,67.843,20.548","11.485,73.132,20.547","7.281,72.722,16.628","5.763,72.86,10.991","7.417,72.774,5.347",1
2,"52.497,35.902,45.669","57.25,38.924,45.457","62.475,39.792,45.684","63.627,35.696,49.801","64.509,34.358,55.361","63.005,35.335,60.758",1
3,"-1.686,76.373,54.02","-1.723,78.977,59.081","-0.723,84.602,63.317","-1.274,87.084,57.332","5.354,88.888,55.743","12.345,90.194,56.876",1
4,"8.277,2.841,35.275","12.066,1.936,39.164","15.723,4.163,42.831","18.227,9.024,40.773","22.603,10.952,36.499","25.817,10.06,32.144",1
...,...,...,...,...,...,...,...
244,"90.763,-22.623,-11.049","92.213,-24.133,-16.157","90.551,-23.767,-21.328","84.69,-23.522,-20.343","79.986,-27.323,-20.876","77.667,-32.861,-20.317",1
245,"104.388,-28.521,-84.394","102.175,-29.458,-89.851","101.665,-27.911,-94.949","102.179,-21.984,-93.994","98.762,-17.15,-93.642","93.545,-14.985,-91.353",1
246,"7.841,-42.465,50.44","9.21,-37.347,49.237","9.272,-33.763,45.172","9.899,-37.668,40.687","14.471,-39.674,37.477","19.708,-41.493,36.389",1
247,"87.375,-117.074,-32.511","90.545,-113.433,-35.177","93.248,-108.89,-34.711","89.758,-106.055,-30.585","85.596,-101.754,-30.875","81.4,-99.066,-33.184",1


In [11]:
import csv

database.to_csv("database_F.csv")

with open("database_F_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in sequences:
        write.writerow([item])

## brak motywu

#### stems

In [12]:
stems = pd.read_csv('../rnasolo-3.218-dbn-csv/stems_u.txt', sep=' ', header=None, names=['ID', 'start_1', 'end_1', 'seq_1', 'braq_1', 'start_2', 'end_2', 'seq_2', 'braq_2'])
stems['chain'] = list(map(lambda x: x.split('_')[2].rsplit('-motifs.txt:Stem')[0], stems['ID']))
stems['ID'] = list(map(lambda x: x[:x.find('_')], stems['ID']))
stems = stems[stems['braq_1'] != '[[[']
stems = stems[~stems['chain'].str.contains('-')]
stems = stems[~stems['ID'].duplicated()]
stems = stems.reset_index(drop=True)
stems

Unnamed: 0,ID,start_1,end_1,seq_1,braq_1,start_2,end_2,seq_2,braq_2,chain
0,1A60,18,20,CCC,(((,30,32,GGG,))),A
1,1B23,9,11,ACA,(((,19,21,UGU,))),R
2,1BVJ,7,9,GGU,(((,16,18,AUC,))),A
3,1C2W,30,32,GCC,(((,473,475,GGC,))),B
4,1C2X,19,21,CGG,(((,62,64,CCG,))),C
...,...,...,...,...,...,...,...,...,...,...
478,7P3K,26,28,CCG,(((,40,42,CGG,))),V
479,7QEP,40,42,GGA,(((,81,83,UCU,))),1
480,7R81,10,12,GCC,(((,23,25,GGC,))),t1
481,7RQB,1,3,AGA,(((,2859,2861,UCU,))),1A


In [13]:
stem_ids = [
    '1BVJ', '1CQL', '1F84', '1HS4', '1KUQ', '1KXK', '1LC6', '1M5K', '1MZP', '1P5P',
    '2OIU', '3AM1', '3D0U', '3G9C', '3IGI', '3IWN', '3JBV', '3RW6', '3SUX', '3V7E',
    '4AOB', '4BW0', '4LVW', '4OQU', '4V5O', '4V8M', '4WSM', '5B2P', '5FJ4', '5FJC',
    '5J7L', '5TBW', '5U3G', '5XTM', '5Y7M', '6DU5', '6DVK', '6JDV', '6N2V', '6ORD',
    '6UZ7', '6Z6M', '6ZYM', '7A0S', '7AC7', '7FJ0', '7O80'
]

stems_complete = []
for index, row in stems.iterrows():
    if row['ID'] in stem_ids:
        if len(stems_complete) == 120:
            break
        seqs = list(range(int(row['start_1']), int(row['end_1']+1))) + list(range(int(row['start_2']), int(row['end_2']+1)))
        stems_complete.append([row['ID'], row['chain'], seqs])

In [14]:
print(len(stems_complete))
stems_complete

47


[['1BVJ', 'A', [7, 8, 9, 16, 17, 18]],
 ['1CQL', 'A', [12, 13, 14, 33, 34, 35]],
 ['1F84', 'A', [10, 11, 12, 19, 20, 21]],
 ['1HS4', 'A', [1, 2, 3, 10, 11, 12]],
 ['1KUQ', 'B', [6, 7, 8, 13, 14, 15]],
 ['1KXK', 'A', [6, 7, 8, 62, 63, 64]],
 ['1LC6', 'A', [7, 8, 9, 15, 16, 17]],
 ['1M5K', 'B', [68, 69, 70, 81, 82, 83]],
 ['1MZP', 'B', [23, 24, 25, 30, 31, 32]],
 ['1P5P', 'A', [23, 24, 25, 58, 59, 60]],
 ['2OIU', 'P', [8, 9, 10, 15, 16, 17]],
 ['3AM1', 'B', [30, 31, 32, 37, 38, 39]],
 ['3D0U', 'A', [79, 80, 81, 108, 109, 110]],
 ['3G9C', 'Q', [28, 29, 30, 42, 43, 44]],
 ['3IGI', 'A', [27, 28, 29, 37, 38, 39]],
 ['3IWN', 'A', [40, 41, 42, 82, 83, 84]],
 ['3JBV', 'W', [10, 11, 12, 23, 24, 25]],
 ['3RW6', 'H', [17, 18, 19, 40, 41, 42]],
 ['3SUX', 'X', [21, 22, 23, 74, 75, 76]],
 ['3V7E', 'C', [15, 16, 17, 38, 39, 40]],
 ['4AOB', 'A', [15, 16, 17, 38, 39, 40]],
 ['4BW0', 'A', [1, 2, 3, 23, 24, 25]],
 ['4LVW', 'A', [18, 19, 20, 65, 66, 67]],
 ['4OQU', 'A', [22, 23, 24, 48, 49, 50]],
 ['4V5O',

In [17]:
parser = MMCIFParser(QUIET=True)

def get_atom_coordinates_stem(motifs_ids, atom, ref):
    df = pd.DataFrame(index = range(len(motifs_ids)), columns=['N1', 'N2', 'N3', 'N4', 'N5', 'N6'])
    ids = []
    sequences_stem = []
    for i, motif in enumerate(motifs_ids):
        motif_cords = []
        file_name = motif[0].lower() + '.cif'
        ref_seq = ref[ref['ID'] == motif[0]]['seq_1'] + ref[ref['ID'] == motif[0]]['seq_2']
        ref_seq = ref_seq.reset_index(drop=True)
        if file_name == '7lm1.cif': continue
        structure = parser.get_structure(file_name.rstrip('.cif'), f"../mmCIF_files/{file_name}")
        model = structure[0]
        chain = model[motif[1]]
        if motif[2][0] in chain:
            res = chain[motif[2][0]]
            if ref_seq[0][0] != res.resname:
                motif[2] = [x+1 for x in motif[2]]
        for j, motif_nuc in enumerate(motif[2]):
            if motif_nuc in chain:
                res = chain[motif_nuc]
                if ref_seq[0][j] != res.resname:
                    # print('NO')
                    # print(motif[0])
                    break
                    # res = chain[motif_nuc+1]
                    # if res.resname != ref_seq[0][j]:
                    #     print('NOOOOOOOOOOOOO')
                    #     print(motif[0])
                    #     break
                if atom not in res:
                    break
                else: 
                    at = res[atom]
                    motif_cords.append(at.get_coord())
            else:
                break
        if len(motif_cords) == 6:
            df.iloc[i] = motif_cords
            ids.append(motif[0])
            sequences_stem.append(ref_seq[0])
    df = df.dropna()
    df['id'] = ids
    return df, sequences_stem

In [18]:
stems_databse, stem_sequences = get_atom_coordinates_stem(stems_complete, "P", stems)

In [19]:
print(stems_databse.shape)
stems_databse

(45, 7)


Unnamed: 0,N1,N2,N3,N4,N5,N6,id
0,"[-0.14, 3.004, -39.68]","[-5.049, 1.391, -37.046]","[-9.883, 1.595, -34.022]","[3.295, 1.582, -20.37]","[-1.698, -1.861, -18.784]","[-5.331, -6.058, -22.032]",1BVJ
1,"[15.105, 3.81, -0.964]","[11.93, 9.749, -0.451]","[8.352, 14.086, 2.648]","[20.125, 8.025, 14.865]","[14.823, 6.74, 17.444]","[9.147, 4.991, 16.454]",1CQL
2,"[-2.566, 3.315, 99.209]","[1.683, 6.981, 96.344]","[3.299, 10.516, 91.403]","[-13.611, 8.875, 93.277]","[-12.52, 7.508, 88.27]","[-9.158, 5.5, 83.444]",1F84
4,"[-17.179, 39.113, 22.991]","[-19.54, 41.508, 19.35]","[-23.08, 41.776, 15.324]","[-12.353, 31.164, 4.868]","[-12.039, 38.057, 3.241]","[-12.829, 43.837, 3.946]",1KUQ
5,"[-5.402, 53.062, 161.745]","[-8.598, 48.13, 160.02]","[-13.109, 44.664, 160.337]","[-19.909, 57.063, 171.814]","[-19.084, 52.025, 174.554]","[-15.592, 47.346, 175.459]",1KXK
6,"[3.058, 4.89, -2.71]","[6.462, 6.043, -6.95]","[6.573, 10.524, -10.887]","[-9.061, 14.797, -3.014]","[-11.207, 12.914, -8.493]","[-10.212, 8.098, -12.265]",1LC6
7,"[108.484, 12.787, 21.031]","[110.976, 17.542, 19.963]","[115.159, 21.235, 22.81]","[113.749, 10.915, 38.417]","[111.446, 16.985, 38.92]","[108.501, 21.987, 37.178]",1M5K
8,"[5.284, 50.213, -2.598]","[4.229, 45.461, 2.558]","[3.194, 42.625, 7.234]","[-12.492, 51.757, 6.878]","[-8.553, 54.713, 10.811]","[-3.581, 56.133, 13.093]",1MZP
9,"[-6.499, 20.764, -8.322]","[-10.533, 24.042, -9.561]","[-15.113, 24.985, -14.36]","[-4.083, 12.349, -21.701]","[-2.76, 18.575, -24.461]","[-1.974, 23.537, -25.145]",1P5P
10,"[-12.834, -4.798, 4.605]","[-11.781, -8.855, 9.894]","[-10.913, -12.044, 14.206]","[2.79, -20.86, 4.627]","[-2.717, -23.734, 4.497]","[-8.479, -24.924, 5.421]",2OIU


In [20]:
stems_database = stems_databse.drop('id', axis=1)

for column in stems_database.columns:
    stems_database[column] = stems_database[column].apply(lambda x: ','.join(map(str, x)))
    
stems_database['class'] = 0
stems_database

Unnamed: 0,N1,N2,N3,N4,N5,N6,class
0,"-0.14,3.004,-39.68","-5.049,1.391,-37.046","-9.883,1.595,-34.022","3.295,1.582,-20.37","-1.698,-1.861,-18.784","-5.331,-6.058,-22.032",0
1,"15.105,3.81,-0.964","11.93,9.749,-0.451","8.352,14.086,2.648","20.125,8.025,14.865","14.823,6.74,17.444","9.147,4.991,16.454",0
2,"-2.566,3.315,99.209","1.683,6.981,96.344","3.299,10.516,91.403","-13.611,8.875,93.277","-12.52,7.508,88.27","-9.158,5.5,83.444",0
4,"-17.179,39.113,22.991","-19.54,41.508,19.35","-23.08,41.776,15.324","-12.353,31.164,4.868","-12.039,38.057,3.241","-12.829,43.837,3.946",0
5,"-5.402,53.062,161.745","-8.598,48.13,160.02","-13.109,44.664,160.337","-19.909,57.063,171.814","-19.084,52.025,174.554","-15.592,47.346,175.459",0
6,"3.058,4.89,-2.71","6.462,6.043,-6.95","6.573,10.524,-10.887","-9.061,14.797,-3.014","-11.207,12.914,-8.493","-10.212,8.098,-12.265",0
7,"108.484,12.787,21.031","110.976,17.542,19.963","115.159,21.235,22.81","113.749,10.915,38.417","111.446,16.985,38.92","108.501,21.987,37.178",0
8,"5.284,50.213,-2.598","4.229,45.461,2.558","3.194,42.625,7.234","-12.492,51.757,6.878","-8.553,54.713,10.811","-3.581,56.133,13.093",0
9,"-6.499,20.764,-8.322","-10.533,24.042,-9.561","-15.113,24.985,-14.36","-4.083,12.349,-21.701","-2.76,18.575,-24.461","-1.974,23.537,-25.145",0
10,"-12.834,-4.798,4.605","-11.781,-8.855,9.894","-10.913,-12.044,14.206","2.79,-20.86,4.627","-2.717,-23.734,4.497","-8.479,-24.924,5.421",0


In [21]:
stems_database.to_csv("stems_database_P'.csv")

In [22]:
with open("stems_database_P_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in stem_sequences:
        write.writerow([item])

### hairpins

In [24]:
hairpins = pd.read_csv('../rnasolo-3.218-dbn-csv/hairpins_u.txt', sep=' ', header=None, names=['ID', 'start', 'end', 'seq', 'braq'])
hairpins['chain'] = list(map(lambda x: x.split('_')[2].rsplit('-motifs.txt:Hairpin')[0], hairpins['ID']))
hairpins['ID'] = list(map(lambda x: x[:x.find('_')], hairpins['ID']))
hairpins = hairpins[~hairpins['chain'].str.contains('-')]
hairpins = hairpins[~hairpins['ID'].duplicated()]
haripins = hairpins.reset_index(drop=True)
hairpins

Unnamed: 0,ID,start,end,seq,braq,chain
0,17RA,9,14,GAUUAC,(....),A
1,1A1T,8,13,CGGAGG,(....),B
3,1A9L,17,22,CGAGAG,(....),A
4,1AFX,4,9,GUGAAC,(....),A
5,1AJF,7,12,GGAAAC,(....),A
...,...,...,...,...,...,...
785,7R81,223,228,CUUCGG,(....),A2
787,7RQB,116,121,GGGAAC,(....),1A
789,7RYG,122,127,GGGAAC,(....),A
791,7SAM,22,27,UUACAA,(....),A


In [26]:
hairpin_ids = [
    '4V8M', '3DHS', '2N6T', '1FYP', '4XWF', '4ZNP', '2LI4', '2M5U', '4PCJ', '3E5C',
    '1N8X', '5TBW', '1K4B', '5IEM', '2PCV', '4PLX', '1JUR', '1U63', '2JSE', '2LU0',
    '1QWA', '1Q96', '2HW8', '4W90', '4UER', '3V7E', '3IGI', '2NCI', '2CZJ', '3G9C',
    '5Y7M', '1I6U', '4V5O', '1ZBN', '5UF3', '1Z2J', '3J7P', '5FJC', '4A4S', '3P22',
    '2N6X', '3CUL', '1ZIG', '4JF2', '1CQL', '5U3G', '1OOA', '2L2J', '1F9L'
]

hairpins_complete = []
for id in hairpin_ids:
    row = hairpins[hairpins['ID'] == id].reset_index()
    seqs = list(range(int(row['start']), int(row['end']+1)))
    hairpins_complete.append([row['ID'][0], row['chain'][0], seqs])

In [27]:
print(len(hairpins_complete))
hairpins_complete

49


[['4V8M', 'AA', [282, 283, 284, 285, 286, 287]],
 ['3DHS', 'A', [31, 32, 33, 34, 35, 36]],
 ['2N6T', 'A', [19, 20, 21, 22, 23, 24]],
 ['1FYP', 'A', [11, 12, 13, 14, 15, 16]],
 ['4XWF', 'A', [24, 25, 26, 27, 28, 29]],
 ['4ZNP', 'A', [26, 27, 28, 29, 30, 31]],
 ['2LI4', 'A', [14, 15, 16, 17, 18, 19]],
 ['2M5U', 'A', [9, 10, 11, 12, 13, 14]],
 ['4PCJ', 'A', [15, 16, 17, 18, 19, 20]],
 ['3E5C', 'A', [17, 18, 19, 20, 21, 22]],
 ['1N8X', 'A', [15, 16, 17, 18, 19, 20]],
 ['5TBW', 'AS', [36, 37, 38, 39, 40, 41]],
 ['1K4B', 'A', [5, 6, 7, 8, 9, 10]],
 ['5IEM', 'A', [25, 26, 27, 28, 29, 30]],
 ['2PCV', 'A', [15, 16, 17, 18, 19, 20]],
 ['4PLX', 'A', [23, 24, 25, 26, 27, 28]],
 ['1JUR', 'A', [8, 9, 10, 11, 12, 13]],
 ['1U63', 'D', [20, 21, 22, 23, 24, 25]],
 ['2JSE', 'A', [9, 10, 11, 12, 13, 14]],
 ['2LU0', 'A', [14, 15, 16, 17, 18, 19]],
 ['1QWA', 'A', [8, 9, 10, 11, 12, 13]],
 ['1Q96', 'A', [12, 13, 14, 15, 16, 17]],
 ['2HW8', 'B', [14, 15, 16, 17, 18, 19]],
 ['4W90', 'C', [14, 15, 16, 17, 18, 1

In [51]:
parser = MMCIFParser(QUIET=True)

def get_atom_coordinates_hairpins(motifs_ids, atom, ref):
    df = pd.DataFrame(index = range(len(motifs_ids)), columns=['N1', 'N2', 'N3', 'N4', 'N5', 'N6'])
    ids = []
    har_seqs = []
    for i, motif in enumerate(motifs_ids):
        motif_cords = []
        file_name = motif[0].lower() + '.cif'
        ref_seq = ref[ref['ID'] == motif[0]]['seq'] 
        ref_seq = ref_seq.reset_index(drop=True)
        #if file_name == '7lm1.cif': continue
        try:
            structure = parser.get_structure(file_name.rstrip('.cif'), f"../mmCIF_files/{file_name}")
        except:
            continue
        model = structure[0]
        try:
            chain = model[motif[1]]
        except:
            continue
        if motif[2][0] in chain:
            res = chain[motif[2][0]]
            if ref_seq[0][0] != res.resname:
                motif[2] = [x+1 for x in motif[2]]
        for j, motif_nuc in enumerate(motif[2]):
            if motif_nuc in chain:
                res = chain[motif_nuc]
                if ref_seq[0][j] != res.resname:
                    # print('NO')
                    # print(motif[0])
                    break
                    # try:
                    #     res = chain[motif_nuc+1]
                    # except:
                    #     print('no res')
                    #     break
                    # if res.resname != ref_seq[0][j]:
                    #     print('NOOOOOOOOOOOOO')
                    #     print(motif[0])
                    #     break
                if atom not in res:
                    print("ajjj")
                    break
                else: 
                    at = res[atom]
                    motif_cords.append(at.get_coord())
            else:
                break
        if len(motif_cords) == 6:
            df.iloc[i] = motif_cords
            ids.append(motif[0])
            har_seqs.append(ref_seq[0])
    df = df.dropna()
    df['id'] = ids
    return df, har_seqs

In [30]:
hairpins_database, hairpins_seqs = get_atom_coordinates_hairpins(hairpins_complete, "P", hairpins)
print(hairpins_database.shape)
hairpins_database

(49, 7)


Unnamed: 0,N1,N2,N3,N4,N5,N6,id
0,"[30.583, 153.958, 41.567]","[32.518, 156.801, 36.863]","[33.015, 157.402, 31.447]","[31.741, 152.956, 31.207]","[34.464, 147.178, 30.292]","[38.178, 143.315, 30.389]",4V8M
1,"[37.665, 99.768, -14.559]","[43.055, 101.788, -14.954]","[48.666, 101.759, -12.995]","[49.328, 99.236, -6.699]","[47.64, 98.13, -2.118]","[48.274, 102.173, 1.228]",3DHS
2,"[-2.329, 28.356, -4.974]","[-7.328, 30.395, -3.204]","[-8.623, 33.998, 2.128]","[-5.107, 35.782, 7.615]","[0.144, 32.402, 10.407]","[-1.238, 26.001, 11.974]",2N6T
3,"[9.761, -12.055, 22.195]","[7.705, -15.136, 27.208]","[2.684, -18.003, 28.481]","[-3.741, -18.494, 25.977]","[-5.442, -13.259, 22.069]","[-7.996, -9.527, 25.503]",1FYP
4,"[-48.749, 1.789, -25.454]","[-51.506, 1.35, -19.703]","[-51.344, 3.489, -13.665]","[-48.145, 5.93, -8.205]","[-41.949, 3.852, -7.602]","[-37.061, 1.878, -8.857]",4XWF
5,"[64.107, -26.222, -10.107]","[58.4, -27.557, -10.257]","[53.14, -27.14, -10.17]","[53.118, -22.395, -6.276]","[53.315, -21.704, -0.619]","[54.243, -20.099, 5.877]",4ZNP
6,"[7.008, 23.042, -1.793]","[4.052, 27.131, -5.104]","[-0.102, 31.285, -5.383]","[-2.802, 27.996, -0.25]","[-5.626, 23.542, 2.047]","[-7.736, 17.699, 1.256]",2LI4
7,"[5.175, 3.151, 12.233]","[8.102, 8.47, 11.871]","[5.537, 14.166, 8.715]","[-0.813, 12.691, 7.791]","[-7.52, 12.166, 9.293]","[-6.083, 11.516, 3.782]",2M5U
8,"[6.432, 2.262, -16.533]","[3.472, 6.091, -19.285]","[3.73, 10.561, -22.371]","[8.495, 13.489, -20.251]","[9.826, 17.828, -15.927]","[7.947, 19.87, -10.829]",4PCJ
9,"[13.068, 21.498, 3.114]","[8.223, 22.899, 1.018]","[3.334, 25.306, 1.75]","[2.65, 24.956, 7.704]","[-0.916, 21.037, 11.307]","[-2.911, 15.878, 12.022]",3E5C


In [57]:
hairpins_database = hairpins_database.drop('id', axis=1)

for column in hairpins_database.columns:
    hairpins_database[column] = hairpins_database[column].apply(lambda x: ','.join(map(str, x)))
    
hairpins_database['class'] = 0
hairpins_database

Unnamed: 0,N1,N2,N3,N4,N5,N6,class
0,"30.583,153.958,41.567","32.518,156.801,36.863","33.015,157.402,31.447","31.741,152.956,31.207","34.464,147.178,30.292","38.178,143.315,30.389",0
1,"37.665,99.768,-14.559","43.055,101.788,-14.954","48.666,101.759,-12.995","49.328,99.236,-6.699","47.64,98.13,-2.118","48.274,102.173,1.228",0
2,"-2.329,28.356,-4.974","-7.328,30.395,-3.204","-8.623,33.998,2.128","-5.107,35.782,7.615","0.144,32.402,10.407","-1.238,26.001,11.974",0
3,"9.761,-12.055,22.195","7.705,-15.136,27.208","2.684,-18.003,28.481","-3.741,-18.494,25.977","-5.442,-13.259,22.069","-7.996,-9.527,25.503",0
4,"-48.749,1.789,-25.454","-51.506,1.35,-19.703","-51.344,3.489,-13.665","-48.145,5.93,-8.205","-41.949,3.852,-7.602","-37.061,1.878,-8.857",0
5,"64.107,-26.222,-10.107","58.4,-27.557,-10.257","53.14,-27.14,-10.17","53.118,-22.395,-6.276","53.315,-21.704,-0.619","54.243,-20.099,5.877",0
6,"7.008,23.042,-1.793","4.052,27.131,-5.104","-0.102,31.285,-5.383","-2.802,27.996,-0.25","-5.626,23.542,2.047","-7.736,17.699,1.256",0
7,"5.175,3.151,12.233","8.102,8.47,11.871","5.537,14.166,8.715","-0.813,12.691,7.791","-7.52,12.166,9.293","-6.083,11.516,3.782",0
8,"6.432,2.262,-16.533","3.472,6.091,-19.285","3.73,10.561,-22.371","8.495,13.489,-20.251","9.826,17.828,-15.927","7.947,19.87,-10.829",0
9,"13.068,21.498,3.114","8.223,22.899,1.018","3.334,25.306,1.75","2.65,24.956,7.704","-0.916,21.037,11.307","-2.911,15.878,12.022",0


In [58]:
hairpins_database.to_csv("hairpins_database_P.csv")

with open("hairpins_database_P_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in hairpins_seqs:
        write.writerow([item])

### internal loops

In [33]:
loops = pd.read_csv('../rnasolo-3.218-dbn-csv/loops_u.txt', sep=' ', header=None, names=['ID', 'start_1', 'end_1', 'seq_1', 'braq_1', 'start_2', 'end_2', 'seq_2', 'braq_2'])
loops['chain'] = list(map(lambda x: x.split('_')[2].rsplit('-motifs.txt:Loop')[0], loops['ID']))
loops['ID'] = list(map(lambda x: x[:x.find('_')], loops['ID']))
#loops = loops[~loops['braq_1'].str.contains('[')]
loops = loops[~loops['chain'].str.contains('-')]
loops = loops[~loops['ID'].duplicated()]
loops = loops.reset_index(drop=True)
loops

Unnamed: 0,ID,start_1,end_1,seq_1,braq_1,start_2,end_2,seq_2,braq_2,chain
0,1C2W,77,79,GUC,(.(,107,109,GGC,).),B
1,1C2X,8,10,CGG,(.(,110,112,CUG,).),C
2,1D0T,3,5,GUC,(.(,12,14,GUC,).),A
3,1F1T,2,4,GAC,(.(,30,32,GAC,).),A
4,1FYP,4,6,GUC,(.(,22,24,GUC,).),A
...,...,...,...,...,...,...,...,...,...,...
336,7R81,10,12,GAU,[.[,1113,1115,AAU,].],A2
337,7RQB,574,576,CCU,(.(,585,587,AUG,).),1A
338,7RYG,275,277,UUA,(.(,351,353,UAA,).),A
339,7SAM,96,98,GGC,(.(,110,112,GUC,).),A


In [34]:
loops_ids = [
    "1C2X", "1FYP", "1JO7", "1MZP", "1P5M", "1P5P", "1RHT", "1XWU", "2GV4", "2HGH",
    "2L2J", "2LUN", "2LWK", "2N6S", "3D0U", "3IGI", "3IYR", "3J7O", "3JBV", "3RW6",
    "3V7E", "4AOB", "4P8Z", "4Y1M", "5AOX", "5B2P", "5F5F", "5F9R", "5FJC", "5O60",
    "5T7V", "5ZEY", "6B44", "6CHR", "6DU5", "6JQ6", "6O8W", "6ORD", "6Q9A", "6SGB",
    "6UZ7", "6Y2L", "6YDW", "6ZMO", "7AC7", "7AQC", "7D1A", "7JQQ", "7L0Z", "7MDZ",
    "7NSH", "7O80"
]

loops_complete = []
for index, row in loops.iterrows():
    if row['ID'] in loops_ids:
        if len(loops_complete) == 150:
            break
        seqs = list(range(int(row['start_1']), int(row['end_1']+1))) + list(range(int(row['start_2']), int(row['end_2']+1)))
        loops_complete.append([row['ID'], row['chain'], seqs])          

print(len(loops_complete))
loops_complete

52


[['1C2X', 'C', [8, 9, 10, 110, 111, 112]],
 ['1FYP', 'A', [4, 5, 6, 22, 23, 24]],
 ['1JO7', 'A', [2, 3, 4, 28, 29, 30]],
 ['1MZP', 'B', [4, 5, 6, 50, 51, 52]],
 ['1P5M', 'A', [25, 26, 27, 34, 35, 36]],
 ['1P5P', 'A', [25, 26, 27, 56, 57, 58]],
 ['1RHT', 'A', [3, 4, 5, 19, 20, 21]],
 ['1XWU', 'A', [4, 5, 6, 10, 11, 12]],
 ['2GV4', 'A', [5, 6, 7, 16, 17, 18]],
 ['2HGH', 'B', [30, 31, 32, 37, 38, 39]],
 ['2L2J', 'A', [4, 5, 6, 37, 38, 39]],
 ['2LUN', 'A', [4, 5, 6, 23, 24, 25]],
 ['2LWK', 'A', [8, 9, 10, 24, 25, 26]],
 ['2N6S', 'A', [14, 15, 16, 21, 22, 23]],
 ['3D0U', 'A', [112, 113, 114, 134, 135, 136]],
 ['3IGI', 'A', [162, 163, 164, 211, 212, 213]],
 ['3IYR', 'A', [58, 59, 60, 73, 74, 75]],
 ['3J7O', '7', [68, 69, 70, 105, 106, 107]],
 ['3JBV', 'W', [2, 3, 4, 69, 70, 71]],
 ['3RW6', 'H', [23, 24, 25, 32, 33, 34]],
 ['3V7E', 'C', [25, 26, 27, 98, 99, 100]],
 ['4AOB', 'A', [25, 26, 27, 66, 67, 68]],
 ['4P8Z', 'A', [49, 50, 51, 139, 140, 141]],
 ['4Y1M', 'B', [3, 4, 5, 102, 103, 104]],
 

In [35]:
loops_databse, loops_seqs = get_atom_coordinates_stem(loops_complete, "P", loops)

In [36]:
print(loops_databse.shape)
loops_databse

(52, 7)


Unnamed: 0,N1,N2,N3,N4,N5,N6,id
0,"[33.631, 103.346, 9.224]","[34.982, 97.909, 8.354]","[37.068, 92.945, 10.114]","[43.34, 103.435, 23.356]","[47.872, 102.094, 20.23]","[50.527, 101.6, 15.248]",1C2X
1,"[-1.161, 6.133, 31.961]","[-5.561, 3.317, 28.943]","[-7.642, 1.368, 23.128]","[9.697, 9.568, 16.819]","[5.931, 11.905, 15.516]","[-0.52, 14.52, 16.128]",1FYP
2,"[-2.089, -10.556, 4.6]","[0.771, -8.884, -0.456]","[0.464, -7.901, -6.063]","[-11.95, -16.324, -0.096]","[-11.513, -20.134, -4.047]","[-6.697, -23.943, -5.146]",1JO7
3,"[9.493, 45.378, 26.905]","[9.755, 51.746, 28.545]","[12.136, 57.3, 28.852]","[27.471, 48.919, 21.013]","[26.526, 52.369, 26.469]","[25.537, 54.289, 31.301]",1MZP
4,"[-28.163, 0.251, 0.136]","[-30.679, -2.933, -4.577]","[-31.107, -7.735, -6.389]","[-16.862, -14.714, 1.42]","[-14.956, -12.927, -3.82]","[-15.288, -9.395, -8.374]",1P5M
5,"[-15.113, 24.985, -14.36]","[-17.029, 24.257, -20.465]","[-17.997, 19.01, -23.94]","[-9.347, 10.618, -12.278]","[-5.836, 10.074, -17.594]","[-4.083, 12.349, -21.701]",1P5P
6,"[6.047, -12.222, 5.815]","[0.932, -11.139, 1.298]","[-4.862, -9.239, 0.223]","[-8.995, 9.366, 1.774]","[-9.94, 5.09, 6.934]","[-7.688, 1.175, 10.323]",1RHT
7,"[9.016, 3.128, -2.986]","[4.483, 6.905, -5.173]","[-1.655, 9.737, -4.988]","[-11.284, 4.725, 8.28]","[-9.17, -2.256, 8.812]","[-9.181, -4.54, 2.194]",1XWU
8,"[-299.289, -18.106, 547.261]","[-296.526, -12.91, 547.67]","[-291.935, -8.767, 547.304]","[-284.461, -26.985, 544.941]","[-282.282, -24.019, 549.743]","[-283.947, -21.383, 555.059]",2GV4
9,"[-10.148, -17.35, 8.405]","[-11.985, -18.564, 2.716]","[-13.017, -20.922, -2.126]","[3.897, -27.791, 1.161]","[3.824, -22.897, -3.219]","[2.167, -18.219, -5.61]",2HGH


In [37]:
loops_database = loops_databse.drop('id', axis=1)

for column in loops_database.columns:
    loops_database[column] = loops_database[column].apply(lambda x: ','.join(map(str, x)))
    
loops_database['class'] = 0
loops_database.to_csv("loops_database_P.csv")

In [38]:
with open("loops_database_P_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in loops_seqs:
        write.writerow([item])

### ends

In [40]:
ends = pd.read_csv('../rnasolo-3.218-dbn-csv/single_ends.txt', sep=' ', header=None, names=['ID', 'start', 'end', 'seq', 'braq'])
ends['chain'] = list(map(lambda x: x.split('_')[2].rsplit('-motifs.txt:SingleStrand3p')[0], ends['ID']))
ends.loc[ends['chain'].str.len() > 6, 'chain']  = ends[ends['chain'].str.len() > 6 ]['chain'].map(lambda x: x[:x.find('-')])
ends['ID'] = list(map(lambda x: x[:x.find('_')], ends['ID']))
ends = ends[~ends['chain'].str.contains('-')]
#ends = ends[~ends['ID'].duplicated()]
ends = ends.reset_index(drop=True)
ends

Unnamed: 0,ID,start,end,seq,braq,chain
0,1F7U,59,64,AAGCCA,).....,B
1,1KXK,1,6,GUCUAC,.....(,A
2,1QTQ,69,74,CAGCCA,).....,B
3,1XOK,22,27,GCCCCU,).....,A
4,2R8S,1,6,GGAAUU,.....(,R
...,...,...,...,...,...,...
77,7NHM,72,77,CAACCA,).....,D
78,7NWI,71,76,ACACCA,).....,2
79,7OGM,44,49,UUUUUU,).....,P
80,7OSM,70,75,GCACCA,).....,ASIT


In [41]:
ends_complete = []
for index, row in ends.iterrows():
    seqs = list(range(int(row['start']), int(row['end']+1)))
    ends_complete.append([row['ID'], row['chain'], seqs])

print(len(ends_complete))
ends_complete

82


[['1F7U', 'B', [59, 60, 61, 62, 63, 64]],
 ['1KXK', 'A', [1, 2, 3, 4, 5, 6]],
 ['1QTQ', 'B', [69, 70, 71, 72, 73, 74]],
 ['1XOK', 'A', [22, 23, 24, 25, 26, 27]],
 ['2R8S', 'R', [1, 2, 3, 4, 5, 6]],
 ['3DKN', 'F', [1, 2, 3, 4, 5, 6]],
 ['3J0L', '1', [45, 46, 47, 48, 49, 50]],
 ['3J0P', 'W', [72, 73, 74, 75, 76, 77]],
 ['3J16', 'J', [1, 2, 3, 4, 5, 6]],
 ['3J5S', 'B', [1, 2, 3, 4, 5, 6]],
 ['3J92', '2', [69, 70, 71, 72, 73, 74]],
 ['3TRZ', 'U', [15, 16, 17, 18, 19, 20]],
 ['3TS2', 'U', [18, 19, 20, 21, 22, 23]],
 ['3TS2', 'V', [18, 19, 20, 21, 22, 23]],
 ['3ZN8', 'G', [83, 84, 85, 86, 87, 88]],
 ['4ATO', 'G', [1, 2, 3, 4, 5, 6]],
 ['4ATO', 'G', [28, 29, 30, 31, 32, 33]],
 ['4ENC', 'A', [47, 48, 49, 50, 51, 52]],
 ['4ILM', 'C', [1, 2, 3, 4, 5, 6]],
 ['4JXZ', 'B', [66, 67, 68, 69, 70, 71]],
 ['4K4U', 'B', [1, 2, 3, 4, 5, 6]],
 ['4QOZ', 'A', [1, 2, 3, 4, 5, 6]],
 ['4QOZ', 'A', [21, 22, 23, 24, 25, 26]],
 ['4TUE', 'QV', [72, 73, 74, 75, 76, 77]],
 ['4TUW', 'C', [23, 24, 25, 26, 27, 28]],
 ['

In [42]:
ends_databse, ends_seqs = get_atom_coordinates_hairpins(ends_complete, "P", ends)

ajjj
ajjj
ajjj
ajjj
ajjj
ajjj
ajjj


In [43]:
print(ends_databse.shape)
ends_databse

(20, 7)


Unnamed: 0,N1,N2,N3,N4,N5,N6,id
17,"[0.398, 2.535, -14.457]","[-1.132, 2.929, -20.016]","[-2.183, 6.277, -24.394]","[-0.996, 11.207, -26.881]","[0.138, 17.986, -26.947]","[5.448, 21.065, -26.083]",4ENC
24,"[-7.563, -26.974, 5.094]","[-11.12, -22.95, 7.218]","[-15.938, -19.654, 7.068]","[-21.659, -18.84, 5.414]","[-26.286, -21.847, 4.737]","[-29.039, -25.947, 6.804]",4TUW
26,"[-127.737, -19.668, -103.48]","[-122.826, -21.89, -100.351]","[-117.754, -21.564, -99.102]","[-113.956, -16.351, -98.506]","[-111.244, -16.764, -92.407]","[-107.947, -15.612, -90.623]",4V4R
34,"[-101.61, -23.813, 9.458]","[-99.201, -29.345, 10.989]","[-99.129, -35.316, 10.655]","[-99.106, -39.636, 6.236]","[-93.671, -41.365, 7.71]","[-90.337, -45.384, 6.952]",4WT8
41,"[54.076, -24.948, -1.353]","[54.684, -30.863, 1.01]","[55.839, -31.721, 6.371]","[54.038, -29.348, 11.329]","[54.96, -33.3, 16.343]","[56.702, -35.639, 21.075]",5K36
46,"[50.994, 77.831, 92.006]","[53.401, 79.0, 86.598]","[55.06, 81.172, 81.142]","[60.055, 83.465, 81.393]","[64.178, 84.327, 77.181]","[58.684, 87.952, 78.591]",5XWY
50,"[-1.623, 72.881, 26.443]","[-4.047, 76.631, 20.75]","[-6.927, 80.232, 24.473]","[-10.422, 84.528, 23.521]","[-13.449, 84.047, 19.002]","[-12.9, 80.171, 14.457]",6D6V
51,"[239.373, 183.975, 252.072]","[236.484, 178.783, 250.053]","[232.728, 174.674, 248.391]","[227.538, 172.711, 246.107]","[224.237, 166.69, 246.366]","[228.875, 162.185, 249.429]",6HIW
54,"[222.692, 207.989, 261.734]","[228.207, 205.585, 263.114]","[233.248, 201.539, 262.142]","[235.875, 196.935, 259.052]","[235.408, 191.635, 256.049]","[235.79, 190.693, 250.272]",6OKK
58,"[222.991, 230.25, 206.21]","[222.682, 224.891, 205.307]","[225.259, 219.627, 204.944]","[230.355, 217.105, 207.211]","[226.457, 211.507, 209.269]","[226.615, 207.428, 210.947]",6Q9A


In [44]:
ends_database = ends_databse.drop('id', axis=1)
for column in ends_database.columns:
    ends_database[column] = ends_database[column].apply(lambda x: ','.join(map(str, x)))
    
ends_database['class'] = 0
ends_database.to_csv("ends_database_P.csv")
ends_database

with open("ends_database_P_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in ends_seqs:
        write.writerow([item])

### mix

In [45]:
import os
from Bio.PDB.MMCIFParser import MMCIFParser

files = os.listdir('../rnasolo-3.218-dbn-csv/rnasolo-3.218/dbn/')
files = [file for file in files if file.endswith('.dbn')]
files.sort()
print(files)
print(len(files))

['124D_1_B.dbn', '176D_1_B.dbn', '17RA_7_A.dbn', '1A1T_1_B.dbn', '1A3M_1_A-B.dbn', '1A4D_1_A-B.dbn', '1A4T_1_A.dbn', '1A51_3_A.dbn', '1A60_1_A.dbn', '1A9L_1_A.dbn', '1A9N_1_Q.dbn', '1AC3_1_B.dbn', '1AFX_2_A.dbn', '1AJF_1_A.dbn', '1AJL_1_A-B.dbn', '1AJT_1_A-B.dbn', '1AJU_9_A.dbn', '1AKX_1_A.dbn', '1AL5_1_A-B.dbn', '1AM0_2_A.dbn', '1ANR_2_A.dbn', '1AQO_3_A.dbn', '1ARJ_7_N.dbn', '1ATO_1_A.dbn', '1ATV_1_A.dbn', '1ATW_1_A.dbn', '1AUD_8_B.dbn', '1AV6_1_B.dbn', '1B23_1_R.dbn', '1B36_8_A.dbn', '1B7F_1_P.dbn', '1B7F_1_Q.dbn', '1BAU_1_A-B.dbn', '1BGZ_1_A.dbn', '1BIV_5_A.dbn', '1BJ2_1_A-B.dbn', '1BMV_1_M.dbn', '1BN0_2_A.dbn', '1BR3_1_A.dbn', '1BVJ_1_A.dbn', '1BYX_1_A.dbn', '1BZ2_1_A.dbn', '1BZT_1_A.dbn', '1BZU_1_A.dbn', '1C0O_1_A.dbn', '1C2W_1_B.dbn', '1C2X_1_C.dbn', '1C4L_1_A-B.dbn', '1C9S_1_W.dbn', '1CGM_1_I.dbn', '1CQL_10_A.dbn', '1CSL_1_B-A.dbn', '1CVJ_1_M.dbn', '1CVJ_1_N.dbn', '1CVJ_1_Q.dbn', '1CWP_1_D.dbn', '1CWP_1_F.dbn', '1CX5_1_B.dbn', '1D0T_1_A.dbn', '1D0U_14_A.dbn', '1D4R_1_B-A.dbn', '

In [46]:
import random
import re
mixs_seqs = []
for file in files:
    try:
        f = open(f"../rnasolo-3.218-dbn-csv/rnasolo-3.218/dbn/{file}", "r")
        seq = f.read().split('\n')
        if len(seq) == 2:
             if len(seq[1]) > 11:
                num = random.randint(2, len(seq[1])-7)
                br = seq[1][num:num+6]
                if '(' in br and '.' in br:
                    if len(re.findall(r'\(', br)) >= 2 and len(re.findall(r'\.', br)) >= 2:
                        mixs_seqs.append([file.rstrip('.dbn'), seq[0][num:num+6], seq[1][num:num+6], num, num+5])
    finally:
        f.close()
print(mixs_seqs)
print(len(mixs_seqs))

[['17RA_7_A', 'GGAUUA', '((....', 7, 12], ['1A1T_1_B', 'AGCGGA', '(((...', 5, 10], ['1A3M_1_A-B', 'CACCUU', '((((..', 7, 12], ['1A51_3_A', 'GGUAGU', '....((', 7, 12], ['1A9N_1_Q', 'UGGUAU', '(((...', 2, 7], ['1AJU_9_A', 'UUGAGC', '..((((', 7, 12], ['1AKX_1_A', 'GAGCCU', '((((..', 9, 14], ['1ARJ_7_N', 'GAGCCU', '((((..', 9, 14], ['1B23_1_R', 'AGUCCG', '..((((', 38, 43], ['1BVJ_1_A', 'GGUGUA', '(((...', 6, 11], ['1C0O_1_A', 'UCUUCG', '((....', 3, 8], ['1CQL_10_A', 'CAGGUC', '....((', 14, 19], ['1D4R_1_B-A', 'GGAGCG', '....((', 11, 16], ['1DFU_1_M-N', 'AGUAGG', '....((', 10, 15], ['1E4P_8_A', 'GACGAA', '(((...', 7, 12], ['1E7K_1_C', 'AUGAGG', '....((', 4, 9], ['1E8O_1_E', 'UAGUCC', '...(((', 25, 30], ['1EBS_1_A', 'AGCUUC', '(((...', 9, 14], ['1EHT_3_A', 'GAUACC', '(...((', 3, 8], ['1EKZ_4_B', 'CCCUUC', '(((...', 10, 15], ['1F27_1_A', 'CGUCAG', '(((...', 2, 7], ['1F5H_1_A-B', 'AGGCGU', '(.((.(', 2, 7], ['1F84_10_A', 'AGUGUU', '.(((..', 8, 13], ['1F85_1_A', 'CCUGAU', '(((...', 2, 7], ['1FEU

In [47]:
mixs2 = pd.DataFrame(columns=['ID', 'start', 'end', 'seq', 'braq', 'chain'])
for i, mix in enumerate(mixs_seqs):
    mixs2.loc[i] = [ mix[0][:mix[0].find('_')], mix[-2], mix[-1], mix[1], mix[2], mix[0].split('_')[2]]
mixs = mixs2[~mixs2['chain'].str.contains('-')]
mixs = mixs[~mixs['ID'].duplicated()]
mixs = mixs.reset_index(drop=True)
mixs

Unnamed: 0,ID,start,end,seq,braq,chain
0,17RA,7,12,GGAUUA,((....,A
1,1A1T,5,10,AGCGGA,(((...,B
2,1A51,7,12,GGUAGU,....((,A
3,1A9N,2,7,UGGUAU,(((...,Q
4,1AJU,7,12,UUGAGC,..((((,A
...,...,...,...,...,...,...
359,7OSM,10,15,CUCAGU,(.(...,ASIT
360,7OZQ,10,15,GGUCGA,((((..,H
361,7P3K,50,55,GGUUCA,((..].,V
362,7QEP,728,733,AAUGCG,((.((.,1


In [48]:
mixs = pd.DataFrame(columns=['ID', 'start', 'end', 'seq', 'braq', 'chain'])
for i, mix in enumerate(mixs_seqs):
    mixs.loc[i] = [ mix[0][:mix[0].find('_')], mix[-2], mix[-1], mix[1], mix[2], mix[0].split('_')[2]]
mixs = mixs[~mixs['chain'].str.contains('-')]
mixs = mixs[~mixs['ID'].duplicated()]
mixs = mixs.reset_index(drop=True)
mixs

Unnamed: 0,ID,start,end,seq,braq,chain
0,17RA,7,12,GGAUUA,((....,A
1,1A1T,5,10,AGCGGA,(((...,B
2,1A51,7,12,GGUAGU,....((,A
3,1A9N,2,7,UGGUAU,(((...,Q
4,1AJU,7,12,UUGAGC,..((((,A
...,...,...,...,...,...,...
359,7OSM,10,15,CUCAGU,(.(...,ASIT
360,7OZQ,10,15,GGUCGA,((((..,H
361,7P3K,50,55,GGUUCA,((..].,V
362,7QEP,728,733,AAUGCG,((.((.,1


In [49]:
mix_complete = []
for index, row in mixs.iterrows():
    seqs = list(range(int(row['start']), int(row['end']+1)))
    mix_complete.append([row['ID'], row['chain'], seqs])

print(len(mix_complete))
mix_complete

364


[['17RA', 'A', [7, 8, 9, 10, 11, 12]],
 ['1A1T', 'B', [5, 6, 7, 8, 9, 10]],
 ['1A51', 'A', [7, 8, 9, 10, 11, 12]],
 ['1A9N', 'Q', [2, 3, 4, 5, 6, 7]],
 ['1AJU', 'A', [7, 8, 9, 10, 11, 12]],
 ['1AKX', 'A', [9, 10, 11, 12, 13, 14]],
 ['1ARJ', 'N', [9, 10, 11, 12, 13, 14]],
 ['1B23', 'R', [38, 39, 40, 41, 42, 43]],
 ['1BVJ', 'A', [6, 7, 8, 9, 10, 11]],
 ['1C0O', 'A', [3, 4, 5, 6, 7, 8]],
 ['1CQL', 'A', [14, 15, 16, 17, 18, 19]],
 ['1E4P', 'A', [7, 8, 9, 10, 11, 12]],
 ['1E7K', 'C', [4, 5, 6, 7, 8, 9]],
 ['1E8O', 'E', [25, 26, 27, 28, 29, 30]],
 ['1EBS', 'A', [9, 10, 11, 12, 13, 14]],
 ['1EHT', 'A', [3, 4, 5, 6, 7, 8]],
 ['1EKZ', 'B', [10, 11, 12, 13, 14, 15]],
 ['1F27', 'A', [2, 3, 4, 5, 6, 7]],
 ['1F84', 'A', [8, 9, 10, 11, 12, 13]],
 ['1F85', 'A', [2, 3, 4, 5, 6, 7]],
 ['1FQZ', 'A', [7, 8, 9, 10, 11, 12]],
 ['1G70', 'A', [11, 12, 13, 14, 15, 16]],
 ['1HJI', 'A', [3, 4, 5, 6, 7, 8]],
 ['1HLX', 'A', [5, 6, 7, 8, 9, 10]],
 ['1HWQ', 'A', [5, 6, 7, 8, 9, 10]],
 ['1I3Y', 'A', [4, 5, 6, 7, 8, 

In [52]:
mix_databse, mix_sequences = get_atom_coordinates_hairpins(mix_complete, "P", mixs)

ajjj
ajjj


In [53]:
mix_databse

Unnamed: 0,N1,N2,N3,N4,N5,N6,id
0,"[15.175, 7.152, 4.632]","[19.503, 11.838, 5.902]","[24.574, 15.17, 5.346]","[28.576, 17.995, 0.621]","[29.104, 20.03, -5.029]","[29.931, 14.883, -10.657]",17RA
8,"[-0.14, 3.004, -39.68]","[-5.049, 1.391, -37.046]","[-9.883, 1.595, -34.022]","[-11.96, 4.54, -28.925]","[-10.014, 9.153, -24.612]","[-7.041, 14.033, -24.523]",1BVJ
9,"[1.67, 0.139, -8.289]","[-2.844, 3.575, -7.03]","[-4.561, 8.537, -4.263]","[-2.546, 13.122, -0.376]","[1.748, 13.961, 5.103]","[5.24, 8.254, 6.148]",1C0O
10,"[8.684, 17.098, 7.823]","[12.52, 19.928, 11.917]","[15.11, 22.057, 17.867]","[20.912, 22.951, 20.545]","[23.422, 28.331, 16.492]","[24.44, 31.345, 10.851]",1CQL
11,"[174.954, -154.448, -92.421]","[174.096, -147.764, -90.656]","[173.81, -141.718, -88.916]","[172.211, -136.916, -90.784]","[168.435, -135.27, -96.194]","[168.344, -141.446, -98.495]",1E4P
...,...,...,...,...,...,...,...
325,"[213.413, 246.037, 231.173]","[211.219, 247.474, 225.957]","[206.225, 246.934, 222.896]","[200.943, 245.181, 220.443]","[195.687, 242.353, 223.466]","[191.665, 237.485, 224.187]",6YDP
332,"[150.932, 276.305, 185.842]","[145.45, 274.849, 188.189]","[141.295, 272.049, 191.246]","[140.338, 268.944, 196.323]","[141.186, 267.96, 201.285]","[145.966, 271.737, 204.778]",7AQC
351,"[239.882, 287.814, 260.478]","[238.802, 288.054, 254.756]","[240.153, 285.787, 249.222]","[242.802, 282.222, 246.043]","[248.389, 280.423, 246.221]","[249.264, 277.834, 251.351]",7NSH
355,"[244.255, 120.143, 323.05]","[241.278, 124.988, 321.578]","[241.413, 131.147, 321.088]","[235.636, 134.747, 319.95]","[237.777, 138.378, 324.462]","[236.782, 142.968, 328.811]",7OHQ


In [54]:
print(mix_databse.shape)
mix_databse

(76, 7)


Unnamed: 0,N1,N2,N3,N4,N5,N6,id
0,"[15.175, 7.152, 4.632]","[19.503, 11.838, 5.902]","[24.574, 15.17, 5.346]","[28.576, 17.995, 0.621]","[29.104, 20.03, -5.029]","[29.931, 14.883, -10.657]",17RA
8,"[-0.14, 3.004, -39.68]","[-5.049, 1.391, -37.046]","[-9.883, 1.595, -34.022]","[-11.96, 4.54, -28.925]","[-10.014, 9.153, -24.612]","[-7.041, 14.033, -24.523]",1BVJ
9,"[1.67, 0.139, -8.289]","[-2.844, 3.575, -7.03]","[-4.561, 8.537, -4.263]","[-2.546, 13.122, -0.376]","[1.748, 13.961, 5.103]","[5.24, 8.254, 6.148]",1C0O
10,"[8.684, 17.098, 7.823]","[12.52, 19.928, 11.917]","[15.11, 22.057, 17.867]","[20.912, 22.951, 20.545]","[23.422, 28.331, 16.492]","[24.44, 31.345, 10.851]",1CQL
11,"[174.954, -154.448, -92.421]","[174.096, -147.764, -90.656]","[173.81, -141.718, -88.916]","[172.211, -136.916, -90.784]","[168.435, -135.27, -96.194]","[168.344, -141.446, -98.495]",1E4P
...,...,...,...,...,...,...,...
325,"[213.413, 246.037, 231.173]","[211.219, 247.474, 225.957]","[206.225, 246.934, 222.896]","[200.943, 245.181, 220.443]","[195.687, 242.353, 223.466]","[191.665, 237.485, 224.187]",6YDP
332,"[150.932, 276.305, 185.842]","[145.45, 274.849, 188.189]","[141.295, 272.049, 191.246]","[140.338, 268.944, 196.323]","[141.186, 267.96, 201.285]","[145.966, 271.737, 204.778]",7AQC
351,"[239.882, 287.814, 260.478]","[238.802, 288.054, 254.756]","[240.153, 285.787, 249.222]","[242.802, 282.222, 246.043]","[248.389, 280.423, 246.221]","[249.264, 277.834, 251.351]",7NSH
355,"[244.255, 120.143, 323.05]","[241.278, 124.988, 321.578]","[241.413, 131.147, 321.088]","[235.636, 134.747, 319.95]","[237.777, 138.378, 324.462]","[236.782, 142.968, 328.811]",7OHQ


In [55]:
mix_database = mix_databse.drop('id', axis=1)
for column in mix_database.columns:
    mix_database[column] = mix_database[column].apply(lambda x: ','.join(map(str, x)))
    
mix_database['class'] = 0
mix_database.to_csv("mix_database_P'.csv")
mix_database

Unnamed: 0,N1,N2,N3,N4,N5,N6,class
0,"15.175,7.152,4.632","19.503,11.838,5.902","24.574,15.17,5.346","28.576,17.995,0.621","29.104,20.03,-5.029","29.931,14.883,-10.657",0
8,"-0.14,3.004,-39.68","-5.049,1.391,-37.046","-9.883,1.595,-34.022","-11.96,4.54,-28.925","-10.014,9.153,-24.612","-7.041,14.033,-24.523",0
9,"1.67,0.139,-8.289","-2.844,3.575,-7.03","-4.561,8.537,-4.263","-2.546,13.122,-0.376","1.748,13.961,5.103","5.24,8.254,6.148",0
10,"8.684,17.098,7.823","12.52,19.928,11.917","15.11,22.057,17.867","20.912,22.951,20.545","23.422,28.331,16.492","24.44,31.345,10.851",0
11,"174.954,-154.448,-92.421","174.096,-147.764,-90.656","173.81,-141.718,-88.916","172.211,-136.916,-90.784","168.435,-135.27,-96.194","168.344,-141.446,-98.495",0
...,...,...,...,...,...,...,...
325,"213.413,246.037,231.173","211.219,247.474,225.957","206.225,246.934,222.896","200.943,245.181,220.443","195.687,242.353,223.466","191.665,237.485,224.187",0
332,"150.932,276.305,185.842","145.45,274.849,188.189","141.295,272.049,191.246","140.338,268.944,196.323","141.186,267.96,201.285","145.966,271.737,204.778",0
351,"239.882,287.814,260.478","238.802,288.054,254.756","240.153,285.787,249.222","242.802,282.222,246.043","248.389,280.423,246.221","249.264,277.834,251.351",0
355,"244.255,120.143,323.05","241.278,124.988,321.578","241.413,131.147,321.088","235.636,134.747,319.95","237.777,138.378,324.462","236.782,142.968,328.811",0


In [56]:
with open("mix_database_P_seqs.csv", 'w', newline='') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
    for item in mix_sequences:
        write.writerow([item])