(info about pisces dataset) 
Here we use the CIF files from the PISCES dataset to extract the backbone angles (phi and psi; omega?) and the sequences. The sequences have also been mapped into an embedding space with the ProtBert model for ease of use. All chains used here are shorter than 129 residues. 

## Importing libraries

In [1]:
#accessing files
import os

#manipulation
import numpy as np
import torch
import pandas as pd

#for python object structure
import pickle

#biopython for parsing the cif files and manipulation of the cif files
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from Bio import PDB

#data structures for manipulation
from typing import Dict, Tuple

#transformer model for embedding space creation
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Changing to GPU

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

## Getting the sequences with a length of at most 128

In [3]:
# PISCES culled chain list (whitespace-separated table with 'PDBchain' and 'len' columns)
file_path = 'cullpdb_pc30_0_res0_0_2_0_noBrks_noDsdr_len40_10000_R0_25_Nmr_d2024'
# sep=r'\s+' replaces the deprecated delim_whitespace=True
df = pd.read_csv(file_path, sep=r'\s+')

# Keep only the chains with at most 128 residues
df = df[df['len'] <= 128]
pdb_ids = df['PDBchain'].tolist()
print(pdb_ids)

# We need the chain access information: map the id without its last character
# (the chain letter) to the full PISCES id.
# NOTE(review): when the list holds several chains of the same entry, only the
# last one survives in this dict - confirm that this is intended.
pdb_id_map = {pdb_id[:-1]: pdb_id for pdb_id in pdb_ids}
print(pdb_id_map)

# Store the mapping with lower-case keys, one "key,value" row per entry
mapping_file = 'new_training/pdb_id_mapping.csv'
os.makedirs(os.path.dirname(mapping_file), exist_ok=True)
pd.Series({k.lower(): v for k, v in pdb_id_map.items()}).to_csv(mapping_file, header=False)

['1E0ZA', '1RY4A', '1UG7A', '1WHNA', '2AYAA', '2D87A', '2DB5A', '2DKPA', '2EBKA', '2ETTA', '2FM4A', '2I4KA', '2JQ5A', '2K0RA', '2K5TA', '2LR4A', '2LS01', '2LVLA', '2MAHA', '2NCNA', '2RLOA', '2RV8A', '2YTUA', '4AR0A', '5AIWA', '5H7UA', '5N9QA', '8E22X', '1CMOA', '1DDFA', '1EALA', '1PQNA', '1QKLA', '1UEYA', '1W2QA', '1WH4A', '1WJRA', '1X5MA', '1ZU1A', '2CSOA', '2DMKA', '2EE7A', '2L1LB', '2LOEA', '2MF7A', '2MPLA', '2NBOA', '2TMPA', '4UEIA', '5M1UA', '7XRWA', '8OILA', '1RSFA', '1V5SA', '1X5EA', '1ZO0A', '2A4HA', '2CR4A', '2DAVA', '2IF1A', '2JP2A', '2JVNA', '2K2VA', '2L3GA', '2L57A', '2LBCA', '2LLNA', '2M26A', '3MSPA', '7F7NA', '7KNVA', '8SQXA', '1BUQA', '1BW3A', '1FR0A', '1GIOA', '1MH6A', '1PSYA', '1UENA', '1WYJA', '2AI6A', '2DHJA', '2K4VA', '2K4ZA', '2KA5A', '2KFPA', '2KYTA', '2L2CA', '2LDUA', '2LM0A', '2MDKA', '2MV4A', '2N2HB', '2NBGA', '2PRFA', '4A53A', '5KNWA', '5T3YA', '5WOEA', '6GWMA', '6TH8A', '6UF2A', '6XEHA', '7BQCA', '7PRDA', '7QYIA', '1DC7A', '1E8EA', '1EO1A', '1PIRA', '1T3VA', 

  df = pd.read_csv(file_path, delim_whitespace=True)


In [6]:

# ProtBert tokenizer (case is preserved) and encoder, with the encoder placed on `device`.
# TODO: change model and tokenizer to t5
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model_embedd = BertModel.from_pretrained("Rostlab/prot_bert").to(device)


## Dividing data into batches

In [5]:

def extract_input(pdb_file, model, tokenizer):
    '''
    Extract the sequence, backbone dihedral angles and embedding of one chain.

    The chain to read is looked up in 'new_training/pdb_id_mapping.csv' using the
    first four characters of the file name (lower-cased) as key; the last character
    of the mapped value is taken as the chain id.

    Input =>
    pdb_file : path of the structure file; it is parsed with MMCIFParser, so it has to be mmCIF (.cif)
    model : transformer model used for creating embedding space
    tokenizer : tokenizer for the sequence tokenization

    Output =>
    sequence : list of one-letter residue codes of the chain ('X' for anything that is not a standard amino acid)
    angle_tensor : float32 tensor with two rows on `device`; row 0 holds the C-N-CA-C dihedrals (phi),
                   row 1 the N-CA-C-N dihedrals (psi), each followed by one trailing 0
    encoded_input : encoded sequence using the tokenizer
    embeddings : last hidden state of the model for the encoded sequence

    All four values are None when the file name has no entry in the mapping file.
    '''
    # The mapping file tells which chain of the entry to use.
    # Everything is read as plain text so that ids are never converted to numbers or NaN.
    mapping_file = 'new_training/pdb_id_mapping.csv'
    mapping = pd.read_csv(mapping_file, header=None, names=['pdb_id', 'pdb_chain'],
                          dtype=str, keep_default_na=False)
    pdb_id_map = dict(zip(mapping['pdb_id'], mapping['pdb_chain']))

    # The first 4 letters of the file name are the dictionary key (stored lower-case)
    pdb_id = os.path.basename(pdb_file)[:4].lower()
    full_pdb_id = pdb_id_map.get(pdb_id)  # pdb id with the correct chain
    if full_pdb_id is None:
        # Unknown entry: report "no data" instead of failing on None[-1]
        return None, None, None, None
    print("full pd ib:", full_pdb_id)

    # Parsing the cif file; only the first model is used
    structure = MMCIFParser().get_structure("protein", pdb_file)
    chain = structure[0][full_pdb_id[-1]]  # the last letter of the id is the chain

    # Three-letter to one-letter amino acid codes
    d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
             'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
             'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
             'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    # Every residue of the chain contributes one letter; anything outside the table
    # (e.g. water and ions) becomes 'X', so the sequence can be longer than the angle rows.
    sequence = [d3to1.get(residue.get_resname(), 'X') for residue in chain]

    structure.atom_to_internal_coordinates()  # turns xyz coordinates into angles and bond lengths

    # Dihedrals of the chain, keyed by the four atoms that define them
    dihedra = chain.internal_coord.dihedra

    phi_angles_list = []  # C-N-CA-C
    psi_angles_list = []  # N-CA-C-N
    for key, dihedron in dihedra.items():
        atom_names = tuple(atom_key.akl[3] for atom_key in key)
        if atom_names == ('C', 'N', 'CA', 'C'):
            phi_angles_list.append(dihedron.angle)
        elif atom_names == ('N', 'CA', 'C', 'N'):
            psi_angles_list.append(dihedron.angle)

    # One trailing zero per row.
    # NOTE(review): presumably the first residue has no phi and the last has no psi, in
    # which case padding phi at the end pairs psi of residue j with phi of residue j+1 -
    # confirm whether phi should be padded at the front instead. Left unchanged because
    # it alters the saved data.
    phi_angles_list.append(0)
    psi_angles_list.append(0)

    phi = np.asarray(phi_angles_list, dtype=np.float32)
    psi = np.asarray(psi_angles_list, dtype=np.float32)
    angles = np.vstack((phi, psi))  # row 0: C-N-CA-C, row 1: N-CA-C-N
    angle_tensor = torch.tensor(angles, dtype=torch.float32).to(device)

    # Encoding the sequence; no tokenizer padding is requested here
    encoded_input = tokenizer.encode(sequence, return_tensors='pt').to(device)

    # Create the embedding using the model (inference only, no gradients)
    with torch.no_grad():
        outputs = model(input_ids=encoded_input)

    embeddings = outputs.last_hidden_state

    return sequence, angle_tensor, encoded_input, embeddings

In [7]:
df.shape[0]  # row count = number of proteins we need to extract in PISCES data

1723

You need to upload the full data folder here: put the folder with the PDB/CIF files at the directory path, and upload the inputs folder to the target path.

In [8]:
# Folder the raw structure files are read from
directory_path = 'small_proteins'
# Folder the processed data is written to
target_path = 'new_training/'

# Output files inside the target folder
sequences_file = os.path.join(target_path, 'sequences.pkl')
angles_file = os.path.join(target_path, 'angles.pt')
embeddings_file = os.path.join(target_path, 'embeddings.pt')

# Accumulators start with zero rows; each processed protein adds one zero-padded row,
# which is how all proteins end up with the same length.
# Rows are 131 wide: the 128-residue maximum plus a start and an end token needs 130,
# leaving one spare column.
sequences = []
angles = torch.zeros((0, 2, 131), device=device)
embeddings = torch.zeros((0, 131, 1024), device=device)

def pad_sequence(sequence, target_length=129, pad_char="X"):
    """Return a new list: `sequence` followed by enough `pad_char` items to reach `target_length`.

    A sequence that already has `target_length` or more items comes back
    unchanged (it is not truncated).
    May not be needed if the tokenizer does the padding itself.
    """
    missing = target_length - len(sequence)
    # range() of a non-positive number is empty, so nothing is added in that case
    return sequence + [pad_char for _ in range(missing)]

#Get list of files in the directory containing pdb/cif files
file_list = os.listdir(directory_path)

# Every protein is stored as one zero-padded row of this width
padded_length = 131
embedding_dim = 1024

# Rows are collected in lists and stacked once at the end. The row index is therefore
# independent of the position in file_list, so skipped entries cannot leave gaps, and
# the stored tensors are not re-copied for every file.
embedding_rows = []
angle_rows = []

for filename in file_list:
    cif_path = os.path.join(directory_path, filename)
    if not os.path.isfile(cif_path):
        continue

    # the key part that extracts all the information from a file:
    # extract_input returns (sequence, angles, token ids, embedding)
    sequence, angle, inputs, embedding = extract_input(cif_path, model_embedd, tokenizer)
    if sequence is None or embedding is None or angle is None:
        print(f"Skipping {cif_path} due to missing data")
        continue
    print(len(sequence))

    padded_sequence = pad_sequence(sequence)  # may not be needed if the tokenizer pads itself
    sequences.append(padded_sequence)

    # Copy the embedding into a zero row, cutting it off at the row width if it is longer
    embedding_length = min(embedding.size(1), padded_length)
    embedding_row = torch.zeros(padded_length, embedding_dim).to(device)
    embedding_row[:embedding_length] = embedding[0, :embedding_length, :]
    embedding_rows.append(embedding_row)

    # Same for the two angle rows
    angle_length = min(angle.size(1), padded_length)
    angle_row = torch.zeros(2, padded_length).to(device)
    angle_row[:, :angle_length] = angle[:, :angle_length]
    angle_rows.append(angle_row)

    print(len(embedding_rows))  # number of proteins stored so far

# Stack the rows; keep the empty (zero-row) shape when nothing was processed
if embedding_rows:
    embeddings = torch.stack(embedding_rows)
    angles = torch.stack(angle_rows)
else:
    embeddings = torch.zeros(0, padded_length, embedding_dim).to(device)
    angles = torch.zeros(0, 2, padded_length).to(device)

# Save the tensors and the padded sequences
os.makedirs(target_path, exist_ok=True)
torch.save(embeddings, embeddings_file)
torch.save(angles, angles_file)

with open(sequences_file, 'wb') as f: # you may not need to save the sequences
    pickle.dump(sequences, f)



start index:  0
['1e0z.cif', '1ry4.cif', '1ug7.cif', '1whn.cif', '2aya.cif', '2d87.cif', '2db5.cif', '2dkp.cif', '2ebk.cif', '2ett.cif', '2fm4.cif', '2i4k.cif', '2jq5.cif', '2k0r.cif', '2k5t.cif', '2lr4.cif', '2ls0.cif', '2lvl.cif', '2mah.cif', '2ncn.cif', '2rlo.cif', '2rv8.cif', '2ytu.cif', '4ar0.cif', '5aiw.cif', '5h7u.cif', '5n9q.cif', '8e22.cif', '1cmo.cif', '1ddf.cif', '1eal.cif', '1pqn.cif', '1qkl.cif', '1uey.cif', '1w2q.cif', '1wh4.cif', '1wjr.cif', '1x5m.cif', '1zu1.cif', '2cso.cif', '2dmk.cif', '2ee7.cif', '2l1l.cif', '2loe.cif', '2mf7.cif', '2mpl.cif', '2nbo.cif', '2tmp.cif', '4uei.cif', '5m1u.cif', '7xrw.cif', '8oil.cif', '1rsf.cif', '1v5s.cif', '1x5e.cif', '1zo0.cif', '2a4h.cif', '2cr4.cif', '2dav.cif', '2if1.cif', '2jp2.cif', '2jvn.cif', '2k2v.cif', '2l3g.cif', '2l57.cif', '2lbc.cif', '2lln.cif', '2m26.cif', '3msp.cif', '7f7n.cif', '7knv.cif', '8sqx.cif', '1buq.cif', '1bw3.cif', '1fr0.cif', '1gio.cif', '1mh6.cif', '1psy.cif', '1uen.cif', '1wyj.cif', '2ai6.cif', '2dhj.cif',



full pd ib: 1BUQA
letter:  0
126
73
73
full pd ib: 1BW3A
letter:  0
125
74
74
full pd ib: 1FR0A
letter:  0
125
75
75
full pd ib: 1GIOA
letter:  0
125
76
76
full pd ib: 1MH6A
letter:  0
125
77
77
full pd ib: 1PSYA
letter:  0
125
78
78
full pd ib: 1UENA
letter:  0
125
79
79
full pd ib: 1WYJA
letter:  0
125
80
80
full pd ib: 2AI6A
letter:  0
125
81
81
full pd ib: 2DHJA
letter:  0
125
82
82
full pd ib: 2K4VA
letter:  0
125
83
83
full pd ib: 2K4ZA
letter:  0
125
84
84
full pd ib: 2KA5A
letter:  0
125
85
85
full pd ib: 2KFPA
letter:  0
125
86
86
full pd ib: 2KYTA
letter:  0
125
87
87
full pd ib: 2L2CA
letter:  0
125
88
88
full pd ib: 2LDUA
letter:  0
125
89
89
full pd ib: 2LM0A
letter:  0
125
90
90
full pd ib: 2MDKA
letter:  0
125
91
91
full pd ib: 2MV4A
letter:  0
125
92
92
full pd ib: 2N2HB
letter:  1
125
93
93
full pd ib: 2NBGA
letter:  0
125
94
94
full pd ib: 2PRFA
letter:  0
125
95
95
full pd ib: 4A53A
letter:  0
125
96
96
full pd ib: 5KNWA
letter:  0
125
97
97
full pd ib: 5T3YA
letter:



full pd ib: 2P80D
letter:  3
124
152
152
full pd ib: 2YQGA
letter:  0
123
153
153
full pd ib: 6NBNA
letter:  0
124
154
154
full pd ib: 8I25A
letter:  0
123
155
155
full pd ib: 1AXJA
letter:  0
123
156
156
full pd ib: 1BIPA
letter:  0
122
157
157
full pd ib: 1D4BA
letter:  0
122
158
158
full pd ib: 1DROA
letter:  0
122
159
159
full pd ib: 1IT4A
letter:  0
123
160
160
full pd ib: 1NR3A
letter:  0
122
161
161
full pd ib: 1PLOA
letter:  0
122
162
162
full pd ib: 1V5QA
letter:  0
122
163
163
full pd ib: 1YUAA
letter:  0
122
164
164
full pd ib: 2CQMA
letter:  0
122
165
165
full pd ib: 2CRYA
letter:  0
122
166
166
full pd ib: 2EQYA
letter:  0
122
167
167
full pd ib: 2KUTA
letter:  0
122
168
168
full pd ib: 2LFIA
letter:  0
122
169
169
full pd ib: 2LHTA
letter:  0
122
170
170
full pd ib: 2M5YA
letter:  0
122
171
171
full pd ib: 2MWIA
letter:  0
122
172
172
full pd ib: 2MZNA
letter:  0
122
173
173
full pd ib: 2ROHA
letter:  0
122
174
174
full pd ib: 2YS4A
letter:  0
122
175
175
full pd ib: 5Y7L



full pd ib: 8CA0A
letter:  0
115
374
374
full pd ib: 8Y4ZA
letter:  0
114
375
375
full pd ib: 1GA3A
letter:  0
113
376
376
full pd ib: 1N3GA
letter:  0
113
377
377
full pd ib: 1PLSA
letter:  0
113
378
378
full pd ib: 1WI0A
letter:  0
113
379
379
full pd ib: 1XNEA
letter:  0
113
380
380
full pd ib: 1Y7XA
letter:  0
113
381
381
full pd ib: 2COKA
letter:  0
113
382
382
full pd ib: 2DGZA
letter:  0
113
383
383
full pd ib: 2E2WA
letter:  0
113
384
384
full pd ib: 2EDHA
letter:  0
113
385
385
full pd ib: 2IVWA
letter:  0
113
386
386
full pd ib: 2LCHA
letter:  0
113
387
387
full pd ib: 2LFGA
letter:  0
113
388
388
full pd ib: 2M1CA
letter:  0
113
389
389
full pd ib: 2MUKX
X
113
390
390
full pd ib: 2RPBA
letter:  0
113
391
391
full pd ib: 4CSQA
letter:  0
113
392
392
full pd ib: 5NR5A
letter:  0
113
393
393
full pd ib: 6I2OA
letter:  0
113
394
394
full pd ib: 6ZYGA
letter:  0
113
395
395
full pd ib: 1IRSA
letter:  0
112
396
396
full pd ib: 1J26A
letter:  0
112
397
397
full pd ib: 1JLIA
letter:



full pd ib: 1RF8B
letter:  1
100
731
731
full pd ib: 1UH6A
letter:  0
100
732
732
full pd ib: 1WGRA
letter:  0
100
733
733
full pd ib: 1WHDA
letter:  0
100
734
734
full pd ib: 1WHVA
letter:  0
100
735
735
full pd ib: 1WJUA
letter:  0
100
736
736
full pd ib: 1X3AA
letter:  0
100
737
737
full pd ib: 2CQLA
letter:  0
100
738
738
full pd ib: 2D9OA
letter:  0
100
739
739
full pd ib: 2DN8A
letter:  0
100
740
740
full pd ib: 2DO4A
letter:  0
100
741
741
full pd ib: 2DZMA
letter:  0
100
742
742
full pd ib: 2E2ZA
letter:  0
101
743
743
full pd ib: 2EFIA
letter:  0
100
744
744
full pd ib: 2EP8A
letter:  0
100
745
745
full pd ib: 2GAQA
letter:  0
100
746
746
full pd ib: 2J4MA
letter:  0
100
747
747
full pd ib: 2KDGA
letter:  0
100
748
748
full pd ib: 2KPQA
letter:  0
100
749
749
full pd ib: 2KREA
letter:  0
100
750
750
full pd ib: 2KXYA
letter:  0
100
751
751
full pd ib: 2L7PA
letter:  0
101
752
752
full pd ib: 2LGVA
letter:  0
103
753
753
full pd ib: 2LLZA
letter:  0
100
754
754
full pd ib: 2LZ0



full pd ib: 2N9PC
letter:  2
100
757
757
full pd ib: 2YREA
letter:  0
104
758
758
full pd ib: 3CRDA
letter:  0
100
759
759
full pd ib: 5O2PA
letter:  0
100
760
760
full pd ib: 6H0JA
letter:  0
100
761
761
full pd ib: 7M5TA
letter:  0
100
762
762




full pd ib: 7QRIA
letter:  0
101
763
763
full pd ib: 8AU4A
letter:  0
100
764
764
full pd ib: 8BXJA
letter:  0
100
765
765
full pd ib: 1BVEA
letter:  0
99
766
766
full pd ib: 1EXEA
letter:  0
99
767
767
full pd ib: 1HV2A
letter:  0
99
768
768
full pd ib: 1J2MA
letter:  0
99
769
769
full pd ib: 1KATV
letter:  21
99
770
770
full pd ib: 1T6WA
letter:  0
100
771
771




full pd ib: 1TKWA
letter:  0
100
772
772
full pd ib: 1WA8B
letter:  1
95
773
773
full pd ib: 1WWUA
letter:  0
99
774
774
full pd ib: 2CPDA
letter:  0
99
775
775
full pd ib: 2D8RA
letter:  0
100
776
776
full pd ib: 2DNWA
letter:  0
99
777
777
full pd ib: 2EZLA
letter:  0
99
778
778
full pd ib: 2KBZA
letter:  0
99
779
779
full pd ib: 2KCLA
letter:  0
99
780
780
full pd ib: 2KJGA
letter:  0
99
781
781
full pd ib: 2KRCA
letter:  0
99
782
782
full pd ib: 2L49A
letter:  0
99
783
783
full pd ib: 2LIUA
letter:  0
99
784
784
full pd ib: 2LNAA
letter:  0
99
785
785
full pd ib: 2LONA
letter:  0
99
786
786
full pd ib: 2LSIA
letter:  0
99
787
787
full pd ib: 2M4LA
letter:  0
99
788
788
full pd ib: 2MZSA
letter:  0
99
789
789
full pd ib: 2NDPA
letter:  0
99
790
790
full pd ib: 2NOCA
letter:  0
99
791
791
full pd ib: 2PFUA
letter:  0
99
792
792
full pd ib: 5WAHA
letter:  0
99
793
793
full pd ib: 6EWVA
letter:  0
99
794
794
full pd ib: 6KCZA
letter:  0
102
795
795
full pd ib: 1BEGA
letter:  0
98
796
7



letter:  0
98
818
818
full pd ib: 1M9GA
letter:  0
97
819
819
full pd ib: 1Y9XA
letter:  0
97
820
820
full pd ib: 2D8CA
letter:  0
97
821
821
full pd ib: 2DL0A
letter:  0
97
822
822
full pd ib: 2EBWA
letter:  0
97
823
823
full pd ib: 2HFVA
letter:  0
97
824
824
full pd ib: 2HGFA
letter:  0
97
825
825
full pd ib: 2JN6A
letter:  0
97
826
826
full pd ib: 2K5RA
letter:  0
97
827
827
full pd ib: 2KBIA
letter:  0
97
828
828
full pd ib: 2KG7B
letter:  1
97
829
829
full pd ib: 2KJ6A
letter:  0
97
830
830
full pd ib: 2L08A
letter:  0
97
831
831
full pd ib: 2L0CA
letter:  0
97
832
832
full pd ib: 2LQ7A
letter:  0
97
833
833
full pd ib: 2LWPA
letter:  0
97
834
834
full pd ib: 2N7DA
letter:  0
97
835
835
full pd ib: 6MSPA
letter:  0
97
836
836
full pd ib: 6VRJA
letter:  0
97
837
837
full pd ib: 1ESXA
letter:  0
96
838
838
full pd ib: 1I1SA
letter:  0
96
839
839
full pd ib: 1N88A
letter:  0
96
840
840
full pd ib: 1SLJA
letter:  0
96
841
841
full pd ib: 1WGGA
letter:  0
96
842
842
full pd ib: 1WIEA




full pd ib: 1E08D
letter:  3
89
1007
1007
full pd ib: 1EMWA
letter:  0
88
1008
1008
full pd ib: 1HEHC
letter:  2
88
1009
1009
full pd ib: 1IMOA
letter:  0
88
1010
1010
full pd ib: 1IURA
letter:  0
88
1011
1011
full pd ib: 1KA5A
letter:  0
88
1012
1012
full pd ib: 1UJSA
letter:  0
88
1013
1013
full pd ib: 1USSA
letter:  0
88
1014
1014
full pd ib: 1WG1A
letter:  0
88
1015
1015
full pd ib: 1WKTA
letter:  0
88
1016
1016
full pd ib: 1XHJA
letter:  0
88
1017
1017
full pd ib: 1Z8MA
letter:  0
88
1018
1018
full pd ib: 2CT2A
letter:  0
90
1019
1019
full pd ib: 2DZJA
letter:  0
88
1020
1020
full pd ib: 2ELJA
letter:  0
88
1021
1021
full pd ib: 2KO1A
letter:  0
88
1022
1022
full pd ib: 2LD3A
letter:  0
88
1023
1023
full pd ib: 2LR6A
letter:  0
88
1024
1024
full pd ib: 2MA3A
letter:  0
88
1025
1025
full pd ib: 2MNJB
letter:  1
88
1026
1026
full pd ib: 2MTLA
letter:  0
88
1027
1027
full pd ib: 2MV3A
letter:  0
88
1028
1028
full pd ib: 2RQPA
letter:  0
88
1029
1029
full pd ib: 2U1AA
letter:  0
88
10

  (


letter:  0
77
1260
1260
full pd ib: 1ITPA
letter:  0
77
1261
1261
full pd ib: 1OF9A
letter:  0
77
1262
1262
full pd ib: 2DJPA
letter:  0
77
1263
1263
full pd ib: 2GUTA
letter:  0
77
1264
1264
full pd ib: 2JZ6A
letter:  0
77
1265
1265
full pd ib: 2K4RA
letter:  0
77
1266
1266
full pd ib: 2L7XA
letter:  0
79
1267
1267
full pd ib: 2MMPA
letter:  0
77
1268
1268
full pd ib: 2MP1A
letter:  0
77
1269
1269
full pd ib: 2N1TA
letter:  0
69
1270
1270
full pd ib: 5UTVA
letter:  0
77
1271
1271
full pd ib: 6BZLA
letter:  0
77
1272
1272
full pd ib: 6DMPA
letter:  0
77
1273
1273
full pd ib: 7BY7A
letter:  0
77
1274
1274
full pd ib: 8B7TA
letter:  0
77
1275
1275
full pd ib: 1EKZA
letter:  0
76
1276
1276
full pd ib: 1P94A
letter:  0
76
1277
1277
full pd ib: 1TNSA
letter:  0
76
1278
1278
full pd ib: 1VZSA
letter:  0
76
1279
1279
full pd ib: 1WCLA
letter:  0
76
1280
1280
full pd ib: 1ZHCA
letter:  0
76
1281
1281
full pd ib: 2DJRA
letter:  0
77
1282
1282
full pd ib: 2JUBA
letter:  0
76
1283
1283
full pd ib



full pd ib: 5OEOC
letter:  2
73
1341
1341
full pd ib: 7PKUB
letter:  1
73
1342
1342
full pd ib: 1PVEA
letter:  0
72
1343
1343
full pd ib: 1WI9A
letter:  0
72
1344
1344
full pd ib: 2ADLA
letter:  0
72
1345
1345
full pd ib: 2CSVA
letter:  0
74
1346
1346
full pd ib: 2J2SA
letter:  0
74
1347
1347
full pd ib: 2J5OA
letter:  0
72
1348
1348
full pd ib: 2JO1A
letter:  0
72
1349
1349
full pd ib: 2JVGA
letter:  0
72
1350
1350
full pd ib: 2JYEA
letter:  0
72
1351
1351
full pd ib: 2K6XA
letter:  0
72
1352
1352
full pd ib: 6TXTA
letter:  0
72
1353
1353
full pd ib: 1ADZA
letter:  0
71
1354
1354




full pd ib: 1AJYA
letter:  0
73
1355
1355
full pd ib: 1BBIA
letter:  0
71
1356
1356
full pd ib: 1DAQA
letter:  0
73
1357
1357
full pd ib: 1K3GA
letter:  0
72
1358
1358
full pd ib: 1VIGA
letter:  0
71
1359
1359
full pd ib: 2CQJA
letter:  0
71
1360
1360
full pd ib: 2DA7A
letter:  0
71
1361
1361
full pd ib: 2E70A
letter:  0
71
1362
1362
full pd ib: 2JS5A
letter:  0
71
1363
1363
full pd ib: 2KDPA
letter:  0
72
1364
1364
full pd ib: 2KISA
letter:  0
71
1365
1365
full pd ib: 2KSKA
letter:  0
71
1366
1366
full pd ib: 2KVTA
letter:  0
71
1367
1367
full pd ib: 2L8TA
letter:  0
71
1368
1368
full pd ib: 2LTFA
letter:  0
71
1369
1369
full pd ib: 2LZZA
letter:  0
74
1370
1370
full pd ib: 3ZBEA
letter:  0
71
1371
1371
full pd ib: 3ZJ2A
letter:  0
73
1372
1372
full pd ib: 5NHQA
letter:  0
71
1373
1373
full pd ib: 6XORA
letter:  0
71
1374
1374
full pd ib: 1DWMA
letter:  0
70
1375
1375
full pd ib: 1E68A
letter:  0
70
1376
1376
full pd ib: 1FGPA
letter:  0
70
1377
1377
full pd ib: 1Q1VA
letter:  0
70
13



full pd ib: 2LWWB
letter:  1
70
1384
1384
full pd ib: 2MH3A
letter:  0
70
1385
1385
full pd ib: 2N34A
letter:  0
70
1386
1386
full pd ib: 2RU1A
letter:  0
70
1387
1387
full pd ib: 6EVIA
letter:  0
70
1388
1388
full pd ib: 6Y8VA
letter:  0
70
1389
1389
full pd ib: 1BBYA
letter:  0
69
1390
1390
full pd ib: 1BFMA
letter:  0
69
1391
1391
full pd ib: 1GO5A
letter:  0
69
1392
1392
full pd ib: 1L6HA
letter:  0
69
1393
1393
full pd ib: 1RYKA
letter:  0
69
1394
1394
full pd ib: 1U96A
letter:  0
70
1395
1395
full pd ib: 2DY8A
letter:  0
69
1396
1396
full pd ib: 2E61A
letter:  0
70
1397
1397
full pd ib: 2JX5A
letter:  0
69
1398
1398
full pd ib: 2KZ9A
letter:  0
69
1399
1399
full pd ib: 2LBFA
letter:  0
69
1400
1400
full pd ib: 2M1LA
letter:  0
69
1401
1401
full pd ib: 2MFKA
letter:  0
69
1402
1402
full pd ib: 2N72A
letter:  0
69
1403
1403
full pd ib: 4ULLA
letter:  0
69
1404
1404
full pd ib: 1BMRA
letter:  0
68
1405
1405
full pd ib: 1J9IA
letter:  0
68
1406
1406
full pd ib: 1NE3A
letter:  0
68
14



full pd ib: 2KA4B
letter:  1
57
1546
1546
full pd ib: 2LEVA
letter:  0
57
1547
1547
full pd ib: 2LFKA
letter:  0
57
1548
1548
full pd ib: 2LMKA
letter:  0
57
1549
1549
full pd ib: 2MDFA
letter:  0
57
1550
1550
full pd ib: 2MI5A
letter:  0
57
1551
1551
full pd ib: 2N5LA
letter:  0
57
1552
1552
full pd ib: 2YSDA
letter:  0
57
1553
1553
full pd ib: 5OQSA
letter:  0
57
1554
1554
full pd ib: 6LXFB
letter:  1
57
1555
1555
full pd ib: 6URSA
letter:  0
57
1556
1556
full pd ib: 1CQUA
letter:  0
56
1557
1557
full pd ib: 1RSOB
letter:  1
56
1558
1558
full pd ib: 2AYJA
letter:  0
57
1559
1559
full pd ib: 2KK9A
letter:  0
56
1560
1560
full pd ib: 2KMUA
letter:  0
56
1561
1561
full pd ib: 6NU4A
letter:  0
56
1562
1562
full pd ib: 1AUUA
letter:  0
55
1563
1563
full pd ib: 1CLFA
letter:  0
57
1564
1564
full pd ib: 1IW4A
letter:  0
55
1565
1565
full pd ib: 1KMXA
letter:  0
55
1566
1566
full pd ib: 1VIBA
letter:  0
55
1567
1567
full pd ib: 2FXPA
letter:  0
55
1568
1568
full pd ib: 2K9IA
letter:  0
55
15



full pd ib: 2JU0B
letter:  1
52
1588
1588
full pd ib: 2JX4A
letter:  0
52
1589
1589
full pd ib: 2LQHB
letter:  1
52
1590
1590
full pd ib: 2LUAA
letter:  0
55
1591
1591
full pd ib: 2M35A
letter:  0
52
1592
1592
full pd ib: 2N5MA
letter:  0
52
1593
1593
full pd ib: 1HICA
letter:  0
51
1594
1594




full pd ib: 1L8CB
letter:  1
51
1595
1595
full pd ib: 1UDKA
letter:  0
51
1596
1596
full pd ib: 2JO8A
letter:  0
51
1597
1597
full pd ib: 2LW9A
letter:  0
51
1598
1598
full pd ib: 2MD0A
letter:  0
51
1599
1599
full pd ib: 6E8WA
letter:  0
51
1600
1600
full pd ib: 1E8RA
letter:  0
50
1601
1601
full pd ib: 1FDMA
letter:  0
50
1602
1602
full pd ib: 1SS3A
letter:  0
50
1603
1603
full pd ib: 1TPMA
letter:  0
50
1604
1604
full pd ib: 1UGLA
letter:  0
50
1605
1605
full pd ib: 2EZWA
letter:  0
50
1606
1606
full pd ib: 2KJIA
letter:  0
50
1607
1607
full pd ib: 2KYGA
letter:  0
50
1608
1608
full pd ib: 2M76A
letter:  0
50
1609
1609
full pd ib: 2N3XA
letter:  0
50
1610
1610
full pd ib: 6FTKA
letter:  0
50
1611
1611
full pd ib: 7EAUA
letter:  0
50
1612
1612
full pd ib: 8AR0A
letter:  0
50
1613
1613
full pd ib: 8AR2A
letter:  0
50
1614
1614
full pd ib: 8AR3A
letter:  0
50
1615
1615
full pd ib: 1H7DA
letter:  0
49
1616
1616
full pd ib: 1SHIA
letter:  0
49
1617
1617
full pd ib: 1Z6VA
letter:  0
49
16



full pd ib: 5N7YA
letter:  0
50
1623
1623
full pd ib: 6WQLA
letter:  0
49
1624
1624
full pd ib: 8QNTA
letter:  0
49
1625
1625
full pd ib: 1SSLA
letter:  0
48
1626
1626
full pd ib: 2JWHA
letter:  0
48
1627
1627
full pd ib: 2KESA
letter:  0
48
1628
1628
full pd ib: 2KSGA
letter:  0
48
1629
1629
full pd ib: 5NAMA
letter:  0
48
1630
1630
full pd ib: 7C4OA
letter:  0
48
1631
1631
full pd ib: 1AQ5A
letter:  0
47
1632
1632
full pd ib: 1PGYA
letter:  0
47
1633
1633
full pd ib: 2B9KA
letter:  0
47
1634
1634
full pd ib: 2MW4A
letter:  0
47
1635
1635
full pd ib: 5J8TA
letter:  0
48
1636
1636
full pd ib: 5T42A
letter:  0
47
1637
1637
full pd ib: 7LGLA
letter:  0
47
1638
1638
full pd ib: 1CCMA
letter:  0
46
1639
1639
full pd ib: 1HYKA
letter:  0
46
1640
1640
full pd ib: 1NBLA
letter:  0
46
1641
1641
full pd ib: 1NI8A
letter:  0
46
1642
1642
full pd ib: 2ENFA
letter:  0
47
1643
1643
full pd ib: 2IMUA
letter:  0
46
1644
1644
full pd ib: 2JRYA
letter:  0
46
1645
1645
full pd ib: 2MBYA
letter:  0
46
16



full pd ib: 2KA6B
letter:  1
45
1656
1656
full pd ib: 5X9XB
letter:  1
45
1657
1657




full pd ib: 1CF4B
letter:  1
44
1658
1658
full pd ib: 1CWXA
letter:  0
44
1659
1659
full pd ib: 1JUNA
letter:  0
44
1660
1660
full pd ib: 2K9DA
letter:  0
44
1661
1661
full pd ib: 2LB7A
letter:  0
44
1662
1662
full pd ib: 5KGZA
letter:  0
44
1663
1663
full pd ib: 6T33A
letter:  0
44
1664
1664
full pd ib: 7MJ3A
letter:  0
44
1665
1665
full pd ib: 1AW6A
letter:  0
45
1666
1666
full pd ib: 1BZKA
letter:  0
43
1667
1667




full pd ib: 2E30B
letter:  1
43
1668
1668
full pd ib: 2KE3A
letter:  0
43
1669
1669
full pd ib: 2KUYA
letter:  0
45
1670
1670
full pd ib: 2L2LA
letter:  0
43
1671
1671
full pd ib: 2LZLA
letter:  0
43
1672
1672
full pd ib: 2N70A
letter:  0
43
1673
1673
full pd ib: 5LMEA
letter:  0
45
1674
1674
full pd ib: 6OBKA
letter:  0
43
1675
1675
full pd ib: 8IMHA
letter:  0
43
1676
1676
full pd ib: 1CO4A
letter:  0
43
1677
1677
full pd ib: 1D6BA
letter:  0
42
1678
1678
full pd ib: 1GO9A
letter:  0
42
1679
1679
full pd ib: 1KV4A
letter:  0
42
1680
1680
full pd ib: 1QDPA
letter:  0
42
1681
1681
full pd ib: 1UJLA
letter:  0
42
1682
1682
full pd ib: 1WQKA
letter:  0
42
1683
1683
full pd ib: 2B4NA
letter:  0
42
1684
1684
full pd ib: 2LMZA
letter:  0
42
1685
1685
full pd ib: 8BVCA
letter:  0
42
1686
1686
full pd ib: 8UM1A
letter:  0
42
1687
1687
full pd ib: 1BI6H
letter:  7
41
1688
1688
full pd ib: 1HJ0A
letter:  0
41
1689
1689
full pd ib: 1WT7A
letter:  0
41
1690
1690
full pd ib: 2J5HA
letter:  0
41
16

In [9]:
# Shapes of the last extracted embedding and of the stacked embeddings
print(embedding.shape)
print(embeddings.shape)

# Shapes of the last extracted angles and of the stacked angles
print(angle.shape)
print(angles.shape)

torch.Size([1, 42, 1024])
torch.Size([1711, 131, 1024])
torch.Size([2, 40])
torch.Size([1711, 2, 131])


In [10]:
# Reload the saved angles and split them into their two rows per protein
angles = torch.load('new_training/angles.pt')
phi_angles, psi_angles = angles[:, 0, :], angles[:, 1, :]

# Overall shape, shape of one angle type, and both angle rows of protein 15
print(angles.shape)
print(phi_angles.shape)
print(angles[15])

torch.Size([1711, 2, 131])
torch.Size([1711, 131])
tensor([[-129.0686,  -93.1709, -107.1592,  -47.9783,  -67.9515,  -54.4657,
         -115.7745, -156.6232, -133.5739, -163.0471,  -53.1599,   62.5310,
         -138.5958, -149.2970, -156.4849, -136.3228,  -56.4204,  -74.6726,
         -109.2731,   64.7929,   56.2258,  -65.5943,  -61.9461,   42.9776,
          -50.0812,  -71.9293,  -55.8442,  -66.1385, -145.5454,  -48.5025,
         -121.4797, -143.9788,  -66.8832,  -63.2468, -154.5751,  -56.8206,
          -74.4059,  -69.6196,   94.3943, -131.0393,  -76.3750,  -82.8213,
         -115.5403,  -92.0608, -128.3188, -103.1864,  -90.8789, -113.1141,
          -87.8241,  114.8718,  -74.2119, -152.0556, -133.8653, -121.1926,
          -85.7341, -138.8487, -142.8164,  -81.4579,   59.6251,  -68.3115,
          -71.6146,   86.9223, -138.8716,  -66.5406,  -58.2832,  -68.3479,
         -121.7699,  -95.3596,  173.9619,  -89.7241,  -65.4513, -140.8306,
          -72.8081, -138.7966, -157.7205, -150.27