(info about pisces dataset) 
Here using the cif files from pisces dataset we are extracting the angles (phi and psi omega ?) and the sequences. The sequences are also have been put in the embedding space using Prot-Bert model for the ease of use. All angles are below 129 residues. 

## Importing libraries

In [1]:
#accessing files
import os

#manipulation libraries
import numpy as np
import torch
import pandas as pd

#for python object structure
import pickle

#biopython for parsing the cif files and manipulation of the cif files
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from Bio import PDB

#data structures for manipulation
from typing import Dict, Tuple

#transformer model for embedding space creation
from transformers import BertModel, BertTokenizer

#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


## Changing to GPU

### Initializations for Pisces data

In [3]:
file_path = 'data/cullpdb_pc30_0_res0_0_2_0_noBrks_noDsdr_len40_10000_R0_25_Nmr_d2024'
df = pd.read_csv(file_path, delim_whitespace=True)

#Getting the id of files containing sequence length less than 128
df = df[df['len'] <= 128]
pdb_ids = df['PDBchain'].tolist()
print(pdb_ids)

#Chain information for the sequence
pdb_id_map = {pdb_id[:-1]: pdb_id for pdb_id in pdb_ids}
print(pdb_id_map)

mapping_file = os.path.join( 'data_processed/pisces_training/pdb_id_mapping.csv')
pd.Series({k.lower(): v for k, v in pdb_id_map.items()}).to_csv(mapping_file, header=False)

['1E0ZA', '1RY4A', '1UG7A', '1WHNA', '2AYAA', '2D87A', '2DB5A', '2DKPA', '2EBKA', '2ETTA', '2FM4A', '2I4KA', '2JQ5A', '2K0RA', '2K5TA', '2LR4A', '2LS01', '2LVLA', '2MAHA', '2NCNA', '2RLOA', '2RV8A', '2YTUA', '4AR0A', '5AIWA', '5H7UA', '5N9QA', '8E22X', '1CMOA', '1DDFA', '1EALA', '1PQNA', '1QKLA', '1UEYA', '1W2QA', '1WH4A', '1WJRA', '1X5MA', '1ZU1A', '2CSOA', '2DMKA', '2EE7A', '2L1LB', '2LOEA', '2MF7A', '2MPLA', '2NBOA', '2TMPA', '4UEIA', '5M1UA', '7XRWA', '8OILA', '1RSFA', '1V5SA', '1X5EA', '1ZO0A', '2A4HA', '2CR4A', '2DAVA', '2IF1A', '2JP2A', '2JVNA', '2K2VA', '2L3GA', '2L57A', '2LBCA', '2LLNA', '2M26A', '3MSPA', '7F7NA', '7KNVA', '8SQXA', '1BUQA', '1BW3A', '1FR0A', '1GIOA', '1MH6A', '1PSYA', '1UENA', '1WYJA', '2AI6A', '2DHJA', '2K4VA', '2K4ZA', '2KA5A', '2KFPA', '2KYTA', '2L2CA', '2LDUA', '2LM0A', '2MDKA', '2MV4A', '2N2HB', '2NBGA', '2PRFA', '4A53A', '5KNWA', '5T3YA', '5WOEA', '6GWMA', '6TH8A', '6UF2A', '6XEHA', '7BQCA', '7PRDA', '7QYIA', '1DC7A', '1E8EA', '1EO1A', '1PIRA', '1T3VA', 

  df = pd.read_csv(file_path, delim_whitespace=True)


In [4]:
#Embedding model initialization
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False ) # change model and tokenizer to t5
model_embedd = BertModel.from_pretrained("Rostlab/prot_bert")
model_embedd = model_embedd.to(device)


In [5]:
#Sequence, angle and embedding processing
def extract_input(pdb_file, model, tokenizer,data):
    '''
    Input =>
    pdb_file : file containing all the information for the given protein (.pdb or .cif)
    tokenizer : tokenizer for the sequence tokenization
    model : transformer model used for creating embedding space
    data : data nickname wanted to be extracted (alphafold, pisces)
    
    Output =>
    sequence : extracted from the file
    angle_tensor : tensor containing phi and psi angles for the given sequence
    embeddings : embedding created using the model for the given sequence
    '''
    #print('Data folder used:',data)
    #Parsing the cif file
    cif_parser = MMCIFParser()
    structure = cif_parser.get_structure("protein", pdb_file) # getting structure
    model0 = structure[0]

    #This is a very common need in bioinformatics of proteins
    d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
    'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
    'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
    'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    if data=='pisces':
        #This is to access the dictionary keys, which are the same as the file name, 
        #Once the keys are accessed we can find which chain to take
        mapping_file = os.path.join('data_processed/new_training/pdb_id_mapping.csv') 

        pdb_id_map = pd.read_csv(mapping_file, header=None, index_col=0).squeeze().to_dict()

        #extracting the file name as it contains the id
        filename = os.path.basename(pdb_file)
        pdb_id = filename[:4] # the first 4 letters of the file name is the dictionary key,whose value is the chain to be used

        full_pdb_id = pdb_id_map.get(pdb_id, None) #pdb id with the correct chain

        chain_A = model0[full_pdb_id[-1]]  # and we get chain A , the last letter of the id is the chain
        print("Full pdb id: ", full_pdb_id)

        #Creating a dictionary to collect chain ids and turn them into indices
        chain_id_to_num = {}
        for num, chain in enumerate(model0.get_chains()):
            chain_id_to_num[chain.id] = num

        #Iterator of chains, turns it into list, [0] first chain
        chain:PDB.Chain.Chain = list(model0.get_chains())[chain_id_to_num.get(full_pdb_id[-1])]

    elif data=='alphafold':
        chain_A = model0['A']
        #Iterator of chains, turns it into list, [0] first chain
        chain:PDB.Chain.Chain = list(model0.get_chains())[0]

    sequence = []
    for residue in chain_A:
        #For simplicity we can use X for heteroatoms (ions and water)
        sequence.append(d3to1.get(residue.get_resname(), 'X'))  #converts water and ions to X
    
    structure.atom_to_internal_coordinates() # turns xyz coordinates into angles and bond lengths

    #This accesses the internal chain coords of the chain object
    ic_chain: PDB.internal_coords.IC_Chain = chain.internal_coord 

    d: Dict[Tuple[PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey],
        PDB.internal_coords.Dihedron] = ic_chain.dihedra

    phi_angles_list = []
    psi_angles_list = []

    for key in d:
      if key[0].akl[3] == 'N' and key[1].akl[3] == 'CA' and key[2].akl[3] == 'C' and key[3].akl[3] == 'N':
          phi_angles_list.append(d[key].angle)
      elif key[0].akl[3] == 'C' and key[1].akl[3] == 'N' and key[2].akl[3] == 'CA' and key[3].akl[3] == 'C':
        psi_angles_list.append(d[key].angle)

    structure.internal_to_atom_coordinates(verbose = False)
    io = PDBIO() #this is to write a pdb/cif file again
    io.set_structure(structure)#setting the structure to the desired strcuture in the given file
    phi_angles_list.append(0)
    psi_angles_list.append(0)

    phi = np.asarray(phi_angles_list,dtype=np.float32)
    psi = np.asarray(psi_angles_list,dtype=np.float32)
    angles = np.vstack((psi,phi))
    angle_tensor = torch.tensor(angles, dtype=torch.float32).to(device)

    #Encoding the sequence
    encoded_input = tokenizer.encode(sequence,return_tensors='pt').to(device)  

    #Create the embedding using the model
    with torch.no_grad():
        outputs = model(input_ids=encoded_input)

    #Cropping the embedding created because of the added start and end paddings
    start_index = 1
    end_index = len(sequence) + 1
    embeddings = outputs.last_hidden_state[:,start_index:end_index,:]
    #print('Embedding length: ',embeddings.size())

    return sequence, angle_tensor, embeddings 

In [6]:
#Data path
alphafold_path = 'alphafold_data2'
#Processed data path
target_alpha_path = 'data_processed/alphafold_training/'

#Data path
pisces_path = 'data/small_proteins'
#Processed data path
target_pisces_path = 'data_processed/pisces_training/'


In [7]:
def procces_data(directory_path, target_path,data):

    embeddings_file = os.path.join(target_path, 'embeddings.pt')
    angles_file = os.path.join(target_path, 'angles.pt')
    sequences_file = os.path.join(target_path, 'sequences.pkl') 
    
    #Embeddings tensor and angles tensor initialized with zero tensors
    #The calculated values are concatenated, this way the padding is achieved to have equal length for each sequence
    #129(max_length)
    embeddings = torch.zeros(0, 129, 1024).to(device) 
    angles = torch.zeros(0, 2, 129).to(device)
    sequences = []

    def pad_sequence(sequence, target_length=129, pad_char="X"):
        padding_length = target_length - len(sequence)
        padding = [pad_char] * padding_length
        return sequence + padding

    #Get list of files in the directory containing pdb/cif files
    file_list = os.listdir(directory_path)

    for i,filename in enumerate(file_list):
        file_path = os.path.join(directory_path, filename)
    
        #Extracting the information from the files
        sequence, angle, embedding= extract_input(file_path, model_embedd, tokenizer,data)
        if sequence is None or embedding is None or angle is None:
            print(f"Skipping {file_path} due to missing data")
            continue
        print('Length of the sequence: ',len(sequence))

        
        padded_sequence = pad_sequence(sequence)
        sequences.append(padded_sequence)

        #Append the new data to the existing tensors
        embedding_length = min(embedding.size(1), 129)
        if i >= embeddings.size(0):
            embeddings = torch.cat([embeddings.to(device), torch.zeros(1, 129, 1024).to(device)], dim=0)
        
        embeddings[i, :embedding_length] = embedding[ :, :]
        angle_length = min(angle.size(1), 129)
        if i >= angles.size(0):
            angles = torch.cat([angles.to(device), torch.zeros(1, 2, 129).to(device)], dim=0)
        
        angles[i, :, :angle_length] = angle[:, :]
        print('Loop number: ',i)
        
    
    #Saving the updated tensors back to the files
    torch.save(embeddings, embeddings_file)
    torch.save(angles, angles_file)

    #Saving the sequence
    with open(sequences_file, 'wb') as f:
        pickle.dump(sequences, f)

    return angle, angles, embeddings, embedding, sequences



### Checking if the sizes are correct

In [8]:
pisces_angle, pisces_angles, pisces_embeddings, pisces_embedding, _ = procces_data(pisces_path,target_pisces_path,'pisces')

print(np.shape(pisces_embedding[:,:,:]))
print(np.shape(pisces_embeddings[:,:,:]))

print(np.shape(pisces_angle))
print(np.shape(pisces_angles))

Full pdb id:  1IMOA
Length of the sequence:  88
Loop number:  0
Full pdb id:  1K1ZA
Length of the sequence:  78
Loop number:  1
Full pdb id:  1J46A
Length of the sequence:  85
Loop number:  2
Full pdb id:  2L3LA
Length of the sequence:  111
Loop number:  3
Full pdb id:  2DL9A
Length of the sequence:  103
Loop number:  4
Full pdb id:  2M6MA
Length of the sequence:  85
Loop number:  5
Full pdb id:  2CPRA
Length of the sequence:  124
Loop number:  6
Full pdb id:  1HICA
Length of the sequence:  51
Loop number:  7
Full pdb id:  1WWQA
Length of the sequence:  111
Loop number:  8
Full pdb id:  2K9HA
Length of the sequence:  59
Loop number:  9
Full pdb id:  2CQWA
Length of the sequence:  84
Loop number:  10
Full pdb id:  1Q6AA
Length of the sequence:  107
Loop number:  11
Full pdb id:  6RFMA
Length of the sequence:  80
Loop number:  12
Full pdb id:  7SEKA
Length of the sequence:  82
Loop number:  13
Full pdb id:  1QCVA
Length of the sequence:  53
Loop number:  14
Full pdb id:  1YSYA
Length of 



Full pdb id:  5OEOC
Length of the sequence:  73
Loop number:  68
Full pdb id:  1WILA
Length of the sequence:  91
Loop number:  69
Full pdb id:  1PVEA
Length of the sequence:  72
Loop number:  70
Full pdb id:  1WT7A
Length of the sequence:  41
Loop number:  71
Full pdb id:  6I2OA
Length of the sequence:  113
Loop number:  72
Full pdb id:  1KA5A
Length of the sequence:  88
Loop number:  73
Full pdb id:  1RYKA
Length of the sequence:  69
Loop number:  74
Full pdb id:  2KWAA
Length of the sequence:  101
Loop number:  75
Full pdb id:  5MCSA
Length of the sequence:  81
Loop number:  76
Full pdb id:  2KCDA
Length of the sequence:  120
Loop number:  77
Full pdb id:  1XSXA
Length of the sequence:  95
Loop number:  78
Full pdb id:  2MCAA
Length of the sequence:  103
Loop number:  79
Full pdb id:  2NDPA
Length of the sequence:  99
Loop number:  80
Full pdb id:  2DOFA
Length of the sequence:  85
Loop number:  81
Full pdb id:  7B3JA
Length of the sequence:  55
Loop number:  82
Full pdb id:  7BY7A
L



Full pdb id:  2KA4B
Length of the sequence:  57
Loop number:  153
Full pdb id:  1X0HA
Length of the sequence:  112
Loop number:  154
Full pdb id:  2M76A
Length of the sequence:  50
Loop number:  155
Full pdb id:  6Z41A
Length of the sequence:  68
Loop number:  156
Full pdb id:  1CWXA
Length of the sequence:  44
Loop number:  157
Full pdb id:  1BMRA
Length of the sequence:  68
Loop number:  158
Full pdb id:  1NEIA
Length of the sequence:  60
Loop number:  159
Full pdb id:  1AW6A
Length of the sequence:  45
Loop number:  160
Full pdb id:  1R57A
Length of the sequence:  102
Loop number:  161
Full pdb id:  7PQ4A
Length of the sequence:  90
Loop number:  162
Full pdb id:  1V5LA
Length of the sequence:  103
Loop number:  163
Full pdb id:  2L02A
Length of the sequence:  82
Loop number:  164
Full pdb id:  1WX8A
Length of the sequence:  96
Loop number:  165
Full pdb id:  2KKXA
Length of the sequence:  102
Loop number:  166
Full pdb id:  7B2BA
Length of the sequence:  104
Loop number:  167
Full 



Length of the sequence:  98
Loop number:  207
Full pdb id:  2LGNA
Length of the sequence:  66
Loop number:  208
Full pdb id:  2DLZA
Length of the sequence:  118
Loop number:  209
Full pdb id:  2MKLC
Length of the sequence:  105
Loop number:  210
Full pdb id:  1I35A
Length of the sequence:  95
Loop number:  211
Full pdb id:  2KD2A
Length of the sequence:  94
Loop number:  212
Full pdb id:  2K0MA
Length of the sequence:  104
Loop number:  213
Full pdb id:  5LSDA
Length of the sequence:  118
Loop number:  214
Full pdb id:  2JRAA
Length of the sequence:  67
Loop number:  215
Full pdb id:  2LQHB
Length of the sequence:  52
Loop number:  216
Full pdb id:  2LVXA
Length of the sequence:  94
Loop number:  217
Full pdb id:  2CR4A
Length of the sequence:  126
Loop number:  218
Full pdb id:  2LUIA
Length of the sequence:  116
Loop number:  219
Full pdb id:  2K49A
Length of the sequence:  118
Loop number:  220
Full pdb id:  2KMUA
Length of the sequence:  56
Loop number:  221
Full pdb id:  1ILOA


  (


Length of the sequence:  77
Loop number:  222
Full pdb id:  2NCNA
Length of the sequence:  128
Loop number:  223
Full pdb id:  2LG4A
Length of the sequence:  40
Loop number:  224
Full pdb id:  2N7DA
Length of the sequence:  97
Loop number:  225
Full pdb id:  1RIPA
Length of the sequence:  81
Loop number:  226
Full pdb id:  2KANA
Length of the sequence:  94
Loop number:  227
Full pdb id:  2KBQA
Length of the sequence:  80
Loop number:  228
Full pdb id:  1OF9A
Length of the sequence:  77
Loop number:  229




Full pdb id:  7QRIA
Length of the sequence:  101
Loop number:  230
Full pdb id:  2RPRA
Length of the sequence:  88
Loop number:  231
Full pdb id:  2NCGA
Length of the sequence:  118
Loop number:  232
Full pdb id:  2LOEA
Length of the sequence:  127
Loop number:  233
Full pdb id:  2MM4A
Length of the sequence:  58
Loop number:  234
Full pdb id:  2NBHA
Length of the sequence:  107
Loop number:  235
Full pdb id:  1XNEA
Length of the sequence:  113
Loop number:  236
Full pdb id:  1RJHA
Length of the sequence:  118
Loop number:  237
Full pdb id:  2KNCB
Length of the sequence:  79
Loop number:  238
Full pdb id:  2N2TA
Length of the sequence:  84
Loop number:  239
Full pdb id:  6DSLB
Length of the sequence:  118
Loop number:  240
Full pdb id:  2B4NA
Length of the sequence:  42
Loop number:  241
Full pdb id:  6YDHA
Length of the sequence:  62
Loop number:  242
Full pdb id:  1UG7A
Length of the sequence:  128
Loop number:  243
Full pdb id:  2L1LB
Length of the sequence:  127
Loop number:  244
F



Full pdb id:  2JU0B
Length of the sequence:  52
Loop number:  560
Full pdb id:  1WJ4A
Length of the sequence:  124
Loop number:  561
Full pdb id:  1AXJA
Length of the sequence:  123
Loop number:  562
Full pdb id:  2EBUA
Length of the sequence:  112
Loop number:  563
Full pdb id:  2YUKA
Length of the sequence:  90
Loop number:  564
Full pdb id:  2EBKA
Length of the sequence:  128
Loop number:  565
Full pdb id:  2MWRA
Length of the sequence:  58
Loop number:  566
Full pdb id:  2Z59A
Length of the sequence:  109
Loop number:  567
Full pdb id:  2MG4A
Length of the sequence:  66
Loop number:  568
Full pdb id:  2N39A
Length of the sequence:  108
Loop number:  569
Full pdb id:  2LHKA
Length of the sequence:  107
Loop number:  570
Full pdb id:  5KNWA
Length of the sequence:  125
Loop number:  571
Full pdb id:  2L35A
Length of the sequence:  63
Loop number:  572
Full pdb id:  2LVWA
Length of the sequence:  98
Loop number:  573
Full pdb id:  2HGFA
Length of the sequence:  97
Loop number:  574
Fu



Full pdb id:  2P80D
Length of the sequence:  124
Loop number:  591
Full pdb id:  1M42A
Length of the sequence:  102
Loop number:  592
Full pdb id:  1HEHC
Length of the sequence:  88
Loop number:  593
Full pdb id:  1WJ5A
Length of the sequence:  120
Loop number:  594
Full pdb id:  1XSFA
Length of the sequence:  108
Loop number:  595
Full pdb id:  1AG4A
Length of the sequence:  103
Loop number:  596
Full pdb id:  1UF0A
Length of the sequence:  116
Loop number:  597
Full pdb id:  5IRDA
Length of the sequence:  119
Loop number:  598
Full pdb id:  2DB5A
Length of the sequence:  128
Loop number:  599
Full pdb id:  1WGRA
Length of the sequence:  100
Loop number:  600
Full pdb id:  6EHZA
Length of the sequence:  98
Loop number:  601
Full pdb id:  2LCHA
Length of the sequence:  113
Loop number:  602
Full pdb id:  1WE7A
Length of the sequence:  115
Loop number:  603
Full pdb id:  1FGPA
Length of the sequence:  70
Loop number:  604
Full pdb id:  5JTKA
Length of the sequence:  94
Loop number:  605



Full pdb id:  1TKWA
Length of the sequence:  100
Loop number:  632
Full pdb id:  2DK3A
Length of the sequence:  86
Loop number:  633
Full pdb id:  2NBBA
Length of the sequence:  118
Loop number:  634
Full pdb id:  7VBGA
Length of the sequence:  67
Loop number:  635
Full pdb id:  1WJ7A
Length of the sequence:  104
Loop number:  636
Full pdb id:  2M8RA
Length of the sequence:  109
Loop number:  637
Full pdb id:  2KERA
Length of the sequence:  78
Loop number:  638
Full pdb id:  2DY8A
Length of the sequence:  69
Loop number:  639
Full pdb id:  8HZQA
Length of the sequence:  80
Loop number:  640
Full pdb id:  2LZJA
Length of the sequence:  112
Loop number:  641
Full pdb id:  1XHJA
Length of the sequence:  88
Loop number:  642
Full pdb id:  6G7GA
Length of the sequence:  115
Loop number:  643
Full pdb id:  1A6XA
Length of the sequence:  87
Loop number:  644
Full pdb id:  2LHRA
Length of the sequence:  78
Loop number:  645
Full pdb id:  5T42A
Length of the sequence:  47
Loop number:  646
Full



Full pdb id:  1RF8B
Length of the sequence:  100
Loop number:  757
Full pdb id:  2M7KA
Length of the sequence:  66
Loop number:  758
Full pdb id:  2FXPA
Length of the sequence:  55
Loop number:  759
Full pdb id:  1R4GA
Length of the sequence:  53
Loop number:  760
Full pdb id:  1V32A
Length of the sequence:  101
Loop number:  761
Full pdb id:  2KA0A
Length of the sequence:  124
Loop number:  762
Full pdb id:  2N5GA
Length of the sequence:  101
Loop number:  763
Full pdb id:  2MMBA
Length of the sequence:  107
Loop number:  764
Full pdb id:  1YZSA
Length of the sequence:  121
Loop number:  765
Full pdb id:  2KQFA
Length of the sequence:  96
Loop number:  766
Full pdb id:  2COMA
Length of the sequence:  124
Loop number:  767
Full pdb id:  1WJOA
Length of the sequence:  124
Loop number:  768
Full pdb id:  2JXBA
Length of the sequence:  86
Loop number:  769
Full pdb id:  2EKFA
Length of the sequence:  61
Loop number:  770
Full pdb id:  1EMWA
Length of the sequence:  88
Loop number:  771
Fu



Full pdb id:  2KA6B
Length of the sequence:  45
Loop number:  786
Full pdb id:  2IMUA
Length of the sequence:  46
Loop number:  787
Full pdb id:  2LZFA
Length of the sequence:  68
Loop number:  788
Full pdb id:  2KHQA
Length of the sequence:  110
Loop number:  789
Full pdb id:  1QDPA
Length of the sequence:  42
Loop number:  790
Full pdb id:  2YSDA
Length of the sequence:  57
Loop number:  791
Full pdb id:  1SM7A
Length of the sequence:  109
Loop number:  792
Full pdb id:  2LXRA
Length of the sequence:  76
Loop number:  793
Full pdb id:  2KJIA
Length of the sequence:  50
Loop number:  794
Full pdb id:  2KDXA
Length of the sequence:  120
Loop number:  795
Full pdb id:  1GHTA
Length of the sequence:  105
Loop number:  796
Full pdb id:  1WFYA
Length of the sequence:  104
Loop number:  797
Full pdb id:  6VRJA
Length of the sequence:  97
Loop number:  798
Full pdb id:  1SJGA
Length of the sequence:  113
Loop number:  799
Full pdb id:  2RPBA
Length of the sequence:  113
Loop number:  800
Ful



Full pdb id:  1E08D
Length of the sequence:  89
Loop number:  845
Full pdb id:  2KGJA
Length of the sequence:  102
Loop number:  846
Full pdb id:  1CMOA
Length of the sequence:  127
Loop number:  847
Full pdb id:  2LNHA
Length of the sequence:  65
Loop number:  848
Full pdb id:  2JSFA
Length of the sequence:  117
Loop number:  849
Full pdb id:  2KREA
Length of the sequence:  100
Loop number:  850
Full pdb id:  2E6QA
Length of the sequence:  112
Loop number:  851
Full pdb id:  2KOBA
Length of the sequence:  108
Loop number:  852
Full pdb id:  7LIEA
Length of the sequence:  78
Loop number:  853
Full pdb id:  2DNEA
Length of the sequence:  108
Loop number:  854
Full pdb id:  2MJ6A
Length of the sequence:  90
Loop number:  855
Full pdb id:  2LJPA
Length of the sequence:  119
Loop number:  856
Full pdb id:  2DJPA
Length of the sequence:  77
Loop number:  857
Full pdb id:  1WGGA
Length of the sequence:  96
Loop number:  858
Full pdb id:  1GHCA
Length of the sequence:  75
Loop number:  859
Fu



Full pdb id:  1L8CB
Length of the sequence:  51
Loop number:  930
Full pdb id:  5ODDA
Length of the sequence:  92
Loop number:  931
Full pdb id:  7OMKA
Length of the sequence:  93
Loop number:  932
Full pdb id:  2L89A
Length of the sequence:  108
Loop number:  933
Full pdb id:  2YSTA
Length of the sequence:  119
Loop number:  934
Full pdb id:  1OH1A
Length of the sequence:  109
Loop number:  935
Full pdb id:  5M1UA
Length of the sequence:  127
Loop number:  936
Full pdb id:  2KJ3A
Length of the sequence:  79
Loop number:  937
Full pdb id:  2L9FA
Length of the sequence:  102
Loop number:  938
Full pdb id:  2KSKA
Length of the sequence:  71
Loop number:  939
Full pdb id:  1NR3A
Length of the sequence:  122
Loop number:  940
Full pdb id:  7ZK0A
Length of the sequence:  89
Loop number:  941
Full pdb id:  2EZLA
Length of the sequence:  99
Loop number:  942
Full pdb id:  1WG1A
Length of the sequence:  88
Loop number:  943
Full pdb id:  2K5QA
Length of the sequence:  105
Loop number:  944
Ful



Full pdb id:  2LWWB
Length of the sequence:  70
Loop number:  1040
Full pdb id:  2N4OA
Length of the sequence:  95
Loop number:  1041
Full pdb id:  2MTLA
Length of the sequence:  88
Loop number:  1042
Full pdb id:  2MH3A
Length of the sequence:  70
Loop number:  1043
Full pdb id:  1RJTA
Length of the sequence:  73
Loop number:  1044
Full pdb id:  2JR5A
Length of the sequence:  94
Loop number:  1045
Full pdb id:  2RQ1A
Length of the sequence:  109
Loop number:  1046
Full pdb id:  1CQUA
Length of the sequence:  56
Loop number:  1047
Full pdb id:  2CQFA
Length of the sequence:  65
Loop number:  1048
Full pdb id:  2LQJA
Length of the sequence:  94
Loop number:  1049
Full pdb id:  2MZSA
Length of the sequence:  99
Loop number:  1050
Full pdb id:  1IMTA
Length of the sequence:  80
Loop number:  1051
Full pdb id:  2L7KA
Length of the sequence:  76
Loop number:  1052
Full pdb id:  2RQRA
Length of the sequence:  119
Loop number:  1053
Full pdb id:  2EP8A
Length of the sequence:  100
Loop number



Full pdb id:  2N9PC
Length of the sequence:  100
Loop number:  1235
Full pdb id:  1BBIA
Length of the sequence:  71
Loop number:  1236
Full pdb id:  1HCDA
Length of the sequence:  118
Loop number:  1237
Full pdb id:  1COPD
Length of the sequence:  66
Loop number:  1238
Full pdb id:  1PCNA
Length of the sequence:  94
Loop number:  1239
Full pdb id:  2MAOA
Length of the sequence:  106
Loop number:  1240
Full pdb id:  2RV8A
Length of the sequence:  128
Loop number:  1241
Full pdb id:  2MQCA
Length of the sequence:  105
Loop number:  1242
Full pdb id:  2JUAA
Length of the sequence:  102
Loop number:  1243
Full pdb id:  8AR0A
Length of the sequence:  50
Loop number:  1244
Full pdb id:  2MZ7A
Length of the sequence:  46
Loop number:  1245
Full pdb id:  1WH4A
Length of the sequence:  127
Loop number:  1246
Full pdb id:  2RN4A
Length of the sequence:  106
Loop number:  1247
Full pdb id:  2KT0A
Length of the sequence:  84
Loop number:  1248
Full pdb id:  1PIHA
Length of the sequence:  74
Loop n



Full pdb id:  1BUQA
Length of the sequence:  126
Loop number:  1367
Full pdb id:  2L57A
Length of the sequence:  126
Loop number:  1368
Full pdb id:  2YUWA
Length of the sequence:  110
Loop number:  1369
Full pdb id:  5XN4X
Length of the sequence:  87
Loop number:  1370
Full pdb id:  2W0NA
Length of the sequence:  118
Loop number:  1371
Full pdb id:  6QJBA
Length of the sequence:  40
Loop number:  1372
Full pdb id:  2JTQA
Length of the sequence:  85
Loop number:  1373
Full pdb id:  2LTFA
Length of the sequence:  71
Loop number:  1374
Full pdb id:  1PAVA
Length of the sequence:  78
Loop number:  1375
Full pdb id:  2DOAA
Length of the sequence:  104
Loop number:  1376
Full pdb id:  1BW3A
Length of the sequence:  125
Loop number:  1377
Full pdb id:  2A05A
Length of the sequence:  57
Loop number:  1378
Full pdb id:  2JODA
Length of the sequence:  106
Loop number:  1379
Full pdb id:  1V5SA
Length of the sequence:  126
Loop number:  1380
Full pdb id:  2L3AA
Length of the sequence:  82
Loop n



Full pdb id:  1AJYA
Length of the sequence:  73
Loop number:  1512
Full pdb id:  1RHXA
Length of the sequence:  87
Loop number:  1513
Full pdb id:  7BQSA
Length of the sequence:  122
Loop number:  1514
Full pdb id:  2LGVA
Length of the sequence:  103
Loop number:  1515
Full pdb id:  2JV7A
Length of the sequence:  78
Loop number:  1516
Full pdb id:  2KQ6A
Length of the sequence:  78
Loop number:  1517
Full pdb id:  2L8YA
Length of the sequence:  105
Loop number:  1518
Full pdb id:  8GDGA
Length of the sequence:  117
Loop number:  1519
Full pdb id:  2B7EA
Length of the sequence:  59
Loop number:  1520
Full pdb id:  2CQ4A
Length of the sequence:  114
Loop number:  1521
Full pdb id:  6FTKA
Length of the sequence:  50
Loop number:  1522
Full pdb id:  2V0FA
Length of the sequence:  87
Loop number:  1523
Full pdb id:  2N2YA
Length of the sequence:  92
Loop number:  1524
Full pdb id:  2IF1A
Length of the sequence:  126
Loop number:  1525
Full pdb id:  2RM4A
Length of the sequence:  103
Loop nu



Full pdb id:  5N7YA
Length of the sequence:  50
Loop number:  1560
Full pdb id:  2DHXA
Length of the sequence:  104
Loop number:  1561
Full pdb id:  2KL5A
Length of the sequence:  110
Loop number:  1562
Full pdb id:  2KOUA
Length of the sequence:  102
Loop number:  1563
Full pdb id:  2YRPA
Length of the sequence:  114
Loop number:  1564
Full pdb id:  6TH8A
Length of the sequence:  125
Loop number:  1565
Full pdb id:  5Y0UA
Length of the sequence:  112
Loop number:  1566
Full pdb id:  1DCJA
Length of the sequence:  81
Loop number:  1567
Full pdb id:  1ERCA
Length of the sequence:  40
Loop number:  1568
Full pdb id:  1R05A
Length of the sequence:  87
Loop number:  1569
Full pdb id:  6MI5X
Length of the sequence:  120
Loop number:  1570
Full pdb id:  1KX2A
Length of the sequence:  82
Loop number:  1571
Full pdb id:  2K4NA
Length of the sequence:  111
Loop number:  1572




Full pdb id:  2E30B
Length of the sequence:  43
Loop number:  1573
Full pdb id:  1YWLA
Length of the sequence:  96
Loop number:  1574
Full pdb id:  2KJ6A
Length of the sequence:  97
Loop number:  1575
Full pdb id:  2D88A
Length of the sequence:  121
Loop number:  1576
Full pdb id:  1T6WA
Length of the sequence:  100
Loop number:  1577
Full pdb id:  1BUSA
Length of the sequence:  57
Loop number:  1578
Full pdb id:  2KVTA
Length of the sequence:  71
Loop number:  1579
Full pdb id:  2TMPA
Length of the sequence:  127
Loop number:  1580
Full pdb id:  8QNTA
Length of the sequence:  49
Loop number:  1581
Full pdb id:  2LSHA
Length of the sequence:  118
Loop number:  1582
Full pdb id:  1WEUA
Length of the sequence:  93
Loop number:  1583
Full pdb id:  2MBYA
Length of the sequence:  46
Loop number:  1584
Full pdb id:  5NAMA
Length of the sequence:  48
Loop number:  1585
Full pdb id:  7DABA
Length of the sequence:  64
Loop number:  1586
Full pdb id:  6OBIA
Length of the sequence:  68
Loop numbe



Full pdb id:  1CF4B
Length of the sequence:  44
Loop number:  1611
Full pdb id:  2MMVA
Length of the sequence:  86
Loop number:  1612
Full pdb id:  2RQPA
Length of the sequence:  88
Loop number:  1613
Full pdb id:  2CQHA
Length of the sequence:  93
Loop number:  1614
Full pdb id:  2N5MA
Length of the sequence:  52
Loop number:  1615
Full pdb id:  2KHCA
Length of the sequence:  118
Loop number:  1616
Full pdb id:  2LMEA
Length of the sequence:  105
Loop number:  1617
Full pdb id:  2JNZA
Length of the sequence:  108
Loop number:  1618
Full pdb id:  2DJMA
Length of the sequence:  106
Loop number:  1619
Full pdb id:  1EWIA
Length of the sequence:  114
Loop number:  1620
Full pdb id:  6O1QA
Length of the sequence:  119
Loop number:  1621
Full pdb id:  2KHPA
Length of the sequence:  92
Loop number:  1622
Full pdb id:  2KMCA
Length of the sequence:  102
Loop number:  1623
Full pdb id:  2K3GA
Length of the sequence:  102
Loop number:  1624
Full pdb id:  5TM0A
Length of the sequence:  108
Loop 



Full pdb id:  8CA0A
Length of the sequence:  115
Loop number:  1704
Full pdb id:  1G10A
Length of the sequence:  102
Loop number:  1705
Full pdb id:  6QVWA
Length of the sequence:  119
Loop number:  1706
Full pdb id:  2N4IA
Length of the sequence:  119
Loop number:  1707
Full pdb id:  2A4HA
Length of the sequence:  126
Loop number:  1708
Full pdb id:  1YWXA
Length of the sequence:  102
Loop number:  1709
Full pdb id:  2JS1A
Length of the sequence:  80
Loop number:  1710
torch.Size([1, 80, 1024])
torch.Size([1711, 129, 1024])
torch.Size([2, 80])
torch.Size([1711, 2, 129])


In [9]:
alpha_angle, alpha_angles, alpha_embeddings, alpha_embedding, _ = procces_data(alphafold_path,target_alpha_path,'alphafold')

print(np.shape(alpha_embedding[:,:,:]))
print(np.shape(alpha_embeddings[:,:,:]))

print(np.shape(alpha_angle))
print(np.shape(alpha_angles))

Length of the sequence:  119
Loop number:  0
Length of the sequence:  103
Loop number:  1
Length of the sequence:  115
Loop number:  2
Length of the sequence:  65
Loop number:  3
Length of the sequence:  36
Loop number:  4
Length of the sequence:  101
Loop number:  5
Length of the sequence:  113
Loop number:  6
Length of the sequence:  111
Loop number:  7
Length of the sequence:  120
Loop number:  8
Length of the sequence:  112
Loop number:  9
Length of the sequence:  112
Loop number:  10
Length of the sequence:  108
Loop number:  11
Length of the sequence:  95
Loop number:  12
Length of the sequence:  54
Loop number:  13
Length of the sequence:  68
Loop number:  14
Length of the sequence:  53
Loop number:  15
Length of the sequence:  69
Loop number:  16
Length of the sequence:  50
Loop number:  17
Length of the sequence:  86
Loop number:  18
Length of the sequence:  111
Loop number:  19
Length of the sequence:  62
Loop number:  20
Length of the sequence:  100
Loop number:  21
Length o

In [12]:
angles = torch.load('data_processed/alphafold_training/angles.pt')
phi_angles = angles[:,0,:]
psi_angles = angles[:,1,:]
#
print(angles.size())
#print(phi_angles.size())
#print(angles[15,:,:])

torch.Size([2630, 2, 129])
