(info about pisces dataset) 
Here, using the CIF files from the PISCES dataset, we extract the backbone dihedral angles (phi and psi; omega is not extracted at the moment) and the sequences. The sequences are also projected into an embedding space with the ProtBert model for ease of use. All sequences are shorter than 129 residues. 

## Importing libraries

In [None]:
#accessing files
import os

#manipulation libraries
import numpy as np
import torch
import pandas as pd

#for python object structure
import pickle

#biopython for parsing the cif files and manipulation of the cif files
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from Bio import PDB

#data structures for manipulation
from typing import Dict, Tuple

#transformer model for embedding space creation
from transformers import BertModel, BertTokenizer

#Run on the gpu when torch reports one, otherwise fall back to the cpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

## Changing to GPU

### Initializations for Pisces data

In [None]:
file_path = 'data/cullpdb_pc30_0_res0_0_2_0_noBrks_noDsdr_len40_10000_R0_25_Nmr_d2024'
#The list is whitespace separated; `delim_whitespace=True` is deprecated in recent pandas
#and a whitespace regex separator is its documented replacement
df = pd.read_csv(file_path, sep=r'\s+')

#Keeping only the entries whose sequence length is at most 128
df = df[df['len'] <= 128]
pdb_ids = df['PDBchain'].tolist()
print(pdb_ids)

#Chain information for the sequence: the entry without its last character is mapped
#to the full entry, whose last character is used as the chain id later on
pdb_id_map = {pdb_id[:-1]: pdb_id for pdb_id in pdb_ids}
print(pdb_id_map)

mapping_file = 'data_processed/pisces_training/pdb_id_mapping.csv'
#Creating the output folder first so that the export does not fail on a fresh checkout
os.makedirs(os.path.dirname(mapping_file), exist_ok=True)
#Keys are stored lower-cased
pd.Series({k.lower(): v for k, v in pdb_id_map.items()}).to_csv(mapping_file, header=False)

In [None]:
#Embedding model initialization (ProtBert tokenizer and encoder)
#TODO: change model and tokenizer to t5
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
#The encoder is loaded and moved to the selected device in one step
model_embedd = BertModel.from_pretrained("Rostlab/prot_bert").to(device)


In [None]:
#Sequence, angle and embedding processing
def _pisces_chain_id(pdb_file):
    '''
    Looks up which chain of the given pisces file has to be used.

    Input =>
    pdb_file : path of the structure file; the first 4 characters of its name are the lookup key

    Output =>
    chain id (last character of the mapped entry), or None when the file has no entry
    '''
    #Same file that is written when the pisces list is filtered
    mapping_file = 'data_processed/pisces_training/pdb_id_mapping.csv'

    #squeeze("columns") keeps a Series even when the mapping holds a single row
    pdb_id_map = pd.read_csv(mapping_file, header=None, index_col=0, dtype=str).squeeze("columns").to_dict()

    #The keys of the mapping are stored lower-cased, so the lookup key is lower-cased as well
    pdb_id = os.path.basename(pdb_file)[:4].lower()

    full_pdb_id = pdb_id_map.get(pdb_id)  #pdb id with the correct chain
    if full_pdb_id is None:
        return None

    print("Full pdb id: ", full_pdb_id)
    return full_pdb_id[-1]  #the last letter of the id is the chain


def extract_input(pdb_file, model, tokenizer,data):
    '''
    Input =>
    pdb_file : file containing all the information for the given protein (.cif, parsed with MMCIFParser)
    tokenizer : tokenizer for the sequence tokenization
    model : transformer model used for creating embedding space
    data : data nickname wanted to be extracted ('alphafold' or 'pisces')

    Output =>
    sequence : list of one-letter codes extracted from the file, 'X' for anything that is not
               one of the 20 standard residue names
    angle_tensor : tensor with two rows; row 0 holds the C-N-CA-C dihedrals (phi) and
                   row 1 the N-CA-C-N dihedrals (psi), each followed by one trailing 0
    embeddings : embedding created using the model for the given sequence, with the first
                 position and everything after len(sequence) positions cropped away

    (None, None, None) is returned for a pisces file without a mapping entry or whose mapped
    chain is not present in the structure, so that the caller can skip it.

    Raises =>
    ValueError when data is neither 'pisces' nor 'alphafold'
    '''
    #Checked before any parsing so that a typo in the nickname fails immediately and clearly
    if data not in ('pisces', 'alphafold'):
        raise ValueError(f"Unknown data nickname {data!r}; expected 'pisces' or 'alphafold'")

    #Parsing the cif file
    cif_parser = MMCIFParser()
    structure = cif_parser.get_structure("protein", pdb_file) # getting structure
    model0 = structure[0]

    #This is a very common need in bioinformatics of proteins
    d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
    'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
    'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
    'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    #One chain object is used for both the sequence and the angles
    if data == 'pisces':
        chain_id = _pisces_chain_id(pdb_file)
        if chain_id is None:
            print(f"No chain mapping found for {pdb_file}")
            return None, None, None
        if chain_id not in model0:
            print(f"Chain {chain_id} not found in {pdb_file}")
            return None, None, None
        chain = model0[chain_id]
    else:
        chain = model0['A']

    #Every residue object of the chain contributes one letter;
    #for simplicity X is used for heteroatoms (ions and water)
    #NOTE(review): these X entries are also tokenized and embedded below - confirm this is wanted
    sequence = [d3to1.get(residue.get_resname(), 'X') for residue in chain]

    structure.atom_to_internal_coordinates() # turns xyz coordinates into angles and bond lengths

    #This accesses the internal chain coords of the chain object;
    #dihedra maps a 4-tuple of atom keys to the dihedron object
    ic_chain = chain.internal_coord

    phi_angles_list = []
    psi_angles_list = []

    for key, dihedron in ic_chain.dihedra.items():
        #akl[3] is the field of the atom key that is compared against the atom names
        atom_names = tuple(atom_key.akl[3] for atom_key in key)
        if atom_names == ('C', 'N', 'CA', 'C'):
            phi_angles_list.append(dihedron.angle)
        elif atom_names == ('N', 'CA', 'C', 'N'):
            psi_angles_list.append(dihedron.angle)

    #One 0 is appended at the end of both lists
    #NOTE(review): both rows are padded at the end; confirm the per-residue alignment
    #of the phi row is what the downstream code expects
    phi_angles_list.append(0)
    psi_angles_list.append(0)

    phi = np.asarray(phi_angles_list,dtype=np.float32)
    psi = np.asarray(psi_angles_list,dtype=np.float32)
    #Row order is kept as before: C-N-CA-C dihedrals first, N-CA-C-N dihedrals second
    angles = np.vstack((phi,psi))
    angle_tensor = torch.tensor(angles, dtype=torch.float32).to(device)

    #Encoding the sequence
    encoded_input = tokenizer.encode(sequence,return_tensors='pt').to(device)

    #Create the embedding using the model
    with torch.no_grad():
        outputs = model(input_ids=encoded_input)

    #Cropping the embedding created because of the added start and end paddings
    start_index = 1
    end_index = len(sequence) + 1
    embeddings = outputs.last_hidden_state[:,start_index:end_index,:]

    return sequence, angle_tensor, embeddings

In [None]:
#Folder with the alphafold structure files (passed to procces_data as directory_path)
alphafold_path = 'alphafold_data2'
#Folder where the processed alphafold tensors and sequences are written (target_path)
target_alpha_path = 'data_processed/alphafold_training/'

#Folder with the pisces structure files (passed to procces_data as directory_path)
pisces_path = 'data/small_proteins'
#Folder where the processed pisces tensors and sequences are written (target_path)
target_pisces_path = 'data_processed/pisces_training/'


In [None]:
def procces_data(directory_path, target_path,data, max_length=129, embedding_dim=1024):
    '''
    Processes every file of a folder and saves the padded results.

    Input =>
    directory_path : folder containing the structure files
    target_path : folder where embeddings.pt, angles.pt and sequences.pkl are written
                  (created when it does not exist)
    data : data nickname forwarded to extract_input (alphafold, pisces)
    max_length : number of positions every sequence is padded (or cut) to
    embedding_dim : size of the last dimension of the embeddings

    Output =>
    angle : angle tensor of the last processed file (None if nothing was processed)
    angles : tensor of shape (number of processed files, 2, max_length), zero padded
    embeddings : tensor of shape (number of processed files, max_length, embedding_dim), zero padded
    embedding : embedding of the last processed file (None if nothing was processed)
    sequences : list with one padded list of letters per processed file
    '''
    os.makedirs(target_path, exist_ok=True)

    embeddings_file = os.path.join(target_path, 'embeddings.pt')
    angles_file = os.path.join(target_path, 'angles.pt')
    sequences_file = os.path.join(target_path, 'sequences.pkl')

    def pad_sequence(sequence, target_length=max_length, pad_char="X"):
        #Cut to the target length first so that the letters always match the tensor rows
        trimmed = list(sequence[:target_length])
        return trimmed + [pad_char] * (target_length - len(trimmed))

    #One zero-initialized row per processed file; the rows are stacked once at the end.
    #Collecting rows (instead of indexing with the loop counter) keeps the tensors and
    #the sequence list aligned when a file is skipped
    embedding_rows = []
    angle_rows = []
    sequences = []
    angle = None
    embedding = None

    #Get list of files in the directory containing pdb/cif files,
    #sorted so that the row order is the same on every run
    file_list = sorted(os.listdir(directory_path))

    for i,filename in enumerate(file_list):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            #sub-folders cannot be parsed as structure files
            continue

        #Extracting the information from the files
        sequence, file_angle, file_embedding = extract_input(file_path, model_embedd, tokenizer,data)
        if sequence is None or file_embedding is None or file_angle is None:
            print(f"Skipping {file_path} due to missing data")
            continue
        print('Length of the sequence: ',len(sequence))

        angle, embedding = file_angle, file_embedding
        sequences.append(pad_sequence(sequence))

        #Copying at most max_length positions into the zero rows; this is the padding
        embedding_length = min(embedding.size(1), max_length)
        embedding_row = torch.zeros(max_length, embedding_dim, device=device)
        embedding_row[:embedding_length] = embedding[0, :embedding_length]
        embedding_rows.append(embedding_row)

        angle_length = min(angle.size(1), max_length)
        angle_row = torch.zeros(2, max_length, device=device)
        angle_row[:, :angle_length] = angle[:, :angle_length]
        angle_rows.append(angle_row)
        print('Loop number: ',i)

    if embedding_rows:
        embeddings = torch.stack(embedding_rows)
        angles = torch.stack(angle_rows)
    else:
        #Nothing processed: empty tensors with the usual trailing dimensions
        embeddings = torch.zeros(0, max_length, embedding_dim, device=device)
        angles = torch.zeros(0, 2, max_length, device=device)

    #Saving the tensors to the files
    torch.save(embeddings, embeddings_file)
    torch.save(angles, angles_file)

    #Saving the sequence
    with open(sequences_file, 'wb') as f:
        pickle.dump(sequences, f)

    return angle, angles, embeddings, embedding, sequences



### Checking if the sizes are correct

In [None]:
#Running the pisces set; the call gives back the last per-file tensors and the stacked ones
pisces_results = procces_data(pisces_path,target_pisces_path,'pisces')
pisces_angle, pisces_angles, pisces_embeddings, pisces_embedding, _ = pisces_results

#Shape of the last single embedding, then of the stacked embeddings
print(np.shape(pisces_embedding[:,:,:]), np.shape(pisces_embeddings[:,:,:]), sep='\n')

#Shape of the last single angle tensor, then of the stacked angles
print(np.shape(pisces_angle), np.shape(pisces_angles), sep='\n')

In [None]:
#Running the alphafold set; the call gives back the last per-file tensors and the stacked ones
alpha_results = procces_data(alphafold_path,target_alpha_path,'alphafold')
alpha_angle, alpha_angles, alpha_embeddings, alpha_embedding, _ = alpha_results

#Shape of the last single embedding, then of the stacked embeddings
print(np.shape(alpha_embedding[:,:,:]), np.shape(alpha_embeddings[:,:,:]), sep='\n')

#Shape of the last single angle tensor, then of the stacked angles
print(np.shape(alpha_angle), np.shape(alpha_angles), sep='\n')

In [None]:
#Reloading the saved alphafold angles; map_location lets a tensor that was saved from the gpu
#be loaded on a machine that only has a cpu
angles = torch.load('data_processed/alphafold_training/angles.pt', map_location=device)
#Row 0 holds the C-N-CA-C dihedrals (phi), row 1 the N-CA-C-N dihedrals (psi)
phi_angles = angles[:,0,:]
psi_angles = angles[:,1,:]
#
print(angles.size())
#print(phi_angles.size())
#print(angles[15,:,:])