In [6]:
#accessing files
import os

#manipulation libraries
import numpy as np
import torch
import pandas as pd

#for python object structure
import pickle

#biopython for parsing the cif files and manipulation of the cif files
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from Bio import PDB

#data structures for manipulation
from typing import Dict, Tuple

#transformer model for embedding space creation
from transformers import BertModel, BertTokenizer

#gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  torch.utils._pytree._register_pytree_node(


device(type='cpu')

In [6]:
#Embedding model initialization
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False ) # change model and tokenizer to t5
model_embedd = BertModel.from_pretrained("Rostlab/prot_bert")
model_embedd = model_embedd.to(device)


In [7]:
#Sequence, angle and embedding processing
def extract_input(pdb_file, model, tokenizer,data):
    '''
    Input =>
    pdb_file : file containing all the information for the given protein (.pdb or .cif)
    tokenizer : tokenizer for the sequence tokenization
    model : transformer model used for creating embedding space
    data : data nickname wanted to be extracted (alphafold, pisces)
    
    Output =>
    sequence : extracted from the file
    angle_tensor : tensor containing phi and psi angles for the given sequence
    embeddings : embedding created using the model for the given sequence
    '''
    #print('Data folder used:',data)
    #Parsing the cif file
    cif_parser = MMCIFParser()
    structure = cif_parser.get_structure("protein", pdb_file) # getting structure
    model0 = structure[0]

    #This is a very common need in bioinformatics of proteins
    d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
    'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
    'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
    'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

    if data=='pisces':
        #This is to access the dictionary keys, which are the same as the file name, 
        #Once the keys are accessed we can find which chain to take
        mapping_file = os.path.join('data_processed/new_training/pdb_id_mapping.csv') 

        pdb_id_map = pd.read_csv(mapping_file, header=None, index_col=0).squeeze().to_dict()

        #extracting the file name as it contains the id
        filename = os.path.basename(pdb_file)
        pdb_id = filename[:4] # the first 4 letters of the file name is the dictionary key,whose value is the chain to be used

        full_pdb_id = pdb_id_map.get(pdb_id, None) #pdb id with the correct chain

        chain_A = model0[full_pdb_id[-1]]  # and we get chain A , the last letter of the id is the chain
        print("Full pdb id: ", full_pdb_id)

        #Creating a dictionary to collect chain ids and turn them into indices
        chain_id_to_num = {}
        for num, chain in enumerate(model0.get_chains()):
            chain_id_to_num[chain.id] = num

        #Iterator of chains, turns it into list, [0] first chain
        chain:PDB.Chain.Chain = list(model0.get_chains())[chain_id_to_num.get(full_pdb_id[-1])]

    elif data=='alphafold':
        chain_A = model0['A']
        #Iterator of chains, turns it into list, [0] first chain
        chain:PDB.Chain.Chain = list(model0.get_chains())[0]

    sequence = []
    for residue in chain_A:
        #For simplicity we can use X for heteroatoms (ions and water)
        sequence.append(d3to1.get(residue.get_resname(), 'X'))  #converts water and ions to X
    
    structure.atom_to_internal_coordinates() # turns xyz coordinates into angles and bond lengths

    #This accesses the internal chain coords of the chain object
    ic_chain: PDB.internal_coords.IC_Chain = chain.internal_coord 

    d: Dict[Tuple[PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey],
        PDB.internal_coords.Dihedron] = ic_chain.dihedra

    phi_angles_list = []
    psi_angles_list = []

    for key in d:
      if key[0].akl[3] == 'N' and key[1].akl[3] == 'CA' and key[2].akl[3] == 'C' and key[3].akl[3] == 'N':
          phi_angles_list.append(d[key].angle)
      elif key[0].akl[3] == 'C' and key[1].akl[3] == 'N' and key[2].akl[3] == 'CA' and key[3].akl[3] == 'C':
        psi_angles_list.append(d[key].angle)

    structure.internal_to_atom_coordinates(verbose = False)
    io = PDBIO() #this is to write a pdb/cif file again
    io.set_structure(structure)#setting the structure to the desired strcuture in the given file
    phi_angles_list.append(0)
    psi_angles_list.append(0)

    phi = np.asarray(phi_angles_list,dtype=np.float32)
    psi = np.asarray(psi_angles_list,dtype=np.float32)
    angles = np.vstack((psi,phi))
    angle_tensor = torch.tensor(angles, dtype=torch.float32).to(device)

    #Encoding the sequence
    encoded_input = tokenizer.encode(sequence,return_tensors='pt').to(device)  

    #Create the embedding using the model
    with torch.no_grad():
        outputs = model(input_ids=encoded_input)

    #Cropping the embedding created because of the added start and end paddings
    start_index = 1
    end_index = len(sequence) + 1
    embeddings = outputs.last_hidden_state[:,start_index:end_index,:]
    #print('Embedding length: ',embeddings.size())

    return sequence, angle_tensor, embeddings 

In [8]:
#Data path
alphafold_path = 'alphafold_data2'
#Processed data path
target_alpha_path = 'data_processed/alphafold_training/'

#Data path
pisces_path = 'data/small_proteins'
#Processed data path
target_pisces_path = 'data_processed/pisces_training/'

In [9]:
def procces_data(directory_path, target_path,data):

    embeddings_file = os.path.join(target_path, 'embeddings.pt')
    angles_file = os.path.join(target_path, 'angles.pt')
    sequences_file = os.path.join(target_path, 'sequences.pkl') 
    
    #Embeddings tensor and angles tensor initialized with zero tensors
    #The calculated values are concatenated, this way the padding is achieved to have equal length for each sequence
    #129(max_length)
    embeddings = torch.zeros(0, 129, 1024).to(device) 
    angles = torch.zeros(0, 2, 129).to(device)
    sequences = []

    def pad_sequence(sequence, target_length=129, pad_char="X"):
        padding_length = target_length - len(sequence)
        padding = [pad_char] * padding_length
        return sequence + padding

    #Get list of files in the directory containing pdb/cif files
    file_list = os.listdir(directory_path)

    for i,filename in enumerate(file_list):
        file_path = os.path.join(directory_path, filename)
    
        #Extracting the information from the files
        sequence, angle, embedding= extract_input(file_path, model_embedd, tokenizer,data)
        if sequence is None or embedding is None or angle is None:
            print(f"Skipping {file_path} due to missing data")
            continue
        print('Length of the sequence: ',len(sequence))

        
        padded_sequence = pad_sequence(sequence)
        sequences.append(padded_sequence)

        #Append the new data to the existing tensors
        embedding_length = min(embedding.size(1), 129)
        if i >= embeddings.size(0):
            embeddings = torch.cat([embeddings.to(device), torch.zeros(1, 129, 1024).to(device)], dim=0)
        
        embeddings[i, :embedding_length] = embedding[ :, :]
        angle_length = min(angle.size(1), 129)
        if i >= angles.size(0):
            angles = torch.cat([angles.to(device), torch.zeros(1, 2, 129).to(device)], dim=0)
        
        angles[i, :, :angle_length] = angle[:, :]
        print('Loop number: ',i)
        
    
    #Saving the updated tensors back to the files
    torch.save(embeddings, embeddings_file)
    torch.save(angles, angles_file)

    #Saving the sequence
    with open(sequences_file, 'wb') as f:
        pickle.dump(sequences, f)

    return angle, angles, embeddings, embedding, sequences



In [12]:
#Importing processed PISCES data
pisces_angles = torch.load('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/pisces_training/angles.pt', map_location=torch.device('cpu'))
pisces_embeddings = torch.load('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/pisces_training/embeddings.pt', map_location=torch.device('cpu'))
with open('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/pisces_training/sequences.pkl', 'rb') as f:
        pisces_sequences = pickle.load(f)

#Loading data to the device
#pisces_angles = pisces_angles.to(device)
#pisces_embeddings = pisces_embeddings.to(device)

print("Size of the angles: ",pisces_angles.size())
print("Size of the embeddings: ",pisces_embeddings.size())
print("Number of sequences in total: ",len(pisces_sequences)) 


pisces_sequences = pisces_sequences

Size of the angles:  torch.Size([1711, 2, 129])
Size of the embeddings:  torch.Size([1711, 129, 1024])
Number of sequences in total:  1711


In [14]:
#Importing processed AlphaFold data
alphafold_angles = torch.load('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/alphafold_training/angles.pt', map_location=torch.device('cpu'))
alphafold_embeddings = torch.load('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/alphafold_training/embeddings.pt', map_location=torch.device('cpu'))
with open('/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/data_processed/alphafold_training/sequences.pkl', 'rb') as f:
        alphafold_sequences = pickle.load(f)

#Loading data to the device
alphafold_angles = alphafold_angles[:1000,:,:].to(device)
alphafold_embeddings = alphafold_embeddings[:1000,:,:].to(device)

print("Size of the angles: ",alphafold_angles.size())
print("Size of the embeddings: ",alphafold_embeddings.size())
print("Number of sequences in total: ",len(alphafold_sequences))

#print(alphafold_angles[2,:,:])
alphafold_sequences = alphafold_sequences[:1000]

Size of the angles:  torch.Size([1000, 2, 129])
Size of the embeddings:  torch.Size([1000, 129, 1024])
Number of sequences in total:  2630


In [7]:

def extract_sequence_from_structure(file_path):
    """
    Extracts protein sequences from a PDB or CIF file.
    
    Args:
    file_path (str): Path to the PDB or CIF file.
    
    Returns:
    sequences (dict): A dictionary where keys are chain identifiers and values are sequences.
    """
    if file_path.endswith('.cif'):
        parser = MMCIFParser()
    else:
        parser = PDBParser()
    
    structure = parser.get_structure('protein', file_path)
    
    sequences = {}
    for model in structure:
        for chain in model:
            seq = ''
            for residue in chain:
                if residue.id[0] == ' ':  # Exclude hetero residues
                    seq += PDB.Polypeptide.three_to_one(residue.resname)
            sequences[chain.id] = seq
    return sequences


file_path = "/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/files/AF-A0A1D8PD42-F1-model_v4.cif"
sequences = extract_sequence_from_structure(file_path)
print(sequences)


{'A': 'MSSSNTDNQYPKYINDTTPPTITLKEYDNASWASTTCLDHNPIKNQYIVVVMENPNQIVAIIDQQDNMILDILFKNAHDAHSKQEYSTK'}


In [43]:
from collections import defaultdict

def parse_cif(file_content):
    metadata = defaultdict(list)
    current_loop = None

    for line in file_content:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        if line.startswith('_'):
            parts = line.split(None, 1)
            if len(parts) == 2:  # Ensure there are both key and value
                key, value = parts
                value = value.strip()
                metadata[key].append(value)
            current_loop = None
        
        elif line.startswith('loop_'):
            current_loop = []
        
        elif current_loop is not None:
            current_loop.append(line.strip().split())

    return dict(metadata)

def extract_specific_metadata(metadata):
    extracted_data = {}

    # Extract entry ID
    extracted_data['Entry ID'] = metadata.get('_entry.id', [None])[0]

    # Extract authors
    authors = metadata.get('_citation_author.name', [])
    extracted_data['Authors'] = authors

    # Extract citation details
    # extracted_data['Authors'] = metadata.get('_citation_author.name', [None])[0]
    extracted_data['Citation Title'] = metadata.get('_citation.title', [None])[0]
    extracted_data['Citation Year'] = metadata.get('_citation.year', [None])[0]
    extracted_data['Journal'] = metadata.get('_citation.journal_full', [None])[0]
    extracted_data['DOI'] = metadata.get('_citation.pdbx_database_id_DOI', [None])[0]
    extracted_data['country'] = metadata.get('_citation.country', [None])[0]

    # Extract model details
    extracted_data['Type'] = metadata.get('_entity.type', [None])[0]
    extracted_data['Poly Type'] = metadata.get('_entity_poly.type', [None])[0]
    extracted_data['Model Name'] = metadata.get('_ma_model_list.model_name', [None])[0]
    extracted_data['Model Type'] = metadata.get('_ma_model_list.model_type', [None])[0]
    extracted_data['pLDDT'] = metadata.get('_ma_qa_metric_global.metric_value', [None])[0]
    extracted_data['Gene Name'] = metadata.get('_ma_target_ref_db_details.gene_name', [None])[0]

    return extracted_data

cif_file_path = "/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/files/AF-A0A1D8PD42-F1-model_v4.cif"
with open(cif_file_path, 'r') as file:
    file_content = file.readlines()


metadata = parse_cif(file_content)

extracted_metadata = extract_specific_metadata(metadata)

for key, value in extracted_metadata.items():
    print(f"{key}: {value}")


Entry ID: AF-A0A1D8PD42-F1
Authors: None
Citation Title: "Highly accurate protein structure prediction with AlphaFold"
Citation Year: 2021
Journal: Nature
DOI: 10.1038/s41586-021-03819-2
country: UK
Type: polymer
Poly Type: polypeptide(L)
Model Name: "Top ranked model"
Model Type: "Ab initio model"
pLDDT: 86.99
Gene Name: orf19.1026.1


In [1]:
def blocks_between_hashtags(file_content):
    
    in_block = False
    blocks = []
    current_block = []

    for line in file_content:
        line = line.strip()

        if line == '#':
            if in_block:
                if current_block:
                    blocks.append('\n'.join(current_block))
                    current_block = []
            in_block = not in_block
        elif in_block:
            current_block.append(line)

    return blocks

cif_file_path = "/Users/goudarzimandanagmail.com/Desktop/TransformerFromScratch/files/AF-A0A1D8PD42-F1-model_v4.cif"
with open(cif_file_path, 'r') as file:
    file_content = file.readlines()

blocks = blocks_between_hashtags(file_content)

for i, block in enumerate(blocks, start=1):
    print(f"Block {i}:\n{block}\n{'-'*50}")


Block 1:
_entry.id AF-A0A1D8PD42-F1
--------------------------------------------------
Block 2:
loop_
_audit_author.name
_audit_author.pdbx_ordinal
"Jumper, John"               1
"Evans, Richard"             2
"Pritzel, Alexander"         3
"Green, Tim"                 4
"Figurnov, Michael"          5
"Ronneberger, Olaf"          6
"Tunyasuvunakool, Kathryn"   7
"Bates, Russ"                8
"Zidek, Augustin"            9
"Potapenko, Anna"            10
"Bridgland, Alex"            11
"Meyer, Clemens"             12
"Kohl, Simon A. A."          13
"Ballard, Andrew J."         14
"Cowie, Andrew"              15
"Romera-Paredes, Bernardino" 16
"Nikolov, Stanislav"         17
"Jain, Rishub"               18
"Adler, Jonas"               19
"Back, Trevor"               20
"Petersen, Stig"             21
"Reiman, David"              22
"Clancy, Ellen"              23
"Zielinski, Michal"          24
"Steinegger, Martin"         25
"Pacholska, Michalina"       26
"Berghammer, Tamas"          