##Installation of the libraries

In [36]:
!pip install -q transformers

In [37]:
!pip install biopython



In [38]:
#!pip3 uninstall --yes torch torchaudio torchvision torchtext torchdata
!pip3 install torch



Torch optimization.

##All libraries needed for training

In [202]:
import os
import math
import numpy as np
import random
import logging

# Bring in PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
# Most of the examples have typing on the signatures for readability
from typing import Optional, Callable, List, Tuple
from Bio import SeqIO
# For data loading
from torch.utils.data import Dataset, IterableDataset, TensorDataset, DataLoader
import json
import glob
import gzip
import bz2

# For progress and timing
from tqdm import tqdm
import time
import shutil
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser
import re

In [203]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Data processing

Getting rid of the sequences which are outside of the threshold (64-128).

In [204]:
pdbl = PDBList()

# Define the source directory containing the files
source_dir = ""

# Define the target directory where the filtered files will be moved
target_dir = ''

# Ensure the target directory exists, if not, create it
#os.makedirs(target_dir, exist_ok=True)



# Define a function to determine if a file meets the filtering criteria
def filter_criteria(file_path,file_model):
    #pdbl.retrieve_pdb_file("3goe", file_format='mmCif', pdir=".")
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(file_path, file_format='mmCif', pdir=".")
    # import the needed class
    # instantiate the class to prepare the parser
    cif_parser = MMCIFParser()
    #structure = cif_parser.get_structure("3goe", "3goe.cif")
    structure = cif_parser.get_structure(file_model, file_path)
    model0 = structure[0]

    #model0 = structure[1] - error due to there only being one model
    chain_A = model0['A']
    #structure = cif_parser.get_structure("3goe", "3goe.cif")
    print(structure[0]["A"])
    if 64 <= len(structure[0]["A"]) <= 128:
        shutil.move(file_path, target_dir)
        print(f"Moved file: {file_path} to {target_dir}")

for item in os.listdir(source_dir):
    item_path = os.path.join(source_dir, item)
    item_model = item.replace(".cif","")
    filter_criteria(item_path,item_model)




FileNotFoundError: [Errno 2] No such file or directory: ''

Getting the sequence of a given file in the target folder (contains only the files with desired sequences).

In [205]:
file_path = "AF-A0A1D8PD42-F1-model_v4.cif"
file_model = "AF-A0A1D8PD42-F1-model_v4"
pdbl = PDBList()
pdbl.retrieve_pdb_file(file_path, file_format='mmCif', pdir=".")
# import the needed class
# instantiate the class to prepare the parser
cif_parser = MMCIFParser()
#structure = cif_parser.get_structure("3goe", "3goe.cif")
structure = cif_parser.get_structure(file_model, file_path)
model0 = structure[0]
chain_A = model0['A']  # and we get chain A
# dictionary converting 3-letter codes to 1-letter codes
# this is a very common need in bioinformatics of proteins
d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
 'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

sequence = []
for residue in chain_A:
    # for simplicity we can use X for heteroatoms (ions and water)
    sequence.append(d3to1.get(residue.get_resname(), 'X'))  #converts water and ions to X
print(''.join(sequence))

Downloading PDB structure 'AF-A0A1D8PD42-F1-model_v4.cif'...
Desired structure doesn't exists
MSSSNTDNQYPKYINDTTPPTITLKEYDNASWASTTCLDHNPIKNQYIVVVMENPNQIVAIIDQQDNMILDILFKNAHDAHSKQEYSTK


Calculating the angles for the given sequence

In [206]:
#phi and psi
from Bio.PDB import PICIO, PDBIO
from Bio import PDB
from typing import TypedDict, Dict, Tuple
structure.atom_to_internal_coordinates() # turns xyz coordinates into angles and bond lengths

chain:PDB.Chain.Chain = list(structure.get_chains())[0]#iterator of chains, turns it into list, [0] first chain

ic_chain: PDB.internal_coords.IC_Chain = chain.internal_coord #this access the internal chain coords of the chain object

d: Dict[Tuple[PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey],
        PDB.internal_coords.Dihedron] = ic_chain.dihedra

cnt = 1
phi_angles = {}
phi_angles_list = []
psi_angles = {}
psi_angles_list = []

for key in d:
    if key[0].akl[3] == 'N' and key[1].akl[3] == 'CA' and key[2].akl[3] == 'C' and key[3].akl[3] == 'N':
        phi_angles[key] = d[key].angle
        phi_angles_list.append(d[key].angle)
    elif key[0].akl[3] == 'CA' and key[1].akl[3] == 'C' and key[2].akl[3] == 'N' and key[3].akl[3] == 'CA':
        psi_angles[key] = d[key].angle
        psi_angles_list.append(d[key].angle)


structure.internal_to_atom_coordinates(verbose = True)
io = PDBIO() #this is to write a pdb file again
io.set_structure(structure)#set structure, the structure you wan tin the pdb file

Putting angles in a matrix.

In [207]:
phi_angles_list.append(0)
psi_angles_list.append(0)

phi = np.asarray(phi_angles_list,dtype=np.float32)*(np.pi/180)
psi = np.asarray(psi_angles_list,dtype=np.float32)*(np.pi/180)
angles = np.vstack((psi,phi))

Changing sequence for to be used in the Prot-Bert embedding.

In [208]:
seq_example =  ' '.join(sequence)
seq_example

'M S S S N T D N Q Y P K Y I N D T T P P T I T L K E Y D N A S W A S T T C L D H N P I K N Q Y I V V V M E N P N Q I V A I I D Q Q D N M I L D I L F K N A H D A H S K Q E Y S T K'

##Embedding space creation (using Prot-Bert)

In [209]:
from transformers import BertModel, BertTokenizer
import re
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
seq = re.sub(r"[UZOB]", "X", seq_example)
encoded_input = tokenizer(seq, return_tensors='pt')

Getting the dimensions of the embedding space.

In [210]:
N , D = output.last_hidden_state.size()[1], output.last_hidden_state.size()[2]
embedding_prot_bert = output.last_hidden_state


In [211]:
print(N)
print(D)

91
1024


##Single-head self unmasked attention layer

In [212]:
class SelfAttention(nn.Module):
    def __init__(self, embed_dim: int):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.w_q = nn.Parameter(torch.randn(embed_dim, embed_dim))
        self.w_k = nn.Parameter(torch.randn(embed_dim, embed_dim))
        self.w_v = nn.Parameter(torch.randn(embed_dim, embed_dim))

    def forward(self, embeddings_prot_bert: torch.Tensor) -> torch.Tensor:
        Q = torch.matmul(embeddings_prot_bert, self.w_q)
        K = torch.matmul(embeddings_prot_bert, self.w_k)
        V = torch.matmul(embeddings_prot_bert, self.w_v)

        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(K.size(-1))
        attn = torch.softmax(scores, dim=-1)
        attention_output = torch.matmul(attn, V)

        return attention_output

##Encoder with attention and 2 layer FFNN

In [213]:
class TransformerModel(nn.Module):
    def __init__(self, embed_dim: int, feed_forward_dim1: int, feed_forward_dim2: int, output_dim: int = 2):
        super(TransformerModel, self).__init__()
        self.self_attention = SelfAttention(embed_dim)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(output_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, feed_forward_dim1),
            nn.ReLU(),
            nn.Linear(feed_forward_dim1, feed_forward_dim2),
            nn.ReLU(),
            nn.Linear(feed_forward_dim2, output_dim)
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        attention_output = self.self_attention(embeddings)
        normalized_attention_output = self.layer_norm1(attention_output)
        ff_output = self.feed_forward(normalized_attention_output)
        #output = self.layer_norm2(ff_output)
        return ff_output


In [214]:
class TransformerTrainer:
    def __init__(self, model: nn.Module, learning_rates: List[float], criterion: nn.Module, num_epochs: int, sequence: torch.Tensor, angles: np.ndarray):
        self.model = model
        self.learning_rates = learning_rates
        self.criterion = criterion
        self.num_epochs = num_epochs
        self.sequence = sequence
        self.angles_tensor = torch.from_numpy(angles.T)
        self.optimizer = optim.SGD(model.parameters(), lr=learning_rates[0])

    def train(self):
        loss_list = []
        for epoch in range(self.num_epochs):
            predictions = self.model(self.sequence)[:, :len(self.angles_tensor)]
            loss = self.criterion(predictions, self.angles_tensor)
            loss.backward(retain_graph=True)
            self.optimizer.step()
            self.optimizer.zero_grad()
            loss_list.append(loss.item())

            if loss_list[-1] - loss.item() < 0 and loss_list[-1] - loss.item() < 0.04:
                self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rates[4])
            elif 0.05 < loss_list[-1] - loss.item() < 0.5:
                self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rates[2])
            else:
                self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rates[0])

            print(f"Epoch {epoch + 1}/{self.num_epochs}, Loss: {loss.item()}")



In [215]:
bert_model = BertModel.from_pretrained("Rostlab/prot_bert")
output = bert_model(**encoded_input)
embedded_pb = output.last_hidden_state
N, D = embedded_pb.size()[1], embedded_pb.size()[2]

# Initialize and train transformer model
feed_forward_dim1 = 512
feed_forward_dim2 = 256
learning_rates = [0.01, 0.1, 0.05, 0.2, 0.5]
num_epochs = 50

model = TransformerModel(embed_dim=D, feed_forward_dim1=feed_forward_dim1, feed_forward_dim2= feed_forward_dim2)
criterion = nn.MSELoss()
trainer = TransformerTrainer(model, learning_rates, criterion, num_epochs, embedded_pb, angles)
trainer.train()

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/50, Loss: 6.110396862030029
Epoch 2/50, Loss: 5.023796558380127
Epoch 3/50, Loss: 4.6743669509887695
Epoch 4/50, Loss: 4.566898822784424
Epoch 5/50, Loss: 4.54811954498291
Epoch 6/50, Loss: 4.546210289001465
Epoch 7/50, Loss: 4.546042442321777
Epoch 8/50, Loss: 4.5459442138671875
Epoch 9/50, Loss: 4.5458855628967285
Epoch 10/50, Loss: 4.545851230621338
Epoch 11/50, Loss: 4.545818328857422
Epoch 12/50, Loss: 4.5457844734191895
Epoch 13/50, Loss: 4.545750617980957
Epoch 14/50, Loss: 4.5457024574279785
Epoch 15/50, Loss: 4.545644760131836
Epoch 16/50, Loss: 4.5456109046936035
Epoch 17/50, Loss: 4.545578956604004
Epoch 18/50, Loss: 4.545544624328613
Epoch 19/50, Loss: 4.545510768890381
Epoch 20/50, Loss: 4.5454792976379395
Epoch 21/50, Loss: 4.54544734954834
Epoch 22/50, Loss: 4.54541540145874
Epoch 23/50, Loss: 4.545381546020508
Epoch 24/50, Loss: 4.54534912109375
Epoch 25/50, Loss: 4.545316696166992
Epoch 26/50, Loss: 4.545283794403076
Epoch 27/50, Loss: 4.54525089263916
Epoch 28

## Training

In [156]:
len(encoded_input) = N
encoded_input = tokenizer(seq_example, return_tensors='pt')
embeddings = D
output_dim = 2
num_heads = 1
feed_forward_dim = 512
num_layers = 2
dropout_rate = 0.001
max_len = 128
model = TransformerEncoder(encoded_input, embeddings, output_dim, num_heads, feed_forward_dim, num_layers, dropout_rate, max_len)
# model = TransformerEncoder(N, D, 2, 1, 200)
model = model.to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 1


#define the loss function and optimizer
criterion = nn.MSELoss()#nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) #model.parameter?

data = [(seq_example,angles)]
print(data)

def train(model, data, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for encoded_input, target_seq in data:

        #clear gradients w.r.t. parameters
        optimizer.zero_grad()

        output_seq = model(encoded_input)

        # print(output_seq)
        #reshape output and target to calculate loss
        output_seq = output_seq.view(-1, output_dim)
        target_seq = target_seq.view(-1, output_dim)

        #calculate loss: softmax --> cross entropy loss
        loss = criterion(output_seq, target_seq)

        #backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss +=loss.item()

    return total_loss / len(data)

for epoch in range(num_epochs):
    epoch_loss = train(model, data, criterion, optimizer, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

[('M S S S N T D N Q Y P K Y I N D T T P P T I T L K E Y D N A S W A S T T C L D H N P I K N Q Y I V V V M E N P N Q I V A I I D Q Q D N M I L D I L F K N A H D A H S K Q E Y S T K', array([[ 3.1403687 ,  2.9137056 ,  3.0676677 ,  2.6755645 ,  2.4376874 ,
         3.0883443 ,  2.8678832 ,  3.1158628 ,  2.9880154 ,  3.0470784 ,
         3.0327706 ,  3.0104942 ,  3.1298056 ,  2.9787838 , -3.0790305 ,
        -3.0347888 , -2.9572258 ,  3.0289235 ,  3.0597904 ,  2.9211586 ,
         3.0762625 ,  2.974405  ,  3.0007036 ,  3.0569184 ,  3.0577672 ,
         3.0367448 , -3.126715  , -3.135688  , -3.1012697 ,  3.0807595 ,
         3.1382358 , -3.117632  , -3.1371424 ,  3.0779903 , -3.0377026 ,
         2.9499664 ,  2.9536798 ,  3.0535054 ,  3.0442324 ,  3.0053039 ,
        -3.0663934 , -3.1265657 ,  3.1294613 , -3.022085  , -3.0948193 ,
         2.9532042 , -3.101842  ,  2.9802246 , -3.102699  ,  3.0692794 ,
        -3.0905883 ,  3.1111856 ,  3.1259942 , -3.032551  ,  3.0171309 ,
         3.113

TypeError: 'BatchEncoding' object is not callable