In [1]:
import os
#import math
import numpy as np
#import random
#import logging
#import pandas as pd
# Bring in PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
# Most of the examples have typing on the signatures for readability
#from typing import Optional, Callable, List, Tuple
#from Bio import SeqIO
# For data loading
#from torch.utils.data import Dataset, IterableDataset, TensorDataset, DataLoader
#import json
#import glob
#import gzip
#import bz2
#import torch.nn.functional as F
# For progress and timing
#from tqdm import tqdm
#import time
#import shutil
#from Bio.PDB import PDBList
#from Bio.PDB.MMCIFParser import MMCIFParser
import re
#from Bio.PDB import PICIO, PDBIO
#from Bio import PDB
from transformers import BertModel, BertTokenizer, T5Tokenizer, T5EncoderModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
"""from transformers import BertModel, BertTokenizer
import re
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
seq = re.sub(r"[UZOB]", "X", seq_example)
encoded_input = tokenizer(seq, return_tensors='pt')"""

'from transformers import BertModel, BertTokenizer\nimport re\ntokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )\nseq = re.sub(r"[UZOB]", "X", seq_example)\nencoded_input = tokenizer(seq, return_tensors=\'pt\')'

In [4]:
class SelfAttention(nn.Module):
    def __init__(self, embed_dim: int):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.w_q = nn.Parameter(torch.randn(embed_dim, embed_dim))
        self.w_k = nn.Parameter(torch.randn(embed_dim, embed_dim))
        self.w_v = nn.Parameter(torch.randn(embed_dim, embed_dim))

    def forward(self, embeddings_prot_bert: torch.Tensor) -> torch.Tensor:
        Q = torch.matmul(embeddings_prot_bert, self.w_q)
        K = torch.matmul(embeddings_prot_bert, self.w_k)
        V = torch.matmul(embeddings_prot_bert, self.w_v)

        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(K.size(-1))
        attn = torch.softmax(scores, dim=-1)
        attention_output = torch.matmul(attn, V)

        return attention_output

In [5]:
class TransformerModel(nn.Module):
    def __init__(self, embed_dim: int, feed_forward_dim1: int, feed_forward_dim2: int, output_dim: int = 2, dropout_rate: float = 0.1):
        super(TransformerModel, self).__init__()
        self.self_attention = SelfAttention(embed_dim)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, feed_forward_dim1),
            nn.GELU(),
            self.dropout,
            nn.Linear(feed_forward_dim1, feed_forward_dim2),
            nn.GELU(),
            self.dropout,
            nn.Linear(feed_forward_dim2, output_dim)
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        attention_output = self.self_attention(embeddings)
        normalized_attention_output = self.layer_norm1(attention_output)
        ff_output = self.feed_forward(normalized_attention_output)
        output = self.layer_norm2(ff_output)
        return ff_output


In [6]:
class AngularLoss(nn.Module):
    def __init__(self):
        super(AngularLoss, self).__init__()

    def forward(self, predicted_angles, angles_tensor):
        predicted_angles_phi, predicted_angles_psi = predicted_angles[:, 0], predicted_angles[:, 1]
        angles_tensor_phi, angles_tensor_psi = angles_tensor[:, 0], angles_tensor[:, 1]

        predicted_angles_phi = (predicted_angles_phi + torch.pi) % (2 * torch.pi) - torch.pi
        angles_tensor_phi = (angles_tensor_phi + torch.pi) % (2 * torch.pi) - torch.pi
        predicted_angles_psi = (predicted_angles_psi + torch.pi) % (2 * torch.pi) - torch.pi
        angles_tensor_psi = (angles_tensor_psi + torch.pi) % (2 * torch.pi) - torch.pi

        difference_phi = torch.abs(predicted_angles_phi - angles_tensor_phi)
        loss_phi = torch.mean(torch.min(difference_phi, 2 * torch.pi - difference_phi))

        difference_psi = torch.abs(predicted_angles_psi - angles_tensor_psi).to(device)
        loss_psi = torch.mean(torch.min(difference_psi, 2 * torch.pi - difference_psi))
        
        loss = loss_phi + loss_psi
        return loss


In [7]:
class CustomLoss(nn.Module):
    def __init__(self, predicted_angles, angles_tensor):
        super(CustomLoss, self).__init__()
        self.predicted_angles= predicted_angles
        self.angles_tensor= angles_tensor
    def forward(self):
        d_list = []
        for i in range(len(self.angles_tensor)):
            x1, y1 = self.predicted_angles[0][i]
            x2, y2 = self.angles_tensor[i]
            ax_x = torch.min(torch.abs(x2 - x1), torch.abs(360 - torch.abs(x2 - x1)))
            ax_y = torch.min(torch.abs(y2 - y1), torch.abs(360 - torch.abs(y2 - y1)))
            d = torch.sqrt(ax_x**2 + ax_y**2)
            d_list.append(d)
        d_tensor = torch.stack(d_list)
        return d_tensor.mean() 
 

In [8]:
class TransformerTrainer:
    def __init__(self, model: nn.Module, criterion: nn.Module, num_epochs: int, sequence: torch.Tensor, angles: torch.Tensor):
        self.model = model
        self.criterion = criterion
        self.num_epochs = num_epochs
        self.sequence = sequence
        self.angles_tensor = angles
        self.optimizer = optim.AdamW(model.parameters(), lr=0.001)
        #self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10, gamma=0.1)

    def train(self):
        loss_list = []
        for epoch in range(self.num_epochs):
            self.optimizer.zero_grad()
            predictions = self.model.forward(self.sequence)[:, :len(self.angles_tensor)]
            #loss = CustomLoss()
            #loss.forward(torch.tensor(predictions, requires_grad=True), self.angles_tensor)
            loss = self.criterion(predictions.squeeze(), self.angles_tensor)
            loss.backward(retain_graph=True)
            self.optimizer.step()
            #self.scheduler.step()
            loss_list.append(loss.item())
            #print(f"Epoch {epoch + 1}/{self.num_epochs}, Loss: {loss.item()}")
        return loss_list



In [9]:

source_dir = "source"
#tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
#bert_model = BertModel.from_pretrained("Rostlab/prot_bert")

# Initialize and train transformer model
feed_forward_dim1 = 512
feed_forward_dim2 = 256
feed_forward_dim3 = 128
num_epochs = 100
dropout_rate = 0.1
transformer = TransformerModel(embed_dim=1024, feed_forward_dim1=feed_forward_dim1, feed_forward_dim2= feed_forward_dim2, dropout_rate = dropout_rate)
criterion = AngularLoss()
transformer = transformer.to(device)

In [10]:
folders = sorted(os.listdir(source_dir)) #[:50]

seq_list = []
angles_list = []
for folder in folders:
    folder_path = os.path.join(source_dir, folder)
    if os.path.isdir(folder_path):
        seq_path = os.path.join(folder_path ,f"seq_{folder}.csv" )
        ang_path = os.path.join(folder_path, f"angle_{folder}.csv")
        seq = torch.load(seq_path)
        if len(seq.replace(' ','')) <= 128:
            angle = torch.load(ang_path)
            #angle_tensor = torch.from_numpy(angle.T)
            seq_list.append(seq.replace(' ',''))
            angles_list.append(angle.T)
        


Adding padding to the angles

In [11]:
max_length = max(len(x) for x in angles_list )
padded_angles = []
for angle in angles_list:
    padding = np.zeros((max_length-len(angle),2))
    padded_angles.append(np.vstack((angle,padding)))


To add the padding Prot-T5 was used and batch encoding was done

In [12]:
transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"
print("Loading: {}".format(transformer_link))
model = T5EncoderModel.from_pretrained(transformer_link)

model = model.to(device)
model = model.eval()
tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False, legacy=True )

Loading: Rostlab/prot_t5_xl_half_uniref50-enc


In [40]:
sequence_examples = ["E M "]

# tokenize sequences and pad up to the longest sequence in the batch
ids = tokenizer.batch_encode_plus(sequence_examples,special_token=False, padding="longest")
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)
print(attention_mask.size())

# generate embeddings
with torch.no_grad():
    embedding_repr = model(input_ids=input_ids)


Keyword arguments {'special_token': False} not recognized.


torch.Size([1, 3])


In [38]:
embedding_repr.last_hidden_state[0]

tensor([[ 0.1622, -0.2762, -0.1633,  ..., -0.0368, -0.2990,  0.0872],
        [ 0.4380, -0.1595, -0.3481,  ...,  0.0857, -0.1513, -0.1183],
        [-0.1083, -0.0707,  0.0562,  ...,  0.0556,  0.0236,  0.0181]],
       device='cuda:0')

In [36]:
embedding_repr.last_hidden_state[0]

tensor([[ 0.2630, -0.2531, -0.0511,  ..., -0.0465, -0.4211, -0.1257],
        [-0.0713, -0.0862,  0.0057,  ...,  0.0311,  0.0302,  0.0646]],
       device='cuda:0')

In [14]:
sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in seq_list]

# tokenize sequences and pad up to the longest sequence in the batch
ids = tokenizer.batch_encode_plus(sequence_examples, special_tokens=False,padding="longest")
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)
print(attention_mask.size())

# generate embeddings
with torch.no_grad():
    embedding_repr = model(input_ids=input_ids)

#torch.save(embedding_repr.last_hidden_state,'embedding')

torch.Size([79, 129])


In [23]:
print(len(sequence_examples[10].replace(' ','')))
print(attention_mask[10])

80
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')


Encoded input in the embedding space with dimensions (batch_size, max_length, 1024)

In [14]:
embedded_input = embedding_repr.last_hidden_state[:,:-1,:]
print(embedded_input.size())

torch.Size([79, 128, 1024])


In [15]:
#torch.save(transformer,'model_pretraining.pt')
torch.save(transformer.state_dict(), 'pretraining_state_dict.pt')

In [16]:

for index in range(embedded_input.size()[0]):
    print(index)
    N, D = embedded_input.size()[1], embedded_input.size()[2]
    angles_tensor = torch.tensor(padded_angles[index]).to(device)
    trainer = TransformerTrainer(transformer, criterion, num_epochs, embedded_input[index], angles_tensor)
    trainer.train()
    torch.save(transformer.state_dict(), 'model_postraining.pt')


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78


In [28]:
transformer.load_state_dict(torch.load('model_postraining.pt'))
#transformer.eval()
transformer.self_attention.w_q

Parameter containing:
tensor([[-0.6842,  0.4372,  0.6347,  ...,  0.9728,  2.7258,  1.0129],
        [ 1.5015, -1.3181, -1.1218,  ..., -0.2082, -0.8369, -0.8994],
        [-0.6838,  1.0184,  1.2882,  ...,  1.5031, -1.6067, -0.2977],
        ...,
        [-0.2741,  0.8392, -2.8594,  ..., -1.0226,  0.5374, -1.4510],
        [-0.3024, -0.3968, -0.7321,  ...,  0.3048,  0.2136, -0.5446],
        [ 0.6818, -1.2832, -0.8371,  ...,  0.4947,  0.5219,  0.3092]],
       device='cuda:0', requires_grad=True)

In [38]:
transformer.feed_forward[0].weight

Parameter containing:
tensor([[ 0.0187, -0.0189, -0.0550,  ...,  0.0329,  0.0374,  0.0220],
        [ 0.0300,  0.0393,  0.0424,  ..., -0.0248, -0.0662, -0.0233],
        [ 0.0221, -0.0276, -0.0060,  ...,  0.0411, -0.0376, -0.0208],
        ...,
        [ 0.0125, -0.0256, -0.0248,  ...,  0.0326,  0.0427, -0.0077],
        [ 0.0340, -0.0050,  0.0277,  ...,  0.0365, -0.0277,  0.0311],
        [-0.0572,  0.0129, -0.0105,  ...,  0.0029, -0.0041, -0.0759]],
       device='cuda:0', requires_grad=True)

In [39]:
transformer.load_state_dict(torch.load('pretraining_state_dict.pt'))
transformer.self_attention.w_q

Parameter containing:
tensor([[-0.7857,  0.6206,  0.6637,  ...,  0.9910,  2.9711,  1.0559],
        [ 1.6767, -1.5417, -1.1654,  ..., -0.2459, -0.9541, -0.9731],
        [-0.7866,  1.0688,  1.4431,  ...,  1.6864, -1.7418, -0.4119],
        ...,
        [-0.2409,  0.9181, -3.2631,  ..., -1.1879,  0.5847, -1.5172],
        [-0.2963, -0.5263, -0.7923,  ...,  0.3881,  0.3060, -0.5665],
        [ 0.6173, -1.5142, -0.8757,  ...,  0.6113,  0.5773,  0.2670]],
       device='cuda:0', requires_grad=True)

In [40]:
transformer.feed_forward[0].weight

Parameter containing:
tensor([[-0.0275, -0.0310, -0.0194,  ...,  0.0067,  0.0306,  0.0037],
        [-0.0020, -0.0056, -0.0154,  ..., -0.0240, -0.0273, -0.0221],
        [ 0.0089, -0.0199,  0.0258,  ...,  0.0205, -0.0072, -0.0037],
        ...,
        [-0.0011, -0.0097, -0.0226,  ...,  0.0118, -0.0056, -0.0092],
        [ 0.0300, -0.0111,  0.0061,  ...,  0.0139, -0.0180,  0.0113],
        [-0.0158, -0.0188, -0.0037,  ...,  0.0021, -0.0111, -0.0222]],
       device='cuda:0', requires_grad=True)