In [None]:
# Imported in this cell because it runs before the notebook's main import cell;
# on a fresh kernel PDBList / MMCIFParser would otherwise be undefined here.
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser

# Local mmCIF file to parse and the identifier given to the parsed structure.
file_path = "/content/AF-A0A1D8PD42-F1-model_v4.cif"
file_model = "AF-A0A1D8PD42-F1-model_v4"

# PDBList.retrieve_pdb_file() expects a PDB ID as its first argument; the
# former call passed `file_path` (a local path), which cannot download
# anything. The structure is parsed straight from `file_path` below, so that
# call has been dropped.
# NOTE(review): `pdbl` is no longer used in this cell; it is kept only in case
# another cell relies on the name.
pdbl = PDBList()

# Parse the mmCIF file, then take the first model and its chain 'A'.
cif_parser = MMCIFParser()
structure = cif_parser.get_structure(file_model, file_path)
model0 = structure[0]
chain_A = model0['A']

# Three-letter -> one-letter codes for the 20 standard amino acids.
d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
         'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
         'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
         'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

# One letter per residue of chain A; any residue name missing from the table
# (e.g. water or ions) becomes 'X'.
seq = [d3to1.get(residue.get_resname(), 'X') for residue in chain_A]
print(''.join(seq))

In [None]:
from Bio.PDB import PICIO, PDBIO
from Bio import PDB
from typing import TypedDict, Dict, Tuple

# Derive internal coordinates (bond lengths, angles, dihedrals) from the
# Cartesian atom positions of the parsed structure.
structure.atom_to_internal_coordinates()

# First chain of the structure and its internal-coordinate view.
chain: PDB.Chain.Chain = list(structure.get_chains())[0]
ic_chain: PDB.internal_coords.IC_Chain = chain.internal_coord

# Every dihedral of the chain, keyed by the four AtomKeys that define it.
d: Dict[Tuple[PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey,
              PDB.internal_coords.AtomKey],
        PDB.internal_coords.Dihedron] = ic_chain.dihedra

cnt = 1  # NOTE(review): not read anywhere in this cell

# Backbone dihedral definitions by atom name:
#   phi = C(i-1) - N(i)  - CA(i) - C(i)
#   psi = N(i)   - CA(i) - C(i)  - N(i+1)
# The previous patterns stored N-CA-C-N (psi) as phi and CA-C-N-CA (omega)
# as psi.
PHI_ATOMS = ('C', 'N', 'CA', 'C')
PSI_ATOMS = ('N', 'CA', 'C', 'N')

# Angles are collected in the iteration order of `ic_chain.dihedra`.
# phi needs the previous residue's C, so the first residue has no phi;
# psi needs the next residue's N, so the last residue has no psi.
# Per-residue alignment therefore needs phi padded at the front and psi at
# the back.
phi_angles_list = []
psi_angles_list = []

for key, dihedron in d.items():
    # Index 3 of AtomKey.akl is the atom name.
    atom_names = tuple(atom_key.akl[3] for atom_key in key)
    if atom_names == PHI_ATOMS:
        phi_angles_list.append(dihedron.angle)
    elif atom_names == PSI_ATOMS:
        psi_angles_list.append(dihedron.angle)


# Rebuild Cartesian coordinates from the internal coordinates and hand the
# structure to a PDB writer (nothing is saved to disk in this cell).
structure.internal_to_atom_coordinates(verbose = True)
io = PDBIO()
io.set_structure(structure)

In [None]:
# Imported here because this cell runs before the notebook's main import cell.
import numpy as np

# Pad each angle list with one trailing 0. The padding is applied to copies,
# so re-running this cell does not keep growing phi_angles_list and
# psi_angles_list (the previous in-place append added a zero on every run).
phi_padded = phi_angles_list + [0]
psi_padded = psi_angles_list + [0]

# Convert to float32 arrays and scale by pi/180 (degrees -> radians).
phi = np.asarray(phi_padded, dtype=np.float32) * (np.pi / 180)
psi = np.asarray(psi_padded, dtype=np.float32) * (np.pi / 180)

# Row 0 is psi, row 1 is phi; one column per list entry.
angles = np.vstack((psi, phi))

In [None]:
# Join the entries of `seq` with single spaces and show the result as the
# cell output.
seq_example = str.join(' ', seq)
seq_example

In [None]:
# Hard-coded, space-separated one-letter residue string (89 residues).
# Rebinding `seq` here replaces whatever value the name held before.
seq = (
    "M S S S N T D N Q Y "
    "P K Y I N D T T P P "
    "T I T L K E Y D N A "
    "S W A S T T C L D H "
    "N P I K N Q Y I V V "
    "V M E N P N Q I V A "
    "I I D Q Q D N M I L "
    "D I L F K N A H D A "
    "H S K Q E Y S T K"
)

In [None]:
import re
from transformers import BertModel, BertTokenizer

# Load the tokenizer published under "Rostlab/prot_bert" without lower-casing.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# Replace every U, Z, O or B in the sequence with X before tokenizing.
rare_residue_pattern = re.compile(r"[UZOB]")
seq = rare_residue_pattern.sub("X", seq)

# Tokenize the sequence and return PyTorch tensors.
encoded_input = tokenizer(seq, return_tensors='pt')


In [None]:
import os
import math
import numpy as np
import random
import logging

# Bring in PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
# Most of the examples have typing on the signatures for readability
from typing import Optional, Callable, List, Tuple
from Bio import SeqIO
# For data loading
from torch.utils.data import Dataset, IterableDataset, TensorDataset, DataLoader
import json
import glob
import gzip
import bz2

# For progress and timing
from tqdm import tqdm
import time
import shutil
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser

In [None]:
# Tokenizer published under "Rostlab/prot_bert", loaded without lower-casing.
tokenizer = BertTokenizer.from_pretrained(
    "Rostlab/prot_bert",
    do_lower_case=False,
)

# Swap every U, Z, O or B for X, then tokenize into PyTorch tensors.
seq_example = re.sub(pattern=r"[UZOB]", repl="X", string=seq_example)
encoded_input = tokenizer(seq_example, return_tensors='pt')


In [None]:
def transformer(input_sequence, feed_forward_dim, output_dim=2, model=None):
    """Embed a tokenized sequence and pass it through one attention block.

    The input is embedded with a BERT-style encoder, passed through a single
    unmasked self-attention head, layer-normalized, fed through a three-layer
    feed-forward network and layer-normalized again.

    All attention weights, normalization layers and feed-forward layers are
    created with fresh random initial values on every call.

    Args:
        input_sequence: Mapping of keyword arguments for the encoder (for
            example the output of a Hugging Face tokenizer called with
            ``return_tensors='pt'``).
        feed_forward_dim: Width of the two hidden feed-forward layers.
        output_dim: Number of values predicted per token (default 2).
        model: Optional encoder whose call result exposes
            ``last_hidden_state``. When omitted, "Rostlab/prot_bert" is
            loaded, which is what the function always did before this
            parameter existed.

    Returns:
        Tensor with one ``output_dim``-sized row per token of the encoder
        output, with the encoder's leading batch axis preserved.
    """
    # Put the input in the embedding space.
    if model is None:
        model = BertModel.from_pretrained("Rostlab/prot_bert")

    embedding_prot_bert = model(**input_sequence).last_hidden_state
    N, D = embedding_prot_bert.size(1), embedding_prot_bert.size(2)

    # Single-head, unmasked self-attention: query/key/value projections (D x D).
    w_q = nn.Parameter(torch.randn(D, D))
    w_k = nn.Parameter(torch.randn(D, D))
    w_v = nn.Parameter(torch.randn(D, D))

    Q = torch.matmul(embedding_prot_bert, w_q)
    K = torch.matmul(embedding_prot_bert, w_k)
    V = torch.matmul(embedding_prot_bert, w_v)

    # Scaled dot-product attention divides by sqrt of the key dimension (D).
    # The previous code divided by sqrt(K.size()[1]), i.e. the token count.
    scores = torch.matmul(Q, K.transpose(-1, -2)) / (D ** 0.5)
    # Softmax over the key axis gives the attention weights.
    attn = torch.softmax(scores, dim=-1)
    # Context vectors: attention-weighted sum of the values.
    attention_output = torch.matmul(attn, V)

    # First normalization layer.
    # NOTE(review): normalizes jointly over the (N, D) token/feature axes
    # rather than per token -- confirm this is intended.
    layer_norm1 = nn.LayerNorm((N, D))
    embedded_after_attention = layer_norm1(attention_output)

    # Feed-forward network applied to every token.
    feed_forward = nn.Sequential(
        nn.Linear(D, feed_forward_dim),
        nn.ReLU(),
        nn.Linear(feed_forward_dim, feed_forward_dim),
        nn.ReLU(),
        nn.Linear(feed_forward_dim, output_dim)
        )

    ff_output = feed_forward(embedded_after_attention)

    # Second normalization layer. Its shape follows output_dim; the previous
    # hard-coded 2 failed for any other output_dim.
    layer_norm2 = nn.LayerNorm((N, output_dim))
    embedded_after_ff = layer_norm2(ff_output)

    return embedded_after_ff





In [None]:
input_sequence = encoded_input
output = transformer(input_sequence, 500)

# Regression targets: one row per entry of `angles`, two columns.
angles_tensor = torch.from_numpy(angles.T)
n_residues = angles_tensor.shape[0]

# Align predictions with the targets. The BERT tokenizer wraps the sequence in
# [CLS] ... [SEP], so drop the batch axis, skip the leading [CLS] position and
# keep exactly one row per target row. The previous slice used len(seq), the
# character count of a space-separated string, and kept the special tokens.
output = output[0, 1:1 + n_residues]
if output.shape != angles_tensor.shape:
    raise ValueError(
        f"prediction shape {tuple(output.shape)} does not match "
        f"target shape {tuple(angles_tensor.shape)}"
    )

# Mean squared error between predicted and target angles.
criterion = nn.MSELoss()
loss = criterion(output, angles_tensor)

# Backward pass. No optimizer is defined in this cell, so gradients are
# computed but no parameter update is applied.
loss.backward()
total_loss = loss.item()
print(total_loss)