## Installation of the libraries

In [None]:
!pip install -q transformers

In [None]:
!pip install biopython



In [None]:
#!pip3 uninstall --yes torch torchaudio torchvision torchtext torchdata
!pip3 install torch



Torch optimization.

## All libraries needed for training

In [None]:
import os
import math
import numpy as np
import random
import logging

# Bring in PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
# Most of the examples have typing on the signatures for readability
from typing import Optional, Callable, List, Tuple
from Bio import SeqIO
# For data loading
from torch.utils.data import Dataset, IterableDataset, TensorDataset, DataLoader
import json
import glob
import gzip
import bz2

# For progress and timing
from tqdm import tqdm
import time
import shutil
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser
import re

In [None]:
# Run on the GPU when PyTorch can see a CUDA device, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Data processing

Discard the sequences whose length falls outside the allowed range (64–128 residues).

Extract the amino-acid sequence from a given file in the target folder (which contains only the files with the desired sequences).

In [None]:
# Parse one mmCIF structure file and print the one-letter amino-acid sequence
# of chain A of its first model.
file_path = "/content/AF-A0A1D8PD42-F1-model_v4.cif"
file_model = "AF-A0A1D8PD42-F1-model_v4.cif"

pdbl = PDBList()  # not used in this cell; kept in case a later cell relies on it
cif_parser = MMCIFParser()

# get_structure(structure_id, filename): the file name doubles as the id.
structure = cif_parser.get_structure(file_model, file_path)
model0 = structure[0]
chain_A = model0['A']

# Three-letter -> one-letter residue codes for the 20 standard amino acids.
d3to1 = {
    'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q',
    'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T',
    'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H',
    'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A',
    'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
}

# Any residue name missing from the table (e.g. water or ions) becomes 'X'.
sequence = []
for residue in chain_A:
    three_letter = residue.get_resname()
    sequence.append(d3to1.get(three_letter, 'X'))
print(''.join(sequence))

MSSSNTDNQYPKYINDTTPPTITLKEYDNASWASTTCLDHNPIKNQYIVVVMENPNQIVAIIDQQDNMILDILFKNAHDAHSKQEYSTK


Calculating the angles for the given sequence. As a first step, the backbone atom coordinates (N, CA, C) of each residue are collected.

In [None]:

# Collect the backbone coordinates of chain A: one XYZ row per atom, ordered
# N, CA, C within each residue.
backbone_names = ('N', 'CA', 'C')

atom_coords = []
for residue in chain_A:
    # A residue lacking any of the three backbone atoms contributes no rows.
    if not all(name in residue for name in backbone_names):
        continue
    atom_coords.extend(residue[name].get_coord() for name in backbone_names)

# The side-chain CB atom, which only some residues have, is deliberately
# ignored for now to keep a fixed three atoms per residue.
atom_coords = np.array(atom_coords)
print(atom_coords)

[[ 2.36229992e+01  2.66019993e+01  1.49000001e+00]
 [ 2.40100002e+01  2.52490005e+01  1.04799998e+00]
 [ 2.27509995e+01  2.44069996e+01  1.04200006e+00]
 [ 2.21219997e+01  2.42329998e+01 -1.19999997e-01]
 [ 2.08980007e+01  2.34360008e+01 -2.36000001e-01]
 [ 2.12859993e+01  2.19909992e+01 -5.14999986e-01]
 [ 2.16170006e+01  2.12560005e+01  5.42999983e-01]
 [ 2.17390003e+01  1.98010006e+01  4.74999994e-01]
 [ 2.03279991e+01  1.92169991e+01  4.90000010e-01]
 [ 1.96509991e+01  1.92420006e+01 -6.53999984e-01]
 [ 1.83269997e+01  1.86410007e+01 -8.19000006e-01]
 [ 1.83750000e+01  1.77010002e+01 -2.00699997e+00]
 [ 1.88470001e+01  1.64860001e+01 -1.76100004e+00]
 [ 1.84790001e+01  1.53079996e+01 -2.53900003e+00]
 [ 1.88820000e+01  1.40780001e+01 -1.72200000e+00]
 [ 1.83139992e+01  1.39480000e+01 -5.22000015e-01]
 [ 1.79039993e+01  1.26029997e+01 -1.28000006e-01]
 [ 1.68320007e+01  1.22519999e+01 -1.14300001e+00]
 [ 1.71770000e+01  1.14270000e+01 -2.13199997e+00]
 [ 1.62579994e+01  1.09359999e+

In [None]:
# Write a backbone coordinate matrix (e.g. a predicted one) back into the
# parsed structure. `atom_coords` must have one XYZ row per backbone atom, in
# the order N, CA, C for each residue of chain A -- the same layout produced
# when `atom_coords` was extracted from `chain_A`.
print(np.shape(atom_coords))

BACKBONE_ATOM_NAMES = ('N', 'CA', 'C')

# Walk the same chain the coordinates were read from, so rows and atoms line
# up one-to-one. Iterating every model/chain of `structure` would run past the
# end of `atom_coords` as soon as the file holds more than chain A of model 0.
residue_cnt = 0
backbone_residues = []
for residue in chain_A:
    residue_cnt += 1
    # Only residues with a complete N/CA/C backbone contributed rows.
    if all(name in residue for name in BACKBONE_ATOM_NAMES):
        backbone_residues.append(residue)

# Validate before mutating anything: a row-count mismatch would otherwise
# either raise half-way through or leave part of the structure un-updated.
expected_rows = len(backbone_residues) * len(BACKBONE_ATOM_NAMES)
if len(atom_coords) != expected_rows:
    raise ValueError(
        f"atom_coords has {len(atom_coords)} rows, but chain A needs "
        f"{expected_rows} (3 backbone atoms x {len(backbone_residues)} residues)"
    )

# Consume the rows in order: N, CA, C of the first residue, then the next, ...
coord_idx = 0
for residue in backbone_residues:
    for name in BACKBONE_ATOM_NAMES:
        residue[name].set_coord(atom_coords[coord_idx])
        coord_idx += 1

print(residue_cnt)
# Notice that the number of coordinate rows is three times the residue count
# (three backbone atoms per residue) when every residue has a full backbone.

(267, 3)
89
