##Installation of the libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!pip install -q transformers

In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [3]:
#!pip3 uninstall --yes torch torchaudio torchvision torchtext torchdata
!pip3 install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

Torch optimization.

##All libraries needed for training

In [4]:
import os
import math
import numpy as np
import random
import logging

# Bring in PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
# Most of the examples have typing on the signatures for readability
from typing import Optional, Callable, List, Tuple
from Bio import SeqIO
# For data loading
from torch.utils.data import Dataset, IterableDataset, TensorDataset, DataLoader
import json
import glob
import gzip
import bz2

# For progress and timing
from tqdm import tqdm
import time
import shutil
from Bio.PDB import PDBList
from Bio.PDB.MMCIFParser import MMCIFParser
import re

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Data processing

In [6]:
from Bio.PDB import PICIO, PDBIO
from Bio import PDB
from typing import TypedDict, Dict, Tuple

def angle_seq(file_path, file_model):
  # file_path = "AF-A0A1D8PD42-F1-model_v4.cif"
  # file_model = "AF-A0A1D8PD42-F1-model_v4"
  pdbl = PDBList()
  pdbl.retrieve_pdb_file(file_path, file_format='mmCif', pdir=".")
  # import the needed class
  # instantiate the class to prepare the parser
  cif_parser = MMCIFParser()
  #structure = cif_parser.get_structure("3goe", "3goe.cif")
  structure = cif_parser.get_structure(file_model, file_path)
  model0 = structure[0]
  chain_A = model0['A']  # and we get chain A
  # dictionary converting 3-letter codes to 1-letter codes
  # this is a very common need in bioinformatics of proteins
  d3to1 = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
  'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
  'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
  'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}

  sequence = []
  for residue in chain_A:
      # for simplicity we can use X for heteroatoms (ions and water)
      sequence.append(d3to1.get(residue.get_resname(), 'X'))  #converts water and ions to X
  seq = ' '.join(sequence)

  structure.atom_to_internal_coordinates() # turns xyz coordinates into angles and bond lengths

  chain:PDB.Chain.Chain = list(structure.get_chains())[0]#iterator of chains, turns it into list, [0] first chain

  ic_chain: PDB.internal_coords.IC_Chain = chain.internal_coord #this access the internal chain coords of the chain object

  d: Dict[Tuple[PDB.internal_coords.AtomKey,
                PDB.internal_coords.AtomKey,
                PDB.internal_coords.AtomKey,
                PDB.internal_coords.AtomKey],
          PDB.internal_coords.Dihedron] = ic_chain.dihedra

  cnt = 1
  phi_angles = {}
  phi_angles_list = []
  psi_angles = {}
  psi_angles_list = []

  for key in d:
      if key[0].akl[3] == 'N' and key[1].akl[3] == 'CA' and key[2].akl[3] == 'C' and key[3].akl[3] == 'N':
          phi_angles[key] = d[key].angle
          phi_angles_list.append(d[key].angle)
      elif key[0].akl[3] == 'CA' and key[1].akl[3] == 'C' and key[2].akl[3] == 'N' and key[3].akl[3] == 'CA':
          psi_angles[key] = d[key].angle
          psi_angles_list.append(d[key].angle)


  structure.internal_to_atom_coordinates(verbose = True)
  io = PDBIO() #this is to write a pdb file again
  io.set_structure(structure)#set structure, the structure you wan tin the pdb file

  phi_angles_list.append(0)
  psi_angles_list.append(0)

  phi = np.asarray(phi_angles_list,dtype=np.float32)*(np.pi/180)
  psi = np.asarray(psi_angles_list,dtype=np.float32)*(np.pi/180)
  angles = np.vstack((psi,phi))

  # out_seq.write(seq)
  # out_angle.write(angles)
  return seq , angles






In [27]:
def process_files_in_main_folder(main_folder_path):
    # List all subfolders in the main folder
    subfolders = os.listdir(main_folder_path)

    for subfolder in subfolders:
        subfolder_name = os.path.basename(subfolder)
        file_path = os.path.join(main_folder_path, subfolder_name)

        if subfolder_name.endswith('.cif'):
            # Extract the base name (excluding ".cif")
            file_model = subfolder_name[:-4]
            newpath = os.path.join('/content/source', file_model)
            # print('new path', newpath)
            if not os.path.exists(newpath):
              os.makedirs(newpath)
              # with open(f"{file_model}.txt" , 'w') as sequence:
              #   with open(f"{file_model}.txt", 'w') as target_angle:
            seq , angles = angle_seq(file_path, file_model)
              # print('fl path', file_path)
              # sequence = open(f"{file_model}.csv" , 'w', )
              # target_angle = open(f"{file_model}.csv", 'w')
              # sequence.write(seq)
              # target_angle.write(angles)
            seq_path = os.path.join(newpath,f"seq_{file_model}.csv" )
            ang_path = os.path.join(newpath, f"angle_{file_model}.csv")
            # print('seq path:', seq_path, '\n', 'angl path:', ang_path)

            torch.save(seq , seq_path)
            torch.save(angles, ang_path)

    print('finish')

In [28]:
process_files_in_main_folder('/content/data')


new path /content/source/AF-Q9SVM3-F1-model_v4
Downloading PDB structure '/content/data/af-q9svm3-f1-model_v4.cif'...
Desired structure doesn't exist
AF-Q9SVM3-F1-model_v4 A coordinates for 0 dihedra updated in 0 iterations
new path /content/source/AF-F4I3J6-F1-model_v4
Downloading PDB structure '/content/data/af-f4i3j6-f1-model_v4.cif'...
Desired structure doesn't exist
AF-F4I3J6-F1-model_v4 A coordinates for 0 dihedra updated in 0 iterations
new path /content/source/AF-B3H5J3-F1-model_v4
Downloading PDB structure '/content/data/af-b3h5j3-f1-model_v4.cif'...
Desired structure doesn't exist
AF-B3H5J3-F1-model_v4 A coordinates for 0 dihedra updated in 0 iterations
new path /content/source/AF-Q84WY5-F1-model_v4
Downloading PDB structure '/content/data/af-q84wy5-f1-model_v4.cif'...
Desired structure doesn't exist
AF-Q84WY5-F1-model_v4 A coordinates for 0 dihedra updated in 0 iterations
finish


when we want to use the folders and files in the code we want a for loop to go to each folder, then use the angle and sequence folders and use them in our code