In [15]:
!pip install Bio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from Bio.PDB import *
import numpy as np
import os
from tqdm import tqdm

In [18]:
MAX_LENGTH_D = 130
MAX_LENGTH_P = 30
TOTAL_lENGTH = MAX_LENGTH_D + MAX_LENGTH_P

AA_DICT = {"A": 0, "C": 1, "D": 2, "E": 3, "F": 4, "G": 5, "H": 6, "I": 7, "K": 8, "L": 9, "M": 10, "N": 11,
           "P": 12, "Q": 13, "R": 14, "S": 15, "T": 16, "W": 17, "Y": 18, "V": 19, "X": 20, "-": 21}
UNIQE_AA = {"UNK": "X", "TYS": "Y", "FME": "M", "PCA": "Q", "CSD":"C", "MLY":"K", "SEP":"S", "YCM":"C", "CSX": "C", "NEP":"H", "IAS":"D", "MSE":"M","IPG":"G", "NMC" : "G", "PTR": "Y"}
FEATURE_NUM = len(AA_DICT) +2
BACKBONE_ATOMS = ["N", "CA", "C", "O", "CB"]
OUTPUT_SIZE = len(BACKBONE_ATOMS) * 3
DOMAIN_CHAIN = "A"
P_CHAIN = "B"
D_COL = 22
P_COL = 23

In [20]:
def get_seq_aa(pdb_file, chain_id):
    """
    returns the sequence (String) and a list of all the aa residue objects of the given protein chain.
    :param pdb_file: path to a pdb file
    :param chain_id: chain letter (char)
    :return: sequence, [aa objects]
    """
    # load model
    print(pdb_file)
    chain = PDBParser(QUIET=True).get_structure(pdb_file, pdb_file)[0][chain_id]

    aa_residues = []
    seq = ""

    for residue in chain.get_residues():
        aa = residue.get_resname()
        if not is_aa(aa) or not residue.has_id('CA'): # Not amino acid
            continue
        elif aa in UNIQE_AA:  # unkown amino acid
            seq += UNIQE_AA[aa]
        else:
            seq += Polypeptide.three_to_one(residue.get_resname())
        aa_residues.append(residue)

    return seq, aa_residues

In [None]:
def generate_input(pdb_file): # TODO: implement this!
    """
    receives a pdb file and returns its sequence in a one-hot encoding matrix (each row is an aa in the sequence, and
    each column represents a different aa out of the 20 aa + 2 special columns).
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H')
    :return: numpy array of shape (NB_MAX_LENGTH, FEATURE_NUM)
    """

    # get seq and aa residues
    seqD, _ = get_seq_aa(pdb_file, DOMAIN_CHAIN)
    seqP, _ = get_seq_aa(pdb_file, P_CHAIN)
    input_mat = np.zeros([MAX_LENGTH_D + MAX_LENGTH_P,FEATURE_NUM],dtype=int)


    for i,aa in enumerate(seqD):
      input_mat[i,AA_DICT[aa]] = 1
      input_mat[i,D_COL] = 1

    for i,aa in enumerate(seqP):
      input_mat[i+MAX_LENGTH_D,AA_DICT[aa]] = 1
      input_mat[i+MAX_LENGTH_D,P_COL] = 1
      
    for i in range(len(seqD),MAX_LENGTH_D):
      input_mat[i,AA_DICT['-']] = 1
      input_mat[i,D_COL] = 1

    for i in range(len(seqP)+MAX_LENGTH_D,MAX_LENGTH_P+MAX_LENGTH_D):
      input_mat[i,AA_DICT['-']] = 1
      input_mat[i,P_COL] = 1
    return input_mat

In [None]:
def generate_label(pdb_file):  # TODO: implement this!
    """
    receives a pdb file and returns its backbone + CB coordinates.
    :param pdb_file: path to a pdb file  already alingned to a reference.
    :return: numpy array of shape (CDR_MAX_LENGTH, OUTPUT_SIZE).
    """
    # get seq and aa residues
    label_mat = np.zeros([MAX_LENGTH_D+MAX_LENGTH_P, OUTPUT_SIZE])

    seqD, aa_residues_D = get_seq_aa(pdb_file, DOMAIN_CHAIN )
    for i,res in enumerate(aa_residues_D):
      for atom in res.get_atoms():
        if atom.name == BACKBONE_ATOMS[0]:
          label_mat[i,0:3] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[1]:
          label_mat[i,3:6] = list(atom.get_vector())
        elif atom.name ==  BACKBONE_ATOMS[2]:
          label_mat[i,6:9] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[3]:
          label_mat[i,9:12] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[4] and res.get_resname() != 'G':
          label_mat[i,12:15] = list(atom.get_vector())

    seqP, aa_residues_P = get_seq_aa(pdb_file, P_CHAIN )
    for i,res in enumerate(aa_residues_P):
      for atom in res.get_atoms():
        if atom.name == BACKBONE_ATOMS[0]:
          label_mat[i+MAX_LENGTH_D,0:3] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[1]:
          label_mat[i+MAX_LENGTH_D,3:6] = list(atom.get_vector())
        elif atom.name ==  BACKBONE_ATOMS[2]:
          label_mat[i+MAX_LENGTH_D,6:9] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[3]:
          label_mat[i+MAX_LENGTH_D,9:12] = list(atom.get_vector())
        elif atom.name == BACKBONE_ATOMS[4] and res.get_resname() != 'G':
          label_mat[i+MAX_LENGTH_D,12:15] = list(atom.get_vector())
    
    return label_mat

In [None]:
from re import T
def matrix_to_pdb(seq_D, seq_P, coord_matrix, pdb_name):
    """
    Receives a sequence (String) and the output matrix of the neural network (coord_matrix, numpy array)
    and creates from them a PDB file named pdb_name.pdb.
    :param seq: protein sequence (String), with no padding
    :param coord_matrix: output np array of the nanobody neural network, shape = (NB_MAX_LENGTH, OUTPUT_SIZE)
    :param pdb_name: name of the output PDB file (String)
    """
    ATOM_LINE = "ATOM{}{}  {}{}{} {}{}{}{}{:.3f}{}{:.3f}{}{:.3f}  1.00{}{:.2f}           {}\n"
    END_LINE = "END\n"
    k = 1
    with open(f"{pdb_name}.pdb", "w") as pdb_file:
        for i, aa in enumerate(seq_D):
            third_space = (4 - len(str(i))) * " "
            for j, atom in enumerate(BACKBONE_ATOMS):
                if not (aa == "G" and atom == "CB"):  # GLY lacks CB atom
                    x, y, z = coord_matrix[i][3*j], coord_matrix[i][3*j+1], coord_matrix[i][3*j+2]
                    b_factor = 0.00
                    first_space = (7 - len(str(k))) * " "
                    second_space = (4 - len(atom)) * " "
                    forth_space = (12 - len("{:.3f}".format(x))) * " "
                    fifth_space = (8 - len("{:.3f}".format(y))) * " "
                    sixth_space = (8 - len("{:.3f}".format(z))) * " "
                    seventh_space = (6 - len("{:.2f}".format(b_factor))) * " "

                    pdb_file.write(ATOM_LINE.format(first_space, k, atom, second_space, Polypeptide.one_to_three(aa) , "A", third_space, 
                                                    i, forth_space, x, fifth_space, y, sixth_space, z, seventh_space,
                                                    b_factor, atom[0]))
                    k += 1
        for t, aa in enumerate(seq_P):
            third_space = (4 - len(str(t))) * " "
            for l, atom in enumerate(BACKBONE_ATOMS):
                if not (aa == "G" and atom == "CB"):  # GLY lacks CB atom
                    x, y, z = coord_matrix[t+MAX_LENGTH_D][3*l], coord_matrix[t+MAX_LENGTH_D][3*l+1], coord_matrix[t+MAX_LENGTH_D][3*l+2]
                    b_factor = 0.00
                    first_space = (7 - len(str(k))) * " "
                    second_space = (4 - len(atom)) * " "
                    forth_space = (12 - len("{:.3f}".format(x))) * " "
                    fifth_space = (8 - len("{:.3f}".format(y))) * " "
                    sixth_space = (8 - len("{:.3f}".format(z))) * " "
                    seventh_space = (6 - len("{:.2f}".format(b_factor))) * " "

                    pdb_file.write(ATOM_LINE.format(first_space, k, atom, second_space, Polypeptide.one_to_three(aa) , "B", third_space, 
                                                    t, forth_space, x, fifth_space, y, sixth_space, z, seventh_space,
                                                    b_factor, atom[0]))
                    k += 1
          

        pdb_file.write(END_LINE)

In [24]:
if __name__ == '__main__':
    
   #  you can make all the data for the network in this section.
   # you can save the matrices to your drive and load them in your google colab file later.


    input_matrix = []
    labels_matrix = []
    data_path = "/content/drive/MyDrive/3D protein Hackathon/test_pdbs/"  # TODO: change path if needed
    
    for pdb in tqdm(os.listdir(data_path)):
        if pdb.endswith(".pdb"):

          nb_one_hot = generate_input(os.path.join(data_path, pdb))
          nb_xyz = generate_label(os.path.join(data_path, pdb))
      
          input_matrix.append(nb_one_hot)
          labels_matrix.append(nb_xyz)

    save_path = "/content/drive/MyDrive/3D protein Hackathon/"  # TODO: change path if needed

    np.save(f"{save_path}/test_input.npy", np.array(input_matrix))
    np.save(f"{save_path}/test_labels.npy", np.array(labels_matrix))

    print(f"Number of samples: {len(input_matrix)}")

  0%|          | 0/12 [00:00<?, ?it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb6atv.pdb


  8%|▊         | 1/12 [00:00<00:03,  3.63it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb6atv.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb6atv.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb6atv.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4u5w.pdb


 17%|█▋        | 2/12 [00:00<00:02,  4.26it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4u5w.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4u5w.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4u5w.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb3rea.pdb


 25%|██▌       | 3/12 [00:00<00:02,  3.20it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb3rea.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb3rea.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb3rea.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1qwe.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1qwe.pdb


 33%|███▎      | 4/12 [00:01<00:02,  3.59it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1qwe.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1qwe.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1rlp.pdb


 42%|████▏     | 5/12 [00:01<00:02,  3.46it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1rlp.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1rlp.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1rlp.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2rqw.pdb


 50%|█████     | 6/12 [00:01<00:01,  3.07it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2rqw.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2rqw.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2rqw.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4afs.pdb


 58%|█████▊    | 7/12 [00:02<00:01,  3.51it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4afs.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4afs.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4afs.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4j9f.pdb


 67%|██████▋   | 8/12 [00:02<00:01,  3.42it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4j9f.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4j9f.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb4j9f.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2oi3.pdb


 75%|███████▌  | 9/12 [00:02<00:00,  3.26it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2oi3.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2oi3.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb2oi3.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1jqq.pdb


 83%|████████▎ | 10/12 [00:02<00:00,  3.68it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1jqq.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1jqq.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1jqq.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1b07.pdb


 92%|█████████▏| 11/12 [00:03<00:00,  3.56it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1b07.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1b07.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1b07.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1avz.pdb


100%|██████████| 12/12 [00:03<00:00,  3.43it/s]

/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1avz.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1avz.pdb
/content/drive/MyDrive/3D protein Hackathon/test_pdbs/pdb1avz.pdb
Number of samples: 12



