In [None]:
# delete this cell if working on Pycharm
!pip install Bio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from Bio.PDB import *
import numpy as np
import os
from tqdm import tqdm

In [None]:
## constants:

# One-hot column index of every residue symbol: the 20 standard amino acids,
# "X" (unknown / unmapped residue) and "-" (padding up to the maximum length).
AA_DICT = {symbol: column for column, symbol in enumerate("ACDEFGHIKLMNPQRSTWYVX-")}

# Three-letter residue names that get a hand-picked one-letter code in get_seq_aa.
UNIQE_AA = {
    "UNK": "X",
    "TYS": "Y",
    "FME": "M",
    "PCA": "Q",
    "CSD": "C",
    "MLY": "K",
    "SEP": "S",
    "YCM": "C",
    "CSX": "C",
    "NEP": "H",
    "IAS": "D",
    "PM3": "X",
}

# Width of one input row: the residue one-hot columns plus 2 extra columns
# (generate_input uses them to flag SH2 rows and peptide rows).
FEATURE_NUM = 2 + len(AA_DICT)

# Atoms whose xyz coordinates make up one label row (3 values per atom).
BACKBONE_ATOMS = ["N", "CA", "C", "O", "CB"]
OUTPUT_SIZE = 3 * len(BACKBONE_ATOMS)

SH2_MAX_LEN = 140 + 10  # maximum length in data + safety range
PEP_MAX_LEN = 25 + 10  # maximum length in data + safety range

In [None]:
# One-letter codes of the 20 standard amino acids, keyed by PDB residue name.
# Kept local so the translation does not depend on Bio.PDB.Polypeptide.three_to_one,
# which recent Biopython releases no longer provide (the old bare `except:` hid the
# resulting AttributeError and turned every residue into "X").
_STANDARD_3TO1 = {
    "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E", "PHE": "F",
    "GLY": "G", "HIS": "H", "ILE": "I", "LYS": "K", "LEU": "L",
    "MET": "M", "ASN": "N", "PRO": "P", "GLN": "Q", "ARG": "R",
    "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
}


def get_seq_aa(pdb_file, chain_id):
    """
    returns the sequence (String) and a list of all the aa residue objects of the given protein chain.
    residues that are not amino acids (according to is_aa) or that have no CA atom are skipped, so the
    sequence and the residue list always have the same length and order.
    residue names listed in UNIQE_AA use the letter given there, the 20 standard amino acids use their
    usual one-letter code, and any other residue name is written as "X".
    :param pdb_file: path to a pdb file
    :param chain_id: chain letter (char)
    :return: sequence, [aa objects]
    """
    # load the requested chain of the first model
    chain = PDBParser(QUIET=True).get_structure(pdb_file, pdb_file)[0][chain_id]

    aa_residues = []
    letters = []
    for residue in chain:
        resname = residue.get_resname()
        if not is_aa(resname) or not residue.has_id('CA'):
            continue
        if resname in UNIQE_AA:
            letters.append(UNIQE_AA[resname])
        else:
            # unknown / unmapped residue names fall back to "X"
            letters.append(_STANDARD_3TO1.get(resname, "X"))
        aa_residues.append(residue)
    return "".join(letters), aa_residues

In [None]:
# Google Colab only: mount Google Drive at /content/drive.
# The data_path / save_path used by the main cell point below /content/drive/MyDrive.
# When running outside Colab, skip this cell and change those paths instead.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def generate_single_input(pdb_file, chain_id, max_len):
    """
    receives a pdb file and returns the sequence of one of its chains as a one-hot encoding matrix.
    each row is a position in the sequence and each of the first len(AA_DICT) columns is one symbol of
    AA_DICT (the 20 aa, "X" and the padding symbol "-"). the remaining columns are left as zeros here.
    sequences shorter than max_len are padded with "-"; only the first max_len residues of a longer
    sequence are encoded.
    :param pdb_file: path to a pdb file
    :param chain_id: chain ID of the chain we want to generate the input for
    :param max_len: maximum possible length of input (rows)
    :return: numpy array of shape (max_len, FEATURE_NUM)
    """
    sequence, _ = get_seq_aa(pdb_file, chain_id)

    one_hot = np.zeros((max_len, FEATURE_NUM))

    # right-pad with "-" up to max_len and look up the column of every position
    padded = sequence.ljust(max_len, "-")[:max_len]
    columns = np.array([AA_DICT[symbol] for symbol in padded], dtype=int)

    # set exactly one column per row
    one_hot[np.arange(max_len), columns] = 1
    return one_hot
    

In [None]:
def generate_input(sh2_path, sh2_chain_id, pep_path, pep_chain_id):
    """
    receives 2 pdb files and returns the one-hot encoding of both sequences stacked in one matrix:
    the SH2 rows first, followed by the peptide rows.
    each row is an aa in the sequence; the first len(AA_DICT) columns are the one-hot encoding of the
    residue symbol, and the 2 columns after them tell the chains apart: the first one is 1 on every
    SH2 row and the second one is 1 on every peptide row (padding rows included).
    :param sh2_path: path to a pdb file of sh2
    :param sh2_chain_id: chain ID of the SH2 we want to generate the input for
    :param pep_path: path to a pdb file of a peptide
    :param pep_chain_id: chain ID of the peptide we want to generate the input for
    :return: numpy array of shape (SH2_MAX_LEN+PEP_MAX_LEN, FEATURE_NUM)
    """
    sh2_input = generate_single_input(sh2_path, sh2_chain_id, SH2_MAX_LEN)
    pep_input = generate_single_input(pep_path, pep_chain_id, PEP_MAX_LEN)

    # The chain-type flags sit right after the residue alphabet. Deriving the
    # indices from AA_DICT keeps them correct if the alphabet ever changes
    # (they were hard-coded as 22 and 23).
    sh2_flag_col = len(AA_DICT)
    pep_flag_col = sh2_flag_col + 1
    sh2_input[:, sh2_flag_col] = 1
    pep_input[:, pep_flag_col] = 1
    return np.concatenate([sh2_input, pep_input], axis=0)


In [None]:
def generate_single_label(pdb_file, chain_id, max_len):
    """
    receives a pdb file and returns the backbone + CB coordinates of one of its chains.
    each row is a residue (same order as the sequence returned by get_seq_aa) and holds the xyz
    coordinates of the atoms in BACKBONE_ATOMS, 3 columns per atom in that order.
    entries stay zero for atoms that are missing from the residue, for the CB of residues whose
    sequence letter is "G", and for the padding rows after the end of the chain.
    :param pdb_file: path to a pdb file
    :param chain_id: chain ID of the chain we want to generate the label for
    :param max_len: maximum possible length of the chain (rows)
    :return: numpy array of shape (max_len, OUTPUT_SIZE)
    :raises IndexError: (from numpy) when the chain has more residues than max_len
    """
    # get seq and aa residues
    seq, aa_residues = get_seq_aa(pdb_file, chain_id)

    # turn into backbone + CB xyz matrix
    xyz_matrix = np.zeros((max_len, OUTPUT_SIZE))
    for i, residue in enumerate(aa_residues):
        # collect the atom names once per residue instead of once per looked-up atom
        present_atoms = {a.id for a in residue.get_atoms()}
        for j, atom in enumerate(BACKBONE_ATOMS):
            if atom == "CB" and seq[i] == "G":
                continue  # glycine: the CB slot is always left as zeros
            if atom in present_atoms:
                xyz_matrix[i][3*j:3*j+3] = residue[atom].get_coord()

    return xyz_matrix

In [None]:
def generate_label(sh2_path, sh2_chain_id, pep_path, pep_chain_id):
    """
    receives 2 pdb files and returns the backbone + CB coordinates of both chains stacked in one
    matrix: the SH2 rows first, followed by the peptide rows.
    :param sh2_path: path to a pdb file of sh2
    :param sh2_chain_id: chain ID of the SH2 we want to generate the label for
    :param pep_path: path to a pdb file of a peptide
    :param pep_chain_id: chain ID of the peptide we want to generate the label for
    :return: numpy array of shape (SH2_MAX_LEN+PEP_MAX_LEN, OUTPUT_SIZE).
    """
    chains = (
        (sh2_path, sh2_chain_id, SH2_MAX_LEN),
        (pep_path, pep_chain_id, PEP_MAX_LEN),
    )
    # one label block per chain, built in the order listed above
    label_blocks = [generate_single_label(path, chain, length) for path, chain, length in chains]
    return np.concatenate(label_blocks, axis=0)

In [None]:
if __name__ == '__main__':

    # this script creates the inputs and labels for the SH2 + peptide neural network
    # from the data in data_path.
    #
    # data_path is a path to a directory:
    # in the directory there are several directories, representing different pdbs, in each we have:
    # pdb-name_SH2_<chain_id>.pdb
    # pdb-name_peptide_<chain_id>.pdb

    input_matrix = []
    labels_matrix = []

    all_seqs_sh2 = []
    all_seqs_pep = []

    all_pdbs = []

    # TODO: change path to your data
    data_path = "/content/drive/MyDrive/protein_hackaton_data/actual_data"

    # os.listdir returns entries in arbitrary order; sorting makes the order of the
    # saved samples (and of the matching all_*.txt lines) reproducible between runs.
    progress = tqdm(sorted(os.listdir(data_path)))  # iterate all dirs
    for dir_name in progress:
        # show the directory being processed on the progress bar instead of printing a line per dir
        progress.set_postfix_str(dir_name)
        pdb_dir = os.path.join(data_path, dir_name)

        # reset per directory so a file found in a previous directory can never be reused
        sh2_path = pep_path = None
        sh2_chain_id = pep_chain_id = None
        pdb_name = None

        for pdb in os.listdir(pdb_dir):  # iterate sh2, pep
            assert pdb.endswith(".pdb"), f"invalid file name {pdb}"
            pdb_l = pdb[:-4].split("_")  # remove .pdb, then split name / type / chain
            assert len(pdb_l) == 3, f"invalid file name {pdb}"
            pdb_name, protein_type, chain_id = pdb_l
            if protein_type not in ("SH2", "peptide"):
                raise Exception(f"invalid protein type {protein_type}")

            pdb_path = os.path.join(pdb_dir, pdb)
            if os.path.getsize(pdb_path) == 0:
                raise Exception(f'File {pdb_path} is empty')

            if protein_type == "SH2":
                sh2_path, sh2_chain_id = pdb_path, chain_id
            else:
                pep_path, pep_chain_id = pdb_path, chain_id

        if sh2_path is None or pep_path is None:
            raise Exception("sh2 or pep are missing!")

        sample_input = generate_input(sh2_path, sh2_chain_id, pep_path, pep_chain_id)
        input_matrix.append(sample_input)

        sample_label = generate_label(sh2_path, sh2_chain_id, pep_path, pep_chain_id)
        labels_matrix.append(sample_label)

        seq_sh2, _ = get_seq_aa(sh2_path, sh2_chain_id)
        seq_pep, _ = get_seq_aa(pep_path, pep_chain_id)

        all_seqs_sh2.append(seq_sh2)
        all_seqs_pep.append(seq_pep)

        all_pdbs.append(pdb_name)

    # TODO: change path to where you want to save the data you need for the network train and validation
    save_path = "/content/drive/MyDrive/protein_hackaton_data/data_to_network/"
    os.makedirs(save_path, exist_ok=True)  # np.save / open do not create missing directories

    assert len(all_seqs_sh2) == len(input_matrix)

    np.save(os.path.join(save_path, "train_input.npy"), np.array(input_matrix))
    np.save(os.path.join(save_path, "train_labels.npy"), np.array(labels_matrix))

    # one line per sample, in the same order as the rows of the saved arrays
    text_outputs = (
        ("all_seqs_sh2.txt", all_seqs_sh2),
        ("all_seqs_pep.txt", all_seqs_pep),
        ("all_pdbs.txt", all_pdbs),
    )
    for file_name, lines in text_outputs:
        with open(os.path.join(save_path, file_name), "w") as f:
            f.writelines(f"{line}\n" for line in lines)

    print(f"Number of samples: {len(input_matrix)}")


  0%|          | 0/91 [00:00<?, ?it/s]

now scanning dir:  pdb6roz



100%|██████████| 2/2 [00:00<00:00, 1886.78it/s]


now scanning dir:  pdb6roy



100%|██████████| 2/2 [00:00<00:00, 2883.67it/s]
  2%|▏         | 2/91 [00:00<00:08, 10.85it/s]

now scanning dir:  pdb6pxc



100%|██████████| 2/2 [00:00<00:00, 2971.52it/s]


now scanning dir:  pdb6kc4



100%|██████████| 2/2 [00:00<00:00, 1883.39it/s]
  4%|▍         | 4/91 [00:00<00:08, 10.55it/s]

now scanning dir:  pdb6icg



100%|██████████| 2/2 [00:00<00:00, 2502.57it/s]


now scanning dir:  pdb5x94



100%|██████████| 2/2 [00:00<00:00, 3362.17it/s]
  7%|▋         | 6/91 [00:00<00:07, 10.87it/s]

now scanning dir:  pdb5x7b



100%|██████████| 2/2 [00:00<00:00, 1943.61it/s]


now scanning dir:  pdb5u1q



100%|██████████| 2/2 [00:00<00:00, 1872.88it/s]
  9%|▉         | 8/91 [00:00<00:07, 10.77it/s]

now scanning dir:  pdb5u06



100%|██████████| 2/2 [00:00<00:00, 1297.74it/s]


now scanning dir:  pdb5tyi



100%|██████████| 2/2 [00:00<00:00, 2923.88it/s]
 11%|█         | 10/91 [00:01<00:09,  8.89it/s]

now scanning dir:  pdb6way



100%|██████████| 2/2 [00:00<00:00, 568.72it/s]


now scanning dir:  pdb5tqs



100%|██████████| 2/2 [00:00<00:00, 1975.18it/s]
 13%|█▎        | 12/91 [00:01<00:08,  8.79it/s]

now scanning dir:  pdb5tq1



100%|██████████| 2/2 [00:00<00:00, 2637.93it/s]
 14%|█▍        | 13/91 [00:01<00:09,  8.47it/s]

now scanning dir:  pdb5mtj



100%|██████████| 2/2 [00:00<00:00, 2641.25it/s]


now scanning dir:  pdb5gji



100%|██████████| 2/2 [00:00<00:00, 3310.42it/s]
 16%|█▋        | 15/91 [00:01<00:09,  7.70it/s]

now scanning dir:  pdb5gjh



100%|██████████| 2/2 [00:00<00:00, 1364.45it/s]


now scanning dir:  pdb5eg3



100%|██████████| 2/2 [00:00<00:00, 1033.21it/s]
 19%|█▊        | 17/91 [00:01<00:08,  8.67it/s]

now scanning dir:  pdb5eeq



100%|██████████| 2/2 [00:00<00:00, 1737.85it/s]
 20%|█▉        | 18/91 [00:01<00:08,  8.89it/s]

now scanning dir:  pdb5eel



100%|██████████| 2/2 [00:00<00:00, 2302.03it/s]


now scanning dir:  pdb5df6



100%|██████████| 2/2 [00:00<00:00, 402.89it/s]
 22%|██▏       | 20/91 [00:02<00:07,  9.61it/s]

now scanning dir:  pdb5aul



100%|██████████| 2/2 [00:00<00:00, 2893.62it/s]
 23%|██▎       | 21/91 [00:02<00:07,  9.60it/s]

now scanning dir:  pdb4xz1



100%|██████████| 2/2 [00:00<00:00, 2869.86it/s]
 24%|██▍       | 22/91 [00:02<00:07,  8.99it/s]

now scanning dir:  pdb4x6s



100%|██████████| 2/2 [00:00<00:00, 639.18it/s]
 25%|██▌       | 23/91 [00:02<00:09,  7.49it/s]

now scanning dir:  pdb4u1p



100%|██████████| 2/2 [00:00<00:00, 1817.29it/s]


now scanning dir:  pdb4qsy



100%|██████████| 2/2 [00:00<00:00, 2815.91it/s]
 27%|██▋       | 25/91 [00:02<00:07,  8.84it/s]

now scanning dir:  pdb4l1b



100%|██████████| 2/2 [00:00<00:00, 3090.87it/s]
 29%|██▊       | 26/91 [00:02<00:07,  9.07it/s]

now scanning dir:  pdb4k45



100%|██████████| 2/2 [00:00<00:00, 2440.68it/s]


now scanning dir:  pdb4je4



100%|██████████| 2/2 [00:00<00:00, 718.02it/s]
 31%|███       | 28/91 [00:03<00:06,  9.61it/s]

now scanning dir:  pdb4gl9



100%|██████████| 2/2 [00:00<00:00, 1893.16it/s]


now scanning dir:  pdb3wa4



100%|██████████| 2/2 [00:00<00:00, 3138.27it/s]
 33%|███▎      | 30/91 [00:03<00:05, 10.19it/s]

now scanning dir:  pdb3tl0



100%|██████████| 2/2 [00:00<00:00, 1539.19it/s]


now scanning dir:  pdb3pqz



100%|██████████| 2/2 [00:00<00:00, 2059.06it/s]
 35%|███▌      | 32/91 [00:03<00:06,  8.72it/s]

now scanning dir:  pdb3mxy



100%|██████████| 2/2 [00:00<00:00, 1428.58it/s]


now scanning dir:  pdb3mxc



100%|██████████| 2/2 [00:00<00:00, 3014.23it/s]
 37%|███▋      | 34/91 [00:03<00:06,  9.45it/s]

now scanning dir:  pdb3maz



100%|██████████| 2/2 [00:00<00:00, 589.79it/s]


now scanning dir:  pdb3k2m



100%|██████████| 2/2 [00:00<00:00, 1674.71it/s]
 40%|███▉      | 36/91 [00:03<00:05,  9.89it/s]

now scanning dir:  pdb3hiz



100%|██████████| 2/2 [00:00<00:00, 658.34it/s]


now scanning dir:  pdb3hhm



100%|██████████| 2/2 [00:00<00:00, 1108.58it/s]
 42%|████▏     | 38/91 [00:04<00:05,  9.90it/s]

now scanning dir:  pdb2vif



100%|██████████| 2/2 [00:00<00:00, 1422.04it/s]


now scanning dir:  pdb2oq1



100%|██████████| 2/2 [00:00<00:00, 1119.82it/s]
 44%|████▍     | 40/91 [00:04<00:05, 10.02it/s]

now scanning dir:  pdb2iui



100%|██████████| 2/2 [00:00<00:00, 2742.27it/s]


now scanning dir:  pdb2hmh



100%|██████████| 2/2 [00:00<00:00, 2852.30it/s]
 46%|████▌     | 42/91 [00:04<00:04, 10.29it/s]

now scanning dir:  pdb2hdx



100%|██████████| 2/2 [00:00<00:00, 1933.75it/s]


now scanning dir:  pdb2cia



100%|██████████| 2/2 [00:00<00:00, 2810.25it/s]
 48%|████▊     | 44/91 [00:04<00:05,  8.89it/s]

now scanning dir:  pdb2ci9



100%|██████████| 2/2 [00:00<00:00, 2825.40it/s]


now scanning dir:  pdb1zfp



100%|██████████| 2/2 [00:00<00:00, 2918.79it/s]
 51%|█████     | 46/91 [00:04<00:04,  9.46it/s]

now scanning dir:  pdb1x27



100%|██████████| 2/2 [00:00<00:00, 3062.65it/s]


now scanning dir:  pdb1tze



100%|██████████| 2/2 [00:00<00:00, 688.10it/s]
 53%|█████▎    | 48/91 [00:05<00:04,  9.91it/s]

now scanning dir:  pdb1sps



100%|██████████| 2/2 [00:00<00:00, 3077.26it/s]


now scanning dir:  pdb1shb



100%|██████████| 2/2 [00:00<00:00, 2928.98it/s]
 55%|█████▍    | 50/91 [00:05<00:04,  9.94it/s]

now scanning dir:  pdb1r1q



100%|██████████| 2/2 [00:00<00:00, 449.69it/s]


now scanning dir:  pdb1p13



100%|██████████| 2/2 [00:00<00:00, 1258.42it/s]
 57%|█████▋    | 52/91 [00:05<00:04,  8.63it/s]

now scanning dir:  pdb1nzv



100%|██████████| 2/2 [00:00<00:00, 2793.41it/s]


now scanning dir:  pdb1nzl



100%|██████████| 2/2 [00:00<00:00, 1369.35it/s]
 59%|█████▉    | 54/91 [00:05<00:04,  9.07it/s]

now scanning dir:  pdb1m27



100%|██████████| 2/2 [00:00<00:00, 1585.15it/s]


now scanning dir:  pdb1lkl



100%|██████████| 2/2 [00:00<00:00, 2906.66it/s]
 62%|██████▏   | 56/91 [00:05<00:03,  9.66it/s]

now scanning dir:  pdb1lkk



100%|██████████| 2/2 [00:00<00:00, 3288.36it/s]


now scanning dir:  pdb1lck



100%|██████████| 2/2 [00:00<00:00, 2770.35it/s]
 64%|██████▎   | 58/91 [00:06<00:03,  9.21it/s]

now scanning dir:  pdb1lcj



100%|██████████| 2/2 [00:00<00:00, 3116.12it/s]
 65%|██████▍   | 59/91 [00:06<00:04,  7.93it/s]

now scanning dir:  pdb1kc2



100%|██████████| 2/2 [00:00<00:00, 1410.80it/s]


now scanning dir:  pdb1jyr



100%|██████████| 2/2 [00:00<00:00, 1491.84it/s]
 67%|██████▋   | 61/91 [00:06<00:03,  8.77it/s]

now scanning dir:  pdb1is0



100%|██████████| 2/2 [00:00<00:00, 1368.67it/s]


now scanning dir:  pdb1i3z



100%|██████████| 2/2 [00:00<00:00, 2958.94it/s]
 69%|██████▉   | 63/91 [00:06<00:02,  9.51it/s]

now scanning dir:  pdb1f1w



100%|██████████| 2/2 [00:00<00:00, 837.19it/s]


now scanning dir:  pdb1d4w



100%|██████████| 2/2 [00:00<00:00, 301.10it/s]
 71%|███████▏  | 65/91 [00:06<00:02,  9.96it/s]

now scanning dir:  pdb1d4t



100%|██████████| 2/2 [00:00<00:00, 2870.84it/s]


now scanning dir:  pdb1cwe



100%|██████████| 2/2 [00:00<00:00, 3364.86it/s]
 74%|███████▎  | 67/91 [00:07<00:02,  9.90it/s]

now scanning dir:  pdb1cwd



100%|██████████| 2/2 [00:00<00:00, 3276.80it/s]


now scanning dir:  pdb1bmb



100%|██████████| 2/2 [00:00<00:00, 3013.15it/s]
 76%|███████▌  | 69/91 [00:07<00:02,  8.78it/s]

now scanning dir:  pdb1bhf



100%|██████████| 2/2 [00:00<00:00, 1158.97it/s]


now scanning dir:  pdb1ayb



100%|██████████| 2/2 [00:00<00:00, 2154.79it/s]
 78%|███████▊  | 71/91 [00:07<00:02,  9.31it/s]

now scanning dir:  pdb1aya



100%|██████████| 2/2 [00:00<00:00, 2864.96it/s]


now scanning dir:  pdb1a81



100%|██████████| 2/2 [00:00<00:00, 952.82it/s]
 80%|████████  | 73/91 [00:07<00:01,  9.88it/s]

now scanning dir:  pdb6r5g



100%|██████████| 2/2 [00:00<00:00, 1504.68it/s]


now scanning dir:  pdb2yu7



100%|██████████| 2/2 [00:00<00:00, 3159.55it/s]
 82%|████████▏ | 75/91 [00:08<00:02,  7.89it/s]

now scanning dir:  pdb2rsy



100%|██████████| 2/2 [00:00<00:00, 2769.43it/s]
 84%|████████▎ | 76/91 [00:08<00:01,  7.87it/s]

now scanning dir:  pdb2rmx



100%|██████████| 2/2 [00:00<00:00, 2213.94it/s]
 85%|████████▍ | 77/91 [00:08<00:01,  7.71it/s]

now scanning dir:  pdb2ple



100%|██████████| 2/2 [00:00<00:00, 3044.87it/s]
 86%|████████▌ | 78/91 [00:08<00:01,  7.64it/s]

now scanning dir:  pdb2pld



100%|██████████| 2/2 [00:00<00:00, 1930.19it/s]
 87%|████████▋ | 79/91 [00:08<00:01,  6.29it/s]

now scanning dir:  pdb2mrk



100%|██████████| 2/2 [00:00<00:00, 1277.39it/s]
 88%|████████▊ | 80/91 [00:08<00:01,  6.70it/s]

now scanning dir:  pdb2lnw



100%|██████████| 2/2 [00:00<00:00, 700.98it/s]
 89%|████████▉ | 81/91 [00:09<00:01,  6.82it/s]

now scanning dir:  pdb2lct



100%|██████████| 2/2 [00:00<00:00, 1193.94it/s]
 90%|█████████ | 82/91 [00:09<00:01,  5.71it/s]

now scanning dir:  pdb2k7a



100%|██████████| 2/2 [00:00<00:00, 488.96it/s]
 91%|█████████ | 83/91 [00:09<00:01,  6.03it/s]

now scanning dir:  pdb2k79



100%|██████████| 2/2 [00:00<00:00, 3022.92it/s]
 92%|█████████▏| 84/91 [00:09<00:01,  6.34it/s]

now scanning dir:  pdb2eu0



100%|██████████| 2/2 [00:00<00:00, 3575.71it/s]
 93%|█████████▎| 85/91 [00:09<00:00,  6.71it/s]

now scanning dir:  pdb1qg1



100%|██████████| 2/2 [00:00<00:00, 725.16it/s]
 95%|█████████▍| 86/91 [00:10<00:00,  5.57it/s]

now scanning dir:  pdb1pic



100%|██████████| 2/2 [00:00<00:00, 2855.21it/s]
 96%|█████████▌| 87/91 [00:10<00:00,  6.03it/s]

now scanning dir:  pdb1ka7



100%|██████████| 2/2 [00:00<00:00, 3401.71it/s]
 97%|█████████▋| 88/91 [00:10<00:00,  6.32it/s]

now scanning dir:  pdb1csz



100%|██████████| 2/2 [00:00<00:00, 506.25it/s]
 98%|█████████▊| 89/91 [00:10<00:00,  5.47it/s]

now scanning dir:  pdb1csy



100%|██████████| 2/2 [00:00<00:00, 3338.09it/s]
 99%|█████████▉| 90/91 [00:10<00:00,  5.89it/s]

now scanning dir:  pdb1aot



100%|██████████| 2/2 [00:00<00:00, 1673.04it/s]
100%|██████████| 91/91 [00:10<00:00,  8.42it/s]

Number of samples: 91



