In this notebook we will recreate the encodings since I realised I had not set the model in evaluation form, hence dropout was still active.

In [1]:
import torch
import sys
sys.path.insert(1, "/home/luchinoprince/Dropbox/Old_OneDrive/Phd/Second_year/research/Feinauer/esm/")
import esm
import esm.pretrained as pretrained
import time
import biotite.structure
from biotite.structure.io import pdbx, pdb
from biotite.structure.residues import get_residues
from biotite.structure import filter_backbone
from biotite.structure import get_chains
from biotite.sequence import ProteinSequence
from typing import Sequence, Tuple, List
import numpy as np
sys.path.insert(1, './../util/')
from ioutils import read_fasta, read_encodings


import os
import warnings


In [4]:
val = torch.load('/Data/InverseFoldingData/msas/train/2ggoA01_train.a3m.pt')

In [5]:
val.shape

torch.Size([10026, 210])

In [2]:
def load_structure(fpath, chain=None):
    """
    Args:
        fpath: filepath to either pdb or cif file
        chain: the chain id or list of chain ids to load
    Returns:
        biotite.structure.AtomArray
    """
    with open(fpath) as fin:
        pdbf = pdb.PDBFile.read(fin)
    structure = pdb.get_structure(pdbf, model=1)
    bbmask = filter_backbone(structure)
    structure = structure[bbmask]
    all_chains = get_chains(structure)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain is None:
        chain_ids = all_chains
    elif isinstance(chain, list):
        chain_ids = chain
    else:
        chain_ids = [chain] 
    for chain in chain_ids:
        if chain not in all_chains:
            raise ValueError(f'Chain {chain} not found in input file')
    chain_filter = [a.chain_id in chain_ids for a in structure]
    structure = structure[chain_filter]
    return structure

def extract_coords_from_structure(structure: biotite.structure.AtomArray):
    """
    Args:
        structure: An instance of biotite AtomArray
    Returns:
        Tuple (coords, seq)
            - coords is an L x 3 x 3 array for N, CA, C coordinates
            - seq is the extracted sequence
    """
    coords = get_atom_coords_residuewise(["N", "CA", "C"], structure)
    residue_identities = get_residues(structure)[1]
    seq = ''.join([ProteinSequence.convert_letter_3to1(r) for r in residue_identities])
    return coords, seq

def get_atom_coords_residuewise(atoms: List[str], struct: biotite.structure.AtomArray):
    """
    Example for atoms argument: ["N", "CA", "C"]
    """
    def filterfn(s, axis=None):
        filters = np.stack([s.atom_name == name for name in atoms], axis=1)
        sum = filters.sum(0)
        if not np.all(sum <= np.ones(filters.shape[1])):
            raise RuntimeError("structure has multiple atoms with same name")
        index = filters.argmax(0)
        coords = s[index].coord
        coords[sum == 0] = float("nan")
        return coords

    return biotite.structure.apply_residue_wise(struct, struct, filterfn)

In [3]:
import numpy as np

In [33]:
os.listdir('/Data/InverseFoldingData/msas/train')[1000]


'1x6hA00_train.a3m'

In [40]:
#val = torch.load('/Data/InverseFoldingData/msas/train/4ymhD00_train.a3m.pt')
structure_dir = '/home/luchinoprince/dompdb/'
##pdb_name = '7mdhA02'
pdb_name = '1bo1A02'
pdb_path = os.path.join(structure_dir, pdb_name)

structure =  load_structure(pdb_path)
coords, native_seq = extract_coords_from_structure(structure)



In [41]:
coords.shape

(159, 3, 3)

In [35]:
val

tensor([[20, 20, 20,  ..., 20, 20, 20],
        [20, 20, 20,  ..., 20, 20, 20],
        [20, 20, 20,  ..., 20, 20, 20],
        ...,
        [20, 20, 20,  ..., 20, 20, 20],
        [20, 20, 20,  ..., 20, 20, 20],
        [20, 20, 20,  ...,  8,  8, 20]], dtype=torch.uint8)

In [28]:
coords.shape

(234, 3, 3)

In [43]:
msa_dir = "/media/luchinoprince/b1715ef3-045d-4bdf-b216-c211472fb5a2/NewData/msas/"
failures=0
iterator = 0
failed = []
failed_id = []
for msa_file in os.listdir(msa_dir):
    if msa_file.endswith('a3m'):
        iterator+=1
        pdb_id = msa_file[0:7]
        msa,q = read_fasta(os.path.join(msa_dir, msa_file))
        pdb_path = os.path.join(structure_dir, pdb_id)
        structure =  load_structure(pdb_path)
        coords, native_seq = extract_coords_from_structure(structure)
        N1 = msa.shape[1]
        N2 = coords.shape[0]
        if N1!=N2:
            failed.append((N1, N2))
            failures+=1
            failed_id.append(pdb_id)
        





RuntimeError: structure has multiple atoms with same name

In [45]:
failures

7488

In [46]:
iterator

12068

In [38]:
check = "VFSHRLTVHRKYDLKGSTVAREASDKEKAKDLPTFKDNDFLNEGQKLHVGEESKKNFLEKLKRDVEFLAQLKIMDYSLLVGIHDVDRAEQEEMEVEERAEDEEFDPSVDVYAMKSHESSPKKEVYFMAIIDILTPYDTKKKAAHAAKTVKHGAGAEISTVNPEQYSKRFNEFMSNIL"

In [39]:
len(check)

177

In [30]:
model, alphabet = pretrained.esm_if1_gvp4_t16_142M_UR50() 
model.eval()
rep = esm.inverse_folding.util.get_encoder_output(model, alphabet, coords)

In [31]:
rep.shape

torch.Size([234, 512])

In [19]:
rep_saved = torch.load('/Data/InverseFoldingData/structure_encodings/7mdhA02.encodings.pt')

In [22]:
rep_saved

{'encodings': array([[ 0.7480723 ,  0.499571  ,  0.03796096, ...,  0.09580989,
         -0.12788884, -0.65789604],
        [ 0.7037929 ,  0.11167169, -0.25585902, ...,  0.19436896,
          0.33624163, -1.0783957 ],
        [-1.2896324 , -0.9073951 , -1.4845986 , ..., -0.24444567,
          0.7893612 ,  0.25480703],
        ...,
        [-1.105352  , -0.28117   , -0.8511495 , ...,  0.6657607 ,
          0.10624966,  0.553963  ],
        [-0.64596516, -0.3589821 , -0.49763972, ...,  0.25180244,
         -0.08886597, -0.10089812],
        [ 0.28333354, -0.00579392,  0.29894474, ..., -0.01227771,
         -0.11104949, -0.03438382]], dtype=float32),
 'seq': 'LTRLDENRAKCQLALKAGVFYDKVSNVTIWGNHSTTQVPDFLNAKIDGRPVKEVIKRTKWLEEEFTITVQKRGGALIQKWGRSSAASTAVSIADAIKSLVTPTPEGDWFSTGVYTTGNPYGIAEDIVFSMPCRSKGDGDYELATDVSNDDFLWERIKKSEAELLAEKKCVAHL'}

In [21]:
native_seq

'LTRLDENRAKCQLALKAGVFYDKVSNVTIWGNHSTTQVPDFLNAKIDGRPVKEVIKRTKWLEEEFTITVQKRGGALIQKWGRSSAASTAVSIADAIKSLVTPTPEGDWFSTGVYTTGNPYGIAEDIVFSMPCRSKGDGDYELATDVSNDDFLWERIKKSEAELLAEKKCVAHL'

Let us recreate the encodings again now.

In [8]:
structure_dir = '/Data/christoph/bocconi/dompdb'
encodings_dir = '/Data/InverseFoldingData/structure_encodings'
model, alphabet = pretrained.esm_if1_gvp4_t16_142M_UR50() 
model.eval()
counter = 0
counter_fail = 0
n_files = len(os.listdir(structure_dir))
failed = []
counter_bk = 0 ##where the code previously stopped
device = 3
model.to(device)
warnings.filterwarnings('ignore')



for pdb_name in (os.listdir(structure_dir)):
    print(f"We are at iteration {counter} out of {n_files-counter_bk}, we failed:{counter_fail}", end="\r")
    try:
        aux = {}
        pdb_path = os.path.join(structure_dir, pdb_name)
        structure =  load_structure(pdb_path)
        coords, native_seq = extract_coords_from_structure(structure)
        coords = torch.from_numpy(coords)
        coords = coords.to(device)
        rep = esm.inverse_folding.util.get_encoder_output(model, alphabet, coords, device=device)
        rep = rep.to('cpu')
        aux['encodings'] = rep
        aux['seq'] = native_seq
        encoding_name = pdb_name + ".encodings.pt"
        encoding_path = os.path.join(encodings_dir, encoding_name)
        torch.save(aux, encoding_path)
        counter+=1
    except Exception:
        counter+=1
        counter_fail+=1
        failed.append(pdb_path)
        pass





We are at iteration 31884 out of 31885, we failed:1

In [2]:
torch.cuda.empty_cache()

In [9]:
len(failed)

1

In [61]:
#os.listdir('/Data/christoph/bocconi/dompdb')

Let us try to see if when we load we have to trim something

In [25]:
structure_dir = '/Data/christoph/bocconi/dompdb'
encodings_dir = '/Data/InverseFoldingData/structure_encodings'
msa_folder = '/Data/InverseFoldingData/msas/train/'
msa_file = os.listdir(msa_folder)[5]
msa_path = os.path.join(msa_folder, msa_file)
encoding_file = msa_file[0:7] + '.encodings.pt'
encoding_path = os.path.join(encodings_dir, encoding_file)

structure_dir = '/Data/christoph/bocconi/dompdb'
structure_file = msa_file[0:7]
##pdb_path=os.path.join(structure_dir, structure_file)
##structure=load_structure(pdb_path)
##coords, native_seq = extract_coords_from_structure(structure)

In [31]:
pdb_path = os.path.join(structure_dir, structure_file)
structure =  load_structure(pdb_path)
coords, native_seq = extract_coords_from_structure(structure)
coords = torch.from_numpy(coords)
coords = coords.to(device)
rep = esm.inverse_folding.util.get_encoder_output(model, alphabet, coords, device=device)

In [32]:
rep.shape

torch.Size([83, 512])

In [33]:
coords.shape

torch.Size([83, 3, 3])

In [35]:
len(native_seq)

83

In [39]:
msa = torch.load(msa_path)
encodings = read_encodings(encoding_path, trim=False)

In [40]:
encodings.shape

torch.Size([83, 512])

In [41]:
msa.shape

torch.Size([4429, 83])

In [6]:
device=3
structure_dir = '/Data/christoph/bocconi/dompdb'
encodings_dir = '/Data/InverseFoldingData/structure_encodings'
model, alphabet = pretrained.esm_if1_gvp4_t16_142M_UR50() 
model.eval()
model.to(device)


pdb_name = os.listdir(structure_dir)[0]
os.listdir(structure_dir)[0]
aux = {}
pdb_path = os.path.join(structure_dir, pdb_name)
structure =  load_structure(pdb_path)
coords, native_seq = extract_coords_from_structure(structure)
coords = torch.from_numpy(coords)
coords = coords.to(device)
rep = esm.inverse_folding.util.get_encoder_output(model, alphabet, coords, device=device)
aux['encodings'] = rep
aux['seq'] = native_seq

  warn("{} elements were guessed from atom_name.".format(rep_num))
  F.pad(torch.tensor(cd), (0, 0, 0, 0, 1, 1), value=np.inf)


In [7]:
encodings

NameError: name 'encodings' is not defined

In [69]:
?esm.inverse_folding.util.get_encoder_output

[0;31mSignature:[0m [0mesm[0m[0;34m.[0m[0minverse_folding[0m[0;34m.[0m[0mutil[0m[0;34m.[0m[0mget_encoder_output[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0malphabet[0m[0;34m,[0m [0mcoords[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      /Data/silva/esm/esm/inverse_folding/util.py
[0;31mType:[0m      function


In [62]:
next(model.parameters()).is_cuda

True

In [63]:
coords.is_cuda

True

In [46]:
torch.from_numpy(coords).

tensor([[[14.5220, 23.2910,  8.7280],
         [15.5340, 22.3840,  9.2780],
         [16.1090, 22.9010, 10.6110]],

        [[16.1930, 24.2280, 10.7850],
         [16.7900, 24.8280, 11.9850],
         [15.8980, 24.6880, 13.1810]],

        [[16.4360, 25.0090, 14.3490],
         [15.6590, 25.0660, 15.5850],
         [15.7710, 26.4170, 16.2410]],

        [[14.7150, 26.8080, 16.9510],
         [14.7760, 27.9930, 17.7720],
         [15.7430, 27.7220, 18.9230]],

        [[16.4860, 28.7600, 19.3070],
         [17.3860, 28.6380, 20.4090],
         [16.5950, 28.5090, 21.7120]],

        [[16.8580, 27.4680, 22.5180],
         [16.0730, 27.3280, 23.7700],
         [16.5170, 28.2590, 24.8550]],

        [[17.6630, 28.8730, 24.6630],
         [18.1450, 29.7720, 25.6580],
         [17.7490, 31.2190, 25.4110]],

        [[18.0340, 31.7290, 24.2140],
         [17.9610, 33.1530, 23.9690],
         [16.6300, 33.7630, 24.4390]],

        [[15.4540, 33.2460, 23.9890],
         [14.1920, 33.9140, 24.389

In [15]:
pdb_name = os.listdir(structure_dir)[15571]

In [10]:
len(os.listdir(structure_dir)[counter_bk:])

16315

In [16]:
pdb_path = os.path.join(structure_dir, pdb_name)
structure =  load_structure(pdb_path)
coords, native_seq = extract_coords_from_structure(structure)
rep = esm.inverse_folding.util.get_encoder_output(model, alphabet, coords)

NameError: name 'load_structure' is not defined