# Introduction

This notebook is an adaptation of the ESMFold notebook from [ColabFold](https://github.com/sokrypton/ColabFold#making-protein-folding-accessible-to-all-via-google-colab).

It takes a dataframe with two columns:
1. protein_name: the identifier of your proteins
2. seq: the amino acid sequence

If your dataset is not in this format, you have to convert it before running the notebook.

You have to create a directory in the root of your Google Drive and enter its name in the `folder_id` variable. This notebook will access this directory to save each predicted protein in PDB format.



In [1]:
%%time
#@title Set the environment and connect to google drive
# @markdown It will ask for access permissions

# import libs 
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from google.colab import auth
from oauth2client.client import GoogleCredentials
from tqdm import tqdm
import pandas as pd
import plotly.express as px
from google.colab import files

# google drive login
# Authenticate the Colab user first; this is the step that asks for access permissions.
auth.authenticate_user()
# Hand the application-default credentials obtained above to PyDrive.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client that later cells use to list folders and upload the PDB files.
drive = GoogleDrive(gauth)


CPU times: user 1.08 s, sys: 125 ms, total: 1.2 s
Wall time: 21.6 s


In [2]:
%%time
#@title Set directory in google drive to save the PDBs
#@markdown You have to create it in your google drive session

# get the name of the dir
folder_id = 'name_of_your_directory' #@param {type:"string"}

# Resolve the value entered above to a Drive folder id. Until a match is found
# `folder_id` still holds what the user typed (normally the folder *name*).
folder_name = folder_id
file_list = drive.ListFile({'q': "mimeType='application/vnd.google-apps.folder' and trashed=false"}).GetList()
for folder in file_list:
    # Match on the title (the documented usage) or on the id itself, so a
    # value that already is a Drive folder id keeps working.
    if folder_name in (folder['title'], folder['id']):
        folder_id = folder['id']
        break
else:
    # Without this check the folder name would be used as a parent id and every
    # upload in the prediction cell would fail only after a fold was computed.
    raise ValueError(
        f"No Google Drive folder named {folder_name!r} was found. "
        "Create it in your Drive (or fix the name) and run this cell again."
    )

CPU times: user 27 ms, sys: 3.02 ms, total: 30 ms
Wall time: 1.97 s


In [3]:
%%time
#@title install
#@markdown install ESMFold, OpenFold and download Params (~2min 30s)

import os, time
def _run_checked(cmd):
  """Run `cmd` with os.system and raise RuntimeError if its exit status is non-zero."""
  status = os.system(cmd)
  if status != 0:
    raise RuntimeError(f"command failed with status {status}: {cmd}")

if not os.path.isfile("esmfold.model"):
  # download esmfold params; the trailing "&" runs the download in the
  # background so it overlaps with the package installation below
  _run_checked("apt-get install aria2 -qq")
  os.system("aria2c -q -x 16 https://colabfold.steineggerlab.workers.dev/esm/esmfold.model &")

  # install libs (a failed install now stops here instead of surfacing later
  # as an ImportError in another cell)
  _run_checked("pip install -q omegaconf pytorch_lightning biopython ml_collections einops py3Dmol")
  _run_checked("pip install -q git+https://github.com/NVIDIA/dllogger.git")

  # install openfold, pinned to a fixed commit
  commit = "6908936b68ae89f67755240e2f588c09ec31d4c8"
  _run_checked(f"pip install -q git+https://github.com/aqlaboratory/openfold.git@{commit}")

  # install esmfold
  _run_checked("pip install -q git+https://github.com/sokrypton/esm.git")

  # wait for Params to finish downloading...
  if not os.path.isfile("esmfold.model"):
    # backup source! (the background download produced no file at all)
    _run_checked("aria2c -q -x 16 https://files.ipd.uw.edu/pub/esmfold/esmfold.model")
  else:
    # presumably "esmfold.model.aria2" is aria2c's control file, present while
    # the background download is still in progress -- TODO confirm
    while os.path.isfile("esmfold.model.aria2"):
      time.sleep(5)

  # make a failed download visible here rather than at model-loading time
  if not os.path.isfile("esmfold.model"):
    raise RuntimeError("esmfold.model could not be downloaded from either source")

CPU times: user 764 ms, sys: 120 ms, total: 884 ms
Wall time: 3min 32s


In [4]:
%%time
#@title Import libs
from tqdm import tqdm
import plotly.express as px
import pandas as pd

from string import ascii_uppercase, ascii_lowercase
import hashlib, re, os
import numpy as np
from jax.tree_util import tree_map
import matplotlib.pyplot as plt
from scipy.special import softmax

def parse_output(output):
  """Reduce a (numpy) ESMFold output dict to four per-residue arrays.

  Only the first batch element is used, and only residues whose
  `atom37_atom_exists` entry at atom index 1 equals 1 are kept
  (presumably the C-alpha slot -- TODO confirm).

  Returns a dict with the keys:
    "pae":         mean over the last axis of `aligned_confidence_probs`
                   weighted by bin index 0..63, multiplied by 31
    "plddt":       `plddt` at atom index 1
    "sm_contacts": softmax of `distogram_logits`, summed over the bins
                   whose edge is below 8
    "xyz":         `positions` of the last entry along the first axis,
                   at atom index 1
  """
  # residues to keep
  keep = output["atom37_atom_exists"][0,:,1] == 1

  # pairwise error estimate
  weighted_probs = output["aligned_confidence_probs"][0] * np.arange(64)
  pae = weighted_probs.mean(-1) * 31

  # contact probability: mass of the distogram below the 8 cut-off
  bin_edges = np.append(0,np.linspace(2.3125,21.6875,63))
  distogram = softmax(output["distogram_logits"],-1)[0]
  contacts = distogram[...,bin_edges<8].sum(-1)

  per_residue_plddt = output["plddt"][0,:,1]
  coords = output["positions"][-1,0,:,1]

  return {"pae":pae[keep,:][:,keep],
          "plddt":per_residue_plddt[keep],
          "sm_contacts":contacts[keep,:][:,keep],
          "xyz":coords[keep]}

def get_hash(x):
  """Return the hexadecimal SHA-1 digest of the string `x`."""
  digest = hashlib.sha1(x.encode())
  return digest.hexdigest()

# upper-case letters followed by lower-case letters, one character per element
alphabet_list = [*ascii_uppercase, *ascii_lowercase]

CPU times: user 587 ms, sys: 86.6 ms, total: 674 ms
Wall time: 678 ms


In [6]:
# load your data
df = pd.read_csv("part_1.csv")

# The prediction cell reads exactly these two columns; fail here with a
# readable message instead of a KeyError in the middle of a long run.
required_columns = ["protein_name", "seq"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
  raise ValueError(f"part_1.csv is missing required column(s): {missing_columns}")

# preview the first three proteins: identifier, then amino acid sequence
for idx, row in df.head(3).iterrows():
  protein_name = row["protein_name"]
  print(protein_name)
  seq = row["seq"]
  print(seq)

Class_A_AFAM2492_1_seq_2559
MRARLHALERRHGARLGVYAANVRTGATVSHRAGERFALCSTFKPLAAAAILRDRDHHGEFLARTIHYTEHDLVEYSPVTEEHVGTGMRVDALCAAAIRYSDNTAGNLLLRQIGGPAGLTAFCRSLGDPVTRLDRWETDLNSALPGDPRDTTTPAAIGRDYARLVLGDALGREDNALLTEWLLGNTTSGDRFRAGLPDGWRIGDKTGSGDYGTANDVGLAWTTLGTPLVLAVLSTKHAPEADWDDELIAETARLLAHDLAPGE
Class_A_AFAM2493_1_seq_2560
MRALEERYSARLGVHARNTRTGQSVGYRAGERFALCSTFKVFAAGAVLRDHAGSAPLDKVVRYPDRDILLNSPVTQQHVGSGMTVGELCAAAIRHSDNCAGNLLLRELGGPAGLTAFFRSLGDRVSRLDRWEPDLNSAGPGELRDSTTPEALGASLERLTVGDELSGAAREQLLTWLKGNTTSDRRFRAGLPRGWVVGDKTGTGDYASANDIGVAWTTRGTPLVLVVLTSKDAPDATVDEALIADAAAVLADTLAPGE
Class_A_AFAM2494_1_seq_2561
MSDLEQRYGARLGVYAHNVRTGRTVAYRAGERFAMCSTFKTFAAAAVLRDHGGCAPLDRVIHYPPRDILPNSPKTEEHLATGMSVGDLCAAAIQYSDNAAGNLLLRQIGGPDGLTRFFRSLGDRVSRLDRWETDLNTAVPGDPRDTTTPEAIGRSFERLTLGRALDGTDREQLVTWLKGNTTSAERFGRGLPQGWVLGDKTGTGDYATANDIGVAWTTRGTPIVLAVLSTKAAKDAPVDNALVADTARLLARTLAPGE


In [None]:
#@title ##run **ESMFold**
# Per-protein results. The lists are recreated on every run of this cell, so
# re-running it does not append duplicates.
model_lst = []
pae_lst = []      # NOTE: stores the pTM score of each model (not a PAE); name kept because the plotting cell reads it
plddt_lst = []
seq_len_lst = []

# Settings shared by every prediction
copies = 1
if copies == "" or copies <= 0: copies = 1
num_recycles = 3
chain_linker = 25

import torch

# Load the model once, before the loop, and reuse it when the cell is re-run.
if "model" not in globals():
  # SECURITY: esmfold.model is a fully pickled module, so loading it executes
  # code contained in the file -- only use a file from a source you trust.
  # weights_only=False is stated explicitly because PyTorch releases whose
  # default is weights_only=True refuse to unpickle a whole module.
  model = torch.load("esmfold.model", weights_only=False)
  model.eval().cuda().requires_grad_(False)

# total= lets tqdm show a percentage and an ETA (iterrows() has no length)
for idx, row in tqdm(df.iterrows(), total=len(df), desc = "Predicting folds with ESMFold"):

  # A missing sequence would be stringified to "nan" below and then folded
  # as the three-residue peptide N-A-N, so skip such rows explicitly.
  if pd.isna(row["seq"]):
    tqdm.write(f"Skipping {row['protein_name']}: missing sequence")
    continue

  protein_name = str(row["protein_name"])
  seq = str(row["seq"])

  jobname = protein_name
  jobname = re.sub(r'\W+', '', jobname)[:50]

  # Normalise the sequence: upper case, "/" or ":" as chain separator, every
  # other non-letter removed, no repeated/leading/trailing separators.
  sequence = seq
  sequence = re.sub("[^A-Z:]", "", sequence.replace("/",":").upper())
  sequence = re.sub(":+",":",sequence)
  sequence = re.sub("^[:]+","",sequence)
  sequence = re.sub("[:]+$","",sequence)
  if not sequence:
    tqdm.write(f"Skipping {protein_name}: no residues left after cleaning the sequence")
    continue
  sequence = ":".join([sequence] * copies)

  ID = jobname+"_"+get_hash(sequence)[:5]
  seqs = sequence.split(":")
  lengths = [len(s) for s in seqs]
  length = sum(lengths)

  u_seqs = list(set(seqs))
  if len(seqs) == 1: mode = "mono"
  elif len(u_seqs) == 1: mode = "homo"
  else: mode = "hetero"

  # optimized for Tesla T4
  model.set_chunk_size(64 if length > 700 else 128)

  torch.cuda.empty_cache()
  output = model.infer(sequence,
                       num_recycles=num_recycles,
                       chain_linker="X"*chain_linker,
                       residue_index_offset=512)

  pdb_str = model.output_to_pdb(output)[0]
  # move every tensor of the output to host memory as a numpy array
  output = tree_map(lambda x: x.cpu().numpy(), output)
  ptm = output["ptm"][0]
  plddt = output["plddt"][0,...,1].mean()
  # not used further in this cell; left available for inspection after the loop
  O = parse_output(output)

  # write the structure locally; the same file is uploaded below
  pdb_path = f"{protein_name}.pdb"
  with open(pdb_path,"w") as out:
    out.write(pdb_str)

  # all four lists are extended together so they stay aligned row by row
  model_lst.append(jobname)
  pae_lst.append(ptm)
  plddt_lst.append(plddt)
  seq_len_lst.append(length)

  # upload the results to google drive
  uploaded = drive.CreateFile({'title': pdb_path, 'parents': [{'id': folder_id}]})
  uploaded.SetContentFile(pdb_path)
  uploaded.Upload()

Predicting folds with ESMFold: 154it [1:15:05, 30.04s/it]

In [None]:
#@title Plot the quality of the models

# Collect the per-model scores into one table.
# NOTE: this rebinds `df`, which held the input sequences up to this point;
# reload the data before running the prediction cell again.
df = pd.DataFrame({"model": model_lst, "ptm": pae_lst, "plddt": plddt_lst})

# Styling shared by the calls below
marker_style = dict(color = "red", size=8, line=dict(width=1, color='black'))
axis_style = dict(showline=True, linewidth=1, linecolor='LightGrey')

# One bar per model, bar height taken from its plddt column
fig = px.histogram(df, x="model", y="plddt", title='Model quality', height = 600, width = 800, hover_name="model")
# applies only to traces whose mode is 'markers'
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
fig.update_layout(template="plotly_white")
fig.update_yaxes(**axis_style)
fig.update_xaxes(**axis_style)
fig.show()