# Instructions
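This notebook takes a protein sequence as input and uses ESMFold to predict a series of structure models: the sequence is shortened one residue at a time from its C-terminal end, and one model is predicted for each truncated sequence (down to a minimum length of 3 residues). That is, the number of models grows with the sequence length: a sequence of length 10 yields close to 10 models.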

**To run this notebook, you will need**:
1. Enable the use of a GPU in the Google Colab environment
2. A directory in your Google Drive to save the models (`folder_id`)
3. A protein sequence (`sequence`)
4. A protein name (`jobname`)


Inspired by: 
- https://twitter.com/MartinPacesa/status/1640005299124707328

The code was adapted from the ESMFold notebook of ColabFold
- https://github.com/sokrypton/ColabFold 

Notebook made by GAMA
- https://twitter.com/miangoar



In [None]:
#@title Install libraries

import os, time

# The weights file doubles as the "already set up" marker: once esmfold.model
# exists, re-running this cell skips the whole installation.
if not os.path.isfile("esmfold.model"):
  # download esmfold params; the trailing "&" sends the download to the
  # background so the packages below are installed in the meantime
  os.system("apt-get install aria2 -qq")
  os.system("aria2c -q -x 16 https://colabfold.steineggerlab.workers.dev/esm/esmfold.model &")

  # openfold is installed from this specific commit
  commit = "6908936b68ae89f67755240e2f588c09ec31d4c8"

  # install, in order: python libs, dllogger, openfold, esmfold
  for pip_command in (
      "pip install -q omegaconf pytorch_lightning biopython ml_collections einops py3Dmol",
      "pip install -q git+https://github.com/NVIDIA/dllogger.git",
      f"pip install -q git+https://github.com/aqlaboratory/openfold.git@{commit}",
      "pip install -q git+https://github.com/sokrypton/esm.git",
  ):
    os.system(pip_command)

  if os.path.isfile("esmfold.model"):
    # wait for Params to finish downloading: poll every 5 s until the
    # esmfold.model.aria2 file is gone
    while os.path.isfile("esmfold.model.aria2"):
      time.sleep(5)
  else:
    # backup source! (the background download produced no file)
    os.system("aria2c -q -x 16 https://files.ipd.uw.edu/pub/esmfold/esmfold.model")

In [None]:
#@title Set the environment and connect to google drive

# import libs 
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from google.colab import auth
from oauth2client.client import GoogleCredentials
from tqdm import tqdm
import pandas as pd
import plotly.express as px
from google.colab import files

# google drive login
# Authenticate the Colab user first, then hand the application-default
# credentials to PyDrive. The resulting `drive` object is what the later cells
# use to list the Drive folders and to upload the predicted PDB files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


# import libs to use ESMFold 
from string import ascii_uppercase, ascii_lowercase
import hashlib, re, os
import numpy as np
from jax.tree_util import tree_map
import matplotlib.pyplot as plt
from scipy.special import softmax

# create functions to do the structure predictions
def parse_output(output):
  """Condense a raw ESMFold output dict into per-residue confidence/geometry arrays.

  Only the first batch element is used, and residues are kept only where
  `atom37_atom_exists` equals 1 for atom index 1 (presumably the C-alpha slot
  of the atom37 layout - TODO confirm).

  Args:
    output: dict of numpy arrays with the keys "aligned_confidence_probs",
      "plddt", "distogram_logits", "positions" and "atom37_atom_exists".

  Returns:
    dict with
      "pae":         (n, n) mean over the 64 bins of probability * bin index, times 31
      "plddt":       (n,)   pLDDT taken at atom index 1
      "sm_contacts": (n, n) summed softmax probability of the distogram bins
                            whose edge is below 8
      "xyz":         (n, 3) positions of atom index 1 from the last entry of
                            the leading axis of "positions"
    where n is the number of residues that pass the mask.
  """
  # confidence terms
  bin_index = np.arange(64)
  pae = (output["aligned_confidence_probs"][0] * bin_index).mean(-1) * 31
  plddt = output["plddt"][0,:,1]

  # contact probability: probability mass of all distogram bins with edge < 8
  bins = np.append(0,np.linspace(2.3125,21.6875,63))
  is_contact_bin = bins < 8
  distogram_probs = softmax(output["distogram_logits"],-1)[0]
  sm_contacts = distogram_probs[...,is_contact_bin].sum(-1)

  # coordinates and the residue mask, both read at atom index 1
  xyz = output["positions"][-1,0,:,1]
  mask = output["atom37_atom_exists"][0,:,1] == 1

  # select the masked rows and columns of the pairwise maps in one step
  pairwise = np.ix_(mask, mask)
  return {
    "pae": pae[pairwise],
    "plddt": plddt[mask],
    "sm_contacts": sm_contacts[pairwise],
    "xyz": xyz[mask],
  }

def get_hash(x):
  """Return the hexadecimal SHA-1 digest of the string `x`."""
  digest = hashlib.sha1(x.encode())
  return digest.hexdigest()

# single characters A-Z followed by a-z
alphabet_list = [*ascii_uppercase, *ascii_lowercase]

In [None]:
#@title Set directory in google drive to save the PDBs
#@markdown You have to create it in your google drive session

# Name of the Drive folder. `folder_id` starts out as the folder *name* typed
# into the form and is replaced below by the Drive ID of the matching folder,
# which is what the upload step of the prediction cell uses as parent.
folder_id = 'ESMFold_results' #@param {type:"string"}
folder_name = folder_id

# list every non-trashed Drive folder and keep the ID of the first title match
file_list = drive.ListFile({'q': "mimeType='application/vnd.google-apps.folder' and trashed=false"}).GetList()
for drive_folder in file_list:
    if drive_folder['title'] == folder_name:
        folder_id = drive_folder['id']
        break
else:
    # No folder carries this title. The value is still passed through unchanged
    # (so a real folder ID typed into the form keeps working), but say so now
    # instead of letting the first upload fail after a prediction has already run.
    print(f"WARNING: no Google Drive folder titled '{folder_name}' was found. "
          f"'{folder_name}' will be used as the folder ID as-is; create the folder "
          "in your Drive (or check its name) and re-run this cell before predicting.")

In [None]:
#@title Structure prediction
#@markdown For example, the Top7 protein: https://www.rcsb.org/structure/1qys

import re
import torch

# set a seq to predict
sequence = 'MGDIQVQVNIDDNGKNFDYTYTVTTESELQKVLNELMDYIKKQGAKRVRISITARTKKEAEKFAAILIKVFAELGYNDINVTFDGDTVTVEGQLEGGSLEHHHHHH' #@param {type:"string"}
jobname = "Top7" #@param {type:"string"}

# prediction settings (the same for every model)
num_recycles = 3
chain_linker = 25
min_length = 3  # shortest truncation that is still predicted


def _clean_sequence(raw):
  """Upper-case `raw`, keep only A-Z and ':' (with '/' read as ':'), and drop leading, trailing and repeated ':'."""
  cleaned = re.sub("[^A-Z:]", "", raw.replace("/",":").upper())
  cleaned = re.sub(":+",":",cleaned)
  cleaned = re.sub("^[:]+","",cleaned)
  return re.sub("[:]+$","",cleaned)


# Sanitise the job name once. 44 characters leave room for the 6-character
# "_NNNNN" length suffix inside the 50-character limit, so the suffix (which
# makes each file name unique) is never cut off.
base_name = re.sub(r'\W+', '', jobname)[:44]

# to sample data results
# NOTE: despite its name, pae_lst collects the pTM score of each model
# (the plotting cell writes it to the "ptm" column).
pae_lst = []
plddt_lst = []
model_lst = []
seq_len_lst = []

# load the model once per session
if "model" not in dir():
  # NOTE: torch.load unpickles the file, which can execute arbitrary code;
  # only load an esmfold.model obtained from a source you trust.
  model = torch.load("esmfold.model")
  model.eval().cuda().requires_grad_(False)

# Clean the input *before* counting residues, so that stray characters
# (spaces, line breaks, lower case, ...) do not distort the number of models
# or the reported lengths.
sequence = _clean_sequence(sequence)
n_residues = len(sequence.replace(":", ""))
n_models = max(n_residues - min_length + 1, 0)

# do the structural preds: start with the full-length sequence, then remove one
# C-terminal residue per round until only `min_length` residues are left
for step in tqdm(range(n_models),  desc = "Predicting folds with ESMFold"):
  # number of residues in the current sequence (':' chain breaks are not residues)
  length = sum(len(s) for s in sequence.split(":"))
  seq_len = length
  jobname = f"{base_name}_{str(seq_len).zfill(5)}"

  # optimized for Tesla T4
  if length > 700:
    model.set_chunk_size(64)
  else:
    model.set_chunk_size(128)

  torch.cuda.empty_cache()
  output = model.infer(sequence,
                       num_recycles=num_recycles,
                       chain_linker="X"*chain_linker,
                       residue_index_offset=512)

  pdb_str = model.output_to_pdb(output)[0]
  output = tree_map(lambda x: x.cpu().numpy(), output)
  ptm = output["ptm"][0]
  plddt = output["plddt"][0,...,1].mean()
  # parsed maps of the current model; not used further in this cell
  O = parse_output(output)
  prefix = f"{jobname}"
  with open(f"{prefix}.pdb","w") as out:
    out.write(pdb_str)

  model_lst.append(jobname)
  pae_lst.append(ptm)
  plddt_lst.append(plddt)
  seq_len_lst.append(seq_len)

  # upload the results to google drive
  uploaded = drive.CreateFile({'title': f"{jobname}.pdb", 'parents': [{'id': folder_id}]})
  uploaded.SetContentFile(f"{jobname}.pdb")
  uploaded.Upload()

  # remove the last residue for the next round; re-cleaning strips a chain
  # break ':' that may now be left dangling at the end
  sequence = _clean_sequence(sequence[:-1])

Predicting folds with ESMFold: 100%|██████████| 103/103 [06:08<00:00,  3.58s/it]


In [None]:
#@title Plot the quality of the models

# collect the per-model results in a table and write it to csv
df = pd.DataFrame().assign(
    model=model_lst,
    ptm=pae_lst,
    plddt=plddt_lst,
    number_of_aa=seq_len_lst,
)
df.to_csv('sequentially_ESMFold_results.csv', index=False)

# scatter plot: mean pLDDT of each model against its sequence length
fig = px.scatter(df, x="number_of_aa", y="plddt", title='Model quality', height = 600, width = 1400, hover_name="model")
fig.update_traces(marker=dict(color = "red", size=8, line=dict(width=1, color='black')),selector=dict(mode='markers'))
fig.update_layout(template="plotly_white")
for update_axis in (fig.update_yaxes, fig.update_xaxes):
  update_axis(showline=True, linewidth=1, linecolor='LightGrey')

# dashed horizontal reference lines at pLDDT 90, 70 and 50, spanning the x range
longest = df['number_of_aa'].max()
for level, colour, alpha in ((90, "darkblue", 0.5), (70, "lightskyblue", 0.5), (50, "yellow", 0.8)):
  fig.add_shape(type="line",x0=0, y0=level, x1=longest, y1=level,line=dict(color=colour, width=3), opacity = alpha, line_dash="dash")
fig.show()

In [None]:
#@title (Optional) Download a CSV with the data
# hand the results table written by the plotting cell to the browser as a download
results_csv = 'sequentially_ESMFold_results.csv'
files.download(results_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Make a movie with ChimeraX

To record our movie we need to download all of the PDBs saved in Google Drive. Then, open ChimeraX and run the following commands (assuming you're using a machine with Windows 11):

1. open all PDB files in the directory ESMFold_results

  `open C:\Desktop\ESMFold_results\*.pdb`

2. Align the structures (take into account the order and number of your models; here is an example for 100 PDBs)

  `matchmaker #2-99 to #100`

3. (optional) Set the style of the structures

  `sel #0-100; rainbow res; graphics silhouettes true width 2`

4. Run the make_chimerax_movie.py script to record a movie

  `runscript C:\Desktop\make_chimerax_movie.py`

  **Note: take care about your paths (C:\Desktop\ is only for windows users)** 


## make_chimerax_movie.py 

Code provided by Martin Pacesa:
- https://twitter.com/MartinPacesa/status/1640234188874952704 

Copy and paste the following code into a script called `make_chimerax_movie.py`, and save it to your desktop (or any other directory of your preference)

In [None]:
# import libs
from chimerax.core.commands import run

# set a function to record a movie
def make_movie(session, output_path, supersample=3, quality="highest", framerate=60, speed=10, rotate=True, rotation_degrees_per_frame=0.25, color="bfactor"):
  """Record a movie that shows the open models one after another.

  Every model returned by `session.models.list()` is shown, coloured, kept on
  screen for `speed` frames (optionally while the scene turns around the Y
  axis) and hidden again before the next model is shown. The recording is then
  stopped and encoded to `output_path`.

  Args:
    session: ChimeraX session the commands are run in.
    output_path: file the encoded movie is written to.
    supersample: value passed to `movie record supersample`.
    quality: value passed to `movie encode ... quality`.
    framerate: value passed to `movie encode ... framerate`.
    speed: number of frames each model stays visible.
    rotate: if True, turn the scene by `rotation_degrees_per_frame` around Y
      on every frame; if False, just wait `speed` frames per model.
    rotation_degrees_per_frame: turn angle applied per frame when `rotate` is True.
    color: "rainbow", "bfactor" (b-factor colouring with the alphafold
      palette), or any value accepted by the ChimeraX `color` command.
  """
  # Hide all models initially and set the environment
  run(session, "hide models")
  run(session, "set bgColor white")
  run(session, "graphics silhouettes true")
  run(session, "lighting multishadow 512")

  # Set up movie recording
  run(session, f"movie record supersample {supersample}")

  # Iterate through the models and show them one by one
  for model in session.models.list():
    # Show the current model and colour it based on input
    run(session, f"show #{model.id_string} models")
    if color == "rainbow":
      run(session, f"rainbow #{model.id_string}")
    elif color == "bfactor":
      run(session, "color bfactor palette alphafold")
    else:
      run(session, f"color {color}")

    if rotate:
      # Rotate and advance the movie timeline, one frame at a time
      for _ in range(speed):
        # Rotate the scene around the Y axis
        run(session, f"turn y {rotation_degrees_per_frame}")
        # Advance the movie timeline by one frame
        run(session, "wait 1")
    else:
      # No rotation: advance the movie timeline by `speed` frames in one go.
      # (This branch belongs to `if rotate`, not to the `for` loop above;
      # attached to the loop it would run after every rotation as well.)
      run(session, f"wait {speed}")

    # Hide the current model before moving to the next one, whether or not
    # the scene was rotated
    run(session, f"hide #{model.id_string} models")

  # Stop movie recording once, after all models have been shown
  run(session, "movie stop")

  # Encode and save the movie
  run(session, f"movie encode {output_path} quality {quality} framerate {framerate}")

# Run the make_movie function with your output file path.
# The path is a raw string so that the backslashes of the Windows path are
# taken literally instead of being parsed as (invalid) escape sequences.
# `session` is not defined in this script; it is expected to be supplied by
# ChimeraX when the script is started with `runscript`.
make_movie(session, r"D:\Desktop\my_movie.mp4")