# <div align="left"> 🧬 Capstone: B-factor prediction for AlphaFold structures for epitope prediction </div>

<img src="https://userguide.mdanalysis.org/stable/_images/rmsf-view.gif" height="256" align="right" style="height:256px">

#### Our goal is to leverage AlphaFold2 to improve epitope predictions beyond sequence-based methods by considering structural constraints on antigen processing, similar to the approach in the referenced paper.


<div align="left">
  <h3> 📁 Google Drive </h3>
</div>

Upon running this notebook, a new folder gets created in your Drive. You define the name of this folder and it will store all the data generated via this notebook.

<div align="left">
  <h3> 📖 Reference Materials </h3>
</div>

This notebook takes inspiration from these resources:

- [CD4+ T-cell Epitope Prediction Using Antigen Processing Constraints](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5321161/#SD1)
- [Highly accurate protein structure prediction with AlphaFold](https://doi.org/10.1038/s41586-021-03819-2)

<div align="left">
  <h3> 🌐 Legal & Data Formats </h3>
</div>

The provided code operates under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). The license of the [structural prediction model parameters](https://github.com/deepmind/alphafold/#model-parameters-license) is Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/legalcode)).
For details regarding the PAE file format, consult the [AFDB FAQ](https://alphafold.ebi.ac.uk/faq/#faq-7).

# Start by loading your data

In [None]:
%pip install biopython --q
import os, requests
import pandas as pd
from Bio.PDB import PDBParser, PPBuilder
import warnings
from Bio import BiopythonWarning

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/3.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m30.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title Set Up Google Drive

#from google.colab import drive
#drive.mount('/content/drive')

%pip install biopython --q
import os, requests
import pandas as pd
from Bio.PDB import PDBParser, PPBuilder
import warnings
from Bio import BiopythonWarning

class DataSheet:
    """Antigen table enriched with local PDB file paths and sequences.

    Reads ``<project_dir>/files/DataSheet.csv`` (``project_dir`` is a
    module-level name that must be defined before instantiation), keeps the
    rows whose 'PDB ID' is a 4-character string, makes sure every structure is
    available in ``PDB_DIR`` (downloading it from RCSB when it is missing) and
    records the file path and the extracted sequence per row.

    Attributes:
        dframe: DataFrame with the columns 'Antigen/Gene', 'PDB ID',
            'pdb path' and 'FastA'. After ``load`` the index is a fresh
            0..n-1 range and rows whose structure could not be obtained or
            yielded no sequence have been removed.
        ids: PDB IDs that are present in ``dframe``.
    """

    # Folder used as the on-disk cache for downloaded PDB files.
    # NOTE(review): this is fixed to the EpiFold2 Drive folder and does not
    # follow ``project_dir``; other cells read from the same fixed location.
    PDB_DIR = os.path.join("/content/drive/MyDrive/EpiFold2/", "files", "pdb files")

    def __init__(self):
        sheet = pd.read_csv(os.path.join(project_dir, 'files', 'DataSheet.csv'), index_col='ID')
        # The stray index column only exists when the CSV was written with an
        # index, so its absence must not be an error.
        sheet = sheet.drop("Unnamed: 0", axis=1, errors="ignore")[['Antigen/Gene', 'PDB ID']]
        # Missing IDs are read as NaN (a float), which has no len().
        self.ids = [pdb_id for pdb_id in sheet['PDB ID']
                    if isinstance(pdb_id, str) and len(pdb_id) == 4]
        # .copy() so the column assignments below act on an independent frame
        # instead of a slice of ``sheet``.
        self.dframe = sheet[sheet['PDB ID'].isin(self.ids)].copy()
        self.dframe['pdb path'] = None
        self.dframe['FastA'] = None
        self.load()

    def load(self):
        """Fetch missing PDB files and fill the 'pdb path' and 'FastA' columns.

        IDs whose download does not return HTTP 200, or whose structure yields
        no polypeptide, are removed from both ``dframe`` and ``ids``.
        """
        parser = PDBParser()
        ppb = PPBuilder()
        unusable_ids = []
        for pdb_id in self.ids:
            pdb_path = os.path.join(self.PDB_DIR, f"{pdb_id}.pdb")
            if not os.path.exists(pdb_path):
                response = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb", timeout=30)
                if response.status_code != 200:
                    # Nothing to parse for this entry; skip it and drop the row later.
                    unusable_ids.append(pdb_id)
                    continue
                # makedirs also creates missing parent folders.
                os.makedirs(self.PDB_DIR, exist_ok=True)
                with open(pdb_path, 'wb') as f:
                    f.write(response.content)

            fasta_sequence = self.extract_sequence(parser, ppb, pdb_path)
            if fasta_sequence is None:
                # Avoid storing the literal string 'None' as a sequence.
                unusable_ids.append(pdb_id)
                continue

            row_mask = self.dframe['PDB ID'] == pdb_id
            self.dframe.loc[row_mask, 'pdb path'] = pdb_path
            self.dframe.loc[row_mask, 'FastA'] = str(fasta_sequence)

        if unusable_ids:
            self.dframe = self.dframe[~self.dframe['PDB ID'].isin(unusable_ids)]
            self.ids = [pdb_id for pdb_id in self.ids if pdb_id not in unusable_ids]
        self.dframe = self.dframe.reset_index(drop=True)

    def extract_sequence(self, parser, ppb, pdb_file_path):
        """Return the sequence of the first polypeptide found in a PDB file.

        Args:
            parser: object providing ``get_structure(name, path)``.
            ppb: object providing ``build_peptides(structure)``.
            pdb_file_path: path of the PDB file to parse.

        Returns:
            ``get_sequence()`` of the first polypeptide, or ``None`` when the
            builder yields no polypeptides. Later polypeptides are ignored.
        """
        structure = parser.get_structure('pdb', pdb_file_path)
        for pp in ppb.build_peptides(structure):
            return pp.get_sequence()
        return None

def custom_warning(message, category, filename, lineno, file=None, line=None):
    if "Chain" in str(message) and "is discontinuous" in str(message):
        return
    else:
        warnings.showwarning(message, category, filename, lineno, file, line)
warnings.showwarning = custom_warning

# Ask for the Drive folder that will hold this project's files.
project_name = input('I want to save my project results in my google drive in a folder called ')
project_dir = f'/content/drive/MyDrive/{project_name}'

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# makedirs creates both levels and also repairs a project folder that already
# exists but has no 'files' sub-folder yet.
os.makedirs(os.path.join(project_dir, 'files'), exist_ok=True)
print("\nPlease save you datasheet as DataSheet.csv")
from google.colab import files
files.upload()
import shutil
current_colab_dir = f'/content/drive/MyDrive/{project_name}.ipynb'
target_dir = f'/content/drive/MyDrive/{project_name}/files/DataSheet.csv'
# The prompt above asks for 'DataSheet.csv'; the name that used to be
# hard-coded here ('DatasetSheet.csv') is still accepted as a fallback.
source_candidates = ['/content/DataSheet.csv', '/content/DatasetSheet.csv']
source_dir = next((path for path in source_candidates if os.path.exists(path)), None)
if source_dir is None:
    raise FileNotFoundError(
        "No uploaded datasheet found; expected one of: " + ", ".join(source_candidates)
    )
shutil.move(source_dir, target_dir)
datasheet = DataSheet()


#-------------------------------------------------------------------------------------------------------------

# Working table used by the cells below: the frame built by DataSheet.
df = datasheet.dframe
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it.
df[['PDB ID', 'FastA']]


def write_fasta(filename, sequence, header="protein"):
    """Write a single-record FASTA file.

    Args:
        filename: path of the file to create or overwrite.
        sequence: residue sequence; converted with ``str``.
        header: text placed after '>' on the first line. Defaults to
            "protein", the value that was previously hard-coded.

    The record always ends with exactly one newline so that several files can
    be concatenated without a sequence running into the next header.
    """
    body = str(sequence).rstrip("\n")
    with open(filename, 'w') as f:
        f.write(f">{header}\n{body}\n")

# Destination folder for the per-structure FASTA files; created on demand.
fasta_files_dir = "/content/drive/MyDrive/EpiFold2/files/fasta files"
os.makedirs(fasta_files_dir, exist_ok=True)

# One FASTA file per table row, named after the row's PDB ID.
for pdb_id, seq in df[['PDB ID', 'FastA']].itertuples(index=False, name=None):
    fasta_path = os.path.join(fasta_files_dir, f"{pdb_id}.fasta")
    write_fasta(fasta_path, seq)

# Locations handed to the structure-prediction cell further down.
result_dir = "/content/drive/MyDrive/EpiFold2/files/results"
input_dir = fasta_files_dir



In [None]:
#@title Install AlphaFold2

%%bash -s $use_amber $use_templates $python_version

set -e

# Positional arguments from the cell-magic line. Each falls back to a safe
# value so the tests below never see an empty word.
USE_AMBER="${1:-False}"
USE_TEMPLATES="${2:-False}"
PYTHON_VERSION="${3:-}"

# Marker files (*_READY) make every stage a no-op when the cell is re-run.
if [ ! -f COLABFOLD_READY ]; then
  # install dependencies
  # We have to use "--no-warn-conflicts" because colab already has a lot preinstalled with requirements different to ours
  pip install -q --no-warn-conflicts "colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold" "tensorflow-cpu==2.11.0"
  pip uninstall -yq jax jaxlib
  pip install -q "jax[cuda]==0.3.25" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
  touch COLABFOLD_READY
fi

# Download params (~1min)
python -m colabfold.download

# setup conda (only needed for amber refinement or template search)
if [ "${USE_AMBER}" == "True" ] || [ "${USE_TEMPLATES}" == "True" ]; then
  if [ ! -f CONDA_READY ]; then
    wget -qnc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    # "2>&1 1>/dev/null": stderr stays visible, stdout is discarded
    bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local 2>&1 1>/dev/null
    rm Miniconda3-latest-Linux-x86_64.sh
    conda config --set auto_update_conda false
    touch CONDA_READY
  fi
fi
# setup template search
if [ "${USE_TEMPLATES}" == "True" ] && [ ! -f HH_READY ]; then
  conda install -y -q -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python="${PYTHON_VERSION}" 2>&1 1>/dev/null
  touch HH_READY
fi
# setup openmm for amber refinement
if [ "${USE_AMBER}" == "True" ] && [ ! -f AMBER_READY ]; then
  conda install -y -q -c conda-forge openmm=7.7.0 python="${PYTHON_VERSION}" pdbfixer 2>&1 1>/dev/null
  touch AMBER_READY
fi


'\n%%bash -s $use_amber $use_templates $python_version\n\nset -e\n\nUSE_AMBER=$1\nUSE_TEMPLATES=$2\nPYTHON_VERSION=$3\n\nif [ ! -f COLABFOLD_READY ]; then\n  # install dependencies\n  # We have to use "--no-warn-conflicts" because colab already has a lot preinstalled with requirements different to ours\n  pip install -q --no-warn-conflicts "colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold" "tensorflow-cpu==2.11.0"\n  pip uninstall -yq jax jaxlib\n  pip install -q "jax[cuda]==0.3.25" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html\n  touch COLABFOLD_READY\nfi\n\n# Download params (~1min)\npython -m colabfold.download\n\n# setup conda\nif [ ${USE_AMBER} == "True" ] || [ ${USE_TEMPLATES} == "True" ]; then\n  if [ ! -f CONDA_READY ]; then\n    wget -qnc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n    bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local 2>&1 1>/dev/null\n    rm Miniconda3-latest-Linux-x86_64.sh\n 

In [None]:
#@title Get AlphaFold2 Structures
import sys

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

# pdbfixer import workaround, kept switched off: the leading `False`
# short-circuits the test, so sys.path is never touched here.
if False and f"/usr/local/lib/python{python_version}/site-packages/" not in sys.path:
    sys.path.insert(0, f"/usr/local/lib/python{python_version}/site-packages/")

# Configure logging only the first time this cell runs in the current kernel;
# the module-level flag records that it has been done.
if 'logging_setup' not in globals():
    setup_logging(Path(result_dir) / "log.txt")
    logging_setup = True

# Collect the queries from the input folder.
queries, is_complex = get_queries(input_dir)

# All prediction settings in one place, then a single call.
colabfold_options = dict(
    queries=queries,
    result_dir=result_dir,
    use_templates=False,
    use_amber=False,
    msa_mode="MMseqs2 (UniRef+Environmental)",
    model_type="auto",
    num_models=2,
    num_recycles=3,
    model_order=[1, 2],
    is_complex=is_complex,
    data_dir=default_data_dir,
    keep_existing_results=True,
    rank_by="auto",
    pair_mode="unpaired+paired",
    stop_at_score=98,
    zip_results=True,
    user_agent="colabfold/google-colab-batch",
)
run(**colabfold_options)

'\nfrom colabfold.batch import get_queries, run\nfrom colabfold.download import default_data_dir\nfrom colabfold.utils import setup_logging\nfrom pathlib import Path\n\n# For some reason we need that to get pdbfixer to import\nif False and f"/usr/local/lib/python{python_version}/site-packages/" not in sys.path:\n    sys.path.insert(0, f"/usr/local/lib/python{python_version}/site-packages/")\n\nif \'logging_setup\' not in globals():\n    setup_logging(Path(result_dir).joinpath("log.txt"))\n    logging_setup = True\n\nqueries, is_complex = get_queries(input_dir)\nrun(queries=queries,\n    result_dir=result_dir,\n    use_templates=False,\n    use_amber=False,\n    msa_mode="MMseqs2 (UniRef+Environmental)",\n    model_type="auto",\n    num_models=2,\n    num_recycles=3,\n    model_order=[1, 2],\n    is_complex=is_complex,\n    data_dir=default_data_dir,\n    keep_existing_results=True,\n    rank_by="auto",\n    pair_mode="unpaired+paired",\n    stop_at_score=98,\n    zip_results=True,\n   

In [None]:

import os
import zipfile
import shutil
!pwd

def extract_best_pdb(zip_path, target_dir):
    """Copy the rank_001 PDB file(s) from a result archive into ``target_dir``.

    A member is selected when its name contains "rank_001" and ends with
    ".pdb". Each selected member is streamed straight into ``target_dir``
    under its base name, so nothing is written to the current working
    directory and members stored inside sub-folders of the archive work too.

    Args:
        zip_path: path of the ``.zip`` archive to read.
        target_dir: folder that receives the PDB files; created if missing.

    Returns:
        List of the file paths that were written (empty when nothing matched).
    """
    os.makedirs(target_dir, exist_ok=True)
    written_paths = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for member in zip_ref.namelist():
            if "rank_001" in member and member.endswith(".pdb"):
                # basename() keeps the output flat and inside target_dir.
                destination = os.path.join(target_dir, os.path.basename(member))
                with zip_ref.open(member) as source, open(destination, 'wb') as sink:
                    shutil.copyfileobj(source, sink)
                written_paths.append(destination)
    return written_paths

def rename_files_in_folder(folder_path, prefix="AF-"):
    """Rename every ``.pdb`` file in a folder to ``<prefix><id>.pdb``.

    ``<id>`` is the part of the file name (without extension) that precedes
    the first underscore. Files that already start with a non-empty ``prefix``
    are left alone, so running the function again does not stack prefixes.

    Args:
        folder_path: folder whose ``.pdb`` files are renamed in place.
        prefix: text put in front of the identifier.
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".pdb"):
            continue
        if prefix and file_name.startswith(prefix):
            # Already renamed by an earlier run.
            continue
        # Drop the extension first so a name without '_' does not end up
        # as '<prefix>name.pdb.pdb'.
        identifier = os.path.splitext(file_name)[0].split('_')[0]
        new_file_name = f"{prefix}{identifier}.pdb"
        os.replace(
            os.path.join(folder_path, file_name),
            os.path.join(folder_path, new_file_name),
        )


# Source folder with the zipped prediction results and the folder that will
# receive the extracted structures.
results_dir = '/content/drive/MyDrive/EpiFold2/files/results/'
target_dir = '/content/drive/MyDrive/EpiFold2/files/AlphaFoldStructures/'

os.makedirs(target_dir, exist_ok=True)

# Every directory entry whose name contains '.zip' is treated as an archive.
zip_paths = []
for entry in os.listdir(results_dir):
    if '.zip' in entry:
        zip_paths.append(os.path.join(results_dir, entry))

# Pull the top-ranked model out of each archive, then rename the extracted files.
for zip_path in zip_paths:
    extract_best_pdb(zip_path, target_dir)

rename_files_in_folder(target_dir)

'\nimport os\nimport zipfile\nimport shutil\n!pwd\n\ndef extract_best_pdb(zip_path, target_dir):\n    pdb_id = os.path.basename(zip_path).split(".")[0]\n    with zipfile.ZipFile(zip_path, \'r\') as zip_ref:\n        # Extract the specific rank_001.pdb file\n        for file in zip_ref.namelist():\n            if "rank_001" in file and file.endswith(".pdb"):\n                zip_ref.extract(file)\n                # Copy the file to the target directory\n                shutil.copy(file, os.path.join(target_dir, file))\n\ndef rename_files_in_folder(folder_path, prefix="AF-"):\n    files = os.listdir(folder_path)\n    for file_name in files:\n        if file_name.endswith(".pdb"):\n            old_file_path = os.path.join(folder_path, file_name)\n            new_file_name = f"{prefix}{file_name.split(\'_\')[0]}.pdb"\n            new_file_path = os.path.join(folder_path, new_file_name)\n            os.rename(old_file_path, new_file_path)\n\n\nresults_dir = \'/content/drive/MyDrive/EpiFold2

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Predict Epitopes

In [None]:
#@title Stability Profile Comparison: Experimental v.s. AlphaFold2

In [None]:
from google.colab import drive
# NOTE(review): the setup cell mounts Drive at '/content/drive' (no trailing
# slash, force_remount=True). The saved output of this cell is a ValueError —
# confirm whether this second mount is needed and which mountpoint string to use.
drive.mount('/content/drive/')

ValueError: ignored

In [None]:
#!pip install mdanalysis
#!pip install mdbenchmark
!pip install biobb_gromacs
!pip install py3Dmol

Collecting biobb_gromacs
  Downloading biobb_gromacs-4.1.1-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biobb-common==4.1.0 (from biobb_gromacs)
  Downloading biobb_common-4.1.0-py3-none-any.whl (25 kB)
Installing collected packages: biobb-common, biobb_gromacs
Successfully installed biobb-common-4.1.0 biobb_gromacs-4.1.1
Collecting py3Dmol
  Downloading py3Dmol-2.0.4-py2.py3-none-any.whl (12 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.0.4


In [None]:
#import MDAnalysis as md
#import mdbenchmark
import biobb_gromacs as bg
import py3Dmol
from zipfile import ZipFile
from io import BytesIO
#!pwd

In [None]:
#source cite for code: https://william-dawson.github.io/using-py3dmol.html

# Read the whole experimental PDB file into one string.
with open('/content/drive/MyDrive/EpiFold2/files/pdb files/1AOL.pdb') as pfile:
  system = pfile.read()

# Render it as a spectrum-coloured cartoon.
cartoon_style = {"cartoon": {'color': 'spectrum'}}
last_model = {'model': -1}
view = py3Dmol.view(width=400, height=300)
view.addModelsAsFrames(system)
view.setStyle(last_model, cartoon_style)
view.zoomTo()
view.show()
print('1AOL original pdb')

FileNotFoundError: ignored

In [None]:
# Result archive and the members of interest inside it.
zipfilepath = '/content/drive/MyDrive/EpiFold2/files/results/1AOL.result.zip'
fileiwant = '1AOL_unrelaxed_rank_001_alphafold2_ptm_model_2_seed_000.pdb'
maybe_othercomf = '1AOL_unrelaxed_rank_002_alphafold2_ptm_model_1_seed_000.pdb'

# Read the chosen model as text. A missing member is reported explicitly
# instead of leaving `content` undefined (or stale from an earlier run).
with ZipFile(zipfilepath, 'r') as z:
    if fileiwant not in z.namelist():
        raise FileNotFoundError(f"{fileiwant} not found in {zipfilepath}")
    content = z.read(fileiwant).decode('utf-8')

# Render the predicted structure as a spectrum-coloured cartoon.
view = py3Dmol.view(width=400, height=300)
view.addModelsAsFrames(content)
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
view.zoomTo()
view.show()
print('1AOL alphafold best model')

FileNotFoundError: ignored

Testing MD simulation possibilities.

In [None]:
!pip install nglview
!pip install mdanalysis
!pip install openmm

In [None]:
#from simtk.open#mm.app import *
#from simtk.openmm import *
#from simtk.unit import *
import MDAnalysis as md
import nglview as ng
from sys import stdout

In [None]:
from google.colab import output
# Switch on Colab's custom widget manager (third-party widget support).
# Presumably required for the nglview widgets shown below — confirm.
output.enable_custom_widget_manager()

In [None]:
#with open('/content/drive/MyDrive/EpiFold2/files/pdb files/1AOL.pdb') as pfile:
#  system = "".join([x for x in pfile])

# Path of the experimental 1AOL structure; also used by a later cell as the reference.
pdbog = '/content/drive/MyDrive/EpiFold2/files/pdb files/1AOL.pdb'
print('md simulat for 1AOL orig')
# Load the PDB file into an MDAnalysis Universe and show it with nglview.
# NOTE(review): the saved output of this cell is a NameError — `md` and `ng`
# come from the import cell above, which must be run first.
v = md.Universe(pdbog)
ng.show_mdanalysis(v, gui=True)


md simulat for 1AOL orig


NameError: ignored

In [None]:
# Read the experimental 1AOL PDB file as text and print it in full.
# NOTE(review): this prints every line of the file, which makes for a very
# long cell output.
original = '/content/drive/MyDrive/EpiFold2/files/pdb files/1AOL.pdb'
with open(original) as f:
  q = f.read()
print(q)

FileNotFoundError: ignored

In [None]:
from Bio import PDB


import os
import tempfile

zipfilepath = '/content/drive/MyDrive/EpiFold2/files/results/1AOL.result.zip'
fileiwant = '1AOL_unrelaxed_rank_001_alphafold2_ptm_model_2_seed_000.pdb'

# Read the chosen model from the archive. A missing member is reported
# explicitly instead of leaving `content` undefined.
with ZipFile(zipfilepath, 'r') as z:
    if fileiwant not in z.namelist():
        raise FileNotFoundError(f"{fileiwant} not found in {zipfilepath}")
    content = z.read(fileiwant)

decode = content.decode('utf-8')
print(decode)

# md.Universe expects a file to read, not the file's text, so the model is
# written to a temporary .pdb file and loaded from that path.
alphafold_pdb_path = os.path.join(tempfile.mkdtemp(), fileiwant)
with open(alphafold_pdb_path, 'w') as pdb_out:
    pdb_out.write(decode)

val = md.Universe(alphafold_pdb_path)
ng.show_mdanalysis(val, gui=True)

In [None]:
# Load a PDB file stored at the top level of Drive and show it with nglview.
# NOTE(review): no cell in this notebook writes edited_alphafold_1AOL.pdb;
# presumably it was prepared by hand from the AlphaFold model — confirm its origin.
edited_alpha = '/content/drive/MyDrive/edited_alphafold_1AOL.pdb'
val = md.Universe(edited_alpha)
ng.show_mdanalysis(val, gui=True)

In [None]:

# Load the edited AlphaFold model and the experimental structure as two
# separate Universes.
new = md.Universe(edited_alpha)
ref = md.Universe(pdbog)
#ref = MDAnalysis.Universe(PSF,CRD)    # reference open AdK (4AKE)

import MDAnalysis.analysis.rms
from MDAnalysis.analysis import rms
from MDAnalysis.analysis import align

# NOTE(review): `new` and `ref` are Universe objects, but every other
# md.Universe call in this notebook is given a file path. Confirm what should
# be passed here (e.g. the two file paths, and whether both structures have
# the same atoms).
u = md.Universe(new, ref, in_memory=True)
protein = u.select_atoms("protein")


# Align the frames of `u` on the protein C-alpha atoms, in memory.
prealigner = align.AlignTraj(u, select="protein and name CA", in_memory=True).run()

# 3) reference = average structure
# Mean of the protein coordinates over axis 1 of the timeseries array.
# NOTE(review): assumes axis 1 is the frame axis (the load_new call below uses
# order="afc") — confirm against the installed MDAnalysis version.
reference_coordinates = u.trajectory.timeseries(asel=protein).mean(axis=1)
# make a reference structure (need to reshape into a 1-frame "trajectory")
reference = md.Merge(protein).load_new(
            reference_coordinates[:, None, :], order="afc")



#common_residues = set(ref.residues) & set(u.residues)

#common_atoms = ref.select_atoms(f"resid {' '.join(str(res.id) for res in common_residues)}")

#rmsd_analysis = rms.RMSD(u, ref)
#rmsd_analysis = rms.RMSD(common_residues)
#rmsd_analysis.run()





Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
from google.colab import output
# Turn Colab's third-party widget support back off; per the note above it
# would otherwise stay active for the rest of the session.
output.disable_custom_widget_manager()