# 01 [FINAL] Rebuilding missing residues
while trying to achieve high accuracy (a better RMSD)

In [1]:
# -------------------------------
# Step 0: Install Dependencies
# -------------------------------
# Notebook shell escape. Biopython provides the Bio.PDB / Bio.Data imports used
# in the later steps; pdb-tools and py3Dmol are installed here but are not
# referenced in the part of the notebook visible in this cell.
!pip install biopython pdb-tools py3Dmol

# -------------------------------
# Step 1: Download and Install Modeller
# -------------------------------
# Fetch the MODELLER 10.4 release tarball, unpack it in the current directory,
# and create the directory that is later given to the installer.
!wget -q https://salilab.org/modeller/10.4/modeller-10.4.tar.gz
!tar -zxf modeller-10.4.tar.gz
!mkdir -p /content/compiled/MODELLER

%cd modeller-10.4
# The installer is driven non-interactively: the three lines written to
# modeller_config are fed to ./Install on stdin (presumably one answer per
# installer prompt — confirm against the installer's own output).
with open('modeller_config', "w") as f:
    f.write("2\n")  # first answer; presumably a menu choice such as the architecture — TODO confirm
    f.write("/content/compiled/MODELLER\n")  # same directory as created by mkdir above
    f.write("MODELIRANJE\n")  # <-- Replace with your license key if needed
!./Install < modeller_config

# Link mod10.4 to PATH
!ln -sf /content/compiled/MODELLER/bin/mod10.4 /usr/bin/
# Smoke test: run mod10.4 with no arguments and look only at the first line it
# writes to stdout; success is reported when that line's first word is "usage:".
!mod10.4 | awk 'NR==1{if($1=="usage:") print "✅ Modeller successfully installed"; else print "❌ Installation failed"}'
# Go back to /content before the following steps create the working directory.
%cd /content

# -------------------------------
# Step 2: Download Input Files
# -------------------------------
# In this example, we use PDB ID 4bgq and UniProt ID O76039.
# (We will use the PDB's SEQRES information for numbering.)
pdb_id = "4bgq"
uniprot_id = "O76039"

!wget -q https://files.rcsb.org/download/{pdb_id}.pdb
!wget -q https://www.uniprot.org/uniprot/{uniprot_id}.fasta -O {uniprot_id}.fasta

# Create working directory and move files
!mkdir -p /content/4bgq_fix
!mv {pdb_id}.pdb {uniprot_id}.fasta /content/4bgq_fix/
%cd /content/4bgq_fix

# Save a copy of the original PDB (with SEQRES records) for sequence extraction.
!cp {pdb_id}.pdb {pdb_id}_orig.pdb

# -------------------------------
# Step 3: Clean the PDB (Keep ATOM records only)
# -------------------------------
from Bio.PDB import PDBParser, PDBIO, Select

class StandardResidueSelect(Select):
    """PDBIO selection filter that keeps a residue only if its hetero flag is blank."""

    def accept_residue(self, residue):
        # The first element of residue.id is compared against a single space;
        # residues with any other value there (hetero atoms) are rejected.
        hetero_flag = residue.id[0]
        return hetero_flag == ' '

parser = PDBParser(QUIET=True)
structure = parser.get_structure(pdb_id, f"{pdb_id}.pdb")
io = PDBIO()
io.set_structure(structure)
io.save("4bgq_clean.pdb", select=StandardResidueSelect())
print("✅ Cleaned PDB saved as 4bgq_clean.pdb")

# For modeling, Modeller must see a file named exactly as the template in the PIR header.
# Replace the original PDB with the cleaned version.
!cp 4bgq_clean.pdb 4bgq.pdb

# -------------------------------
# Step 4: Extract Full Intended Sequence from SEQRES Records
# -------------------------------
from Bio.Data.IUPACData import protein_letters_3to1

def three_to_one(resname):
    """Return the one-letter code for a three-letter residue name, or 'X' if unknown."""
    # Normalise the case with str.capitalize() (e.g. "ALA" -> "Ala") before the lookup.
    key = resname.capitalize()
    if key in protein_letters_3to1:
        return protein_letters_3to1[key]
    return 'X'

def get_seqres_sequence(pdb_file, chain_id="A"):
    """Return the one-letter sequence declared in a chain's SEQRES records.

    Args:
        pdb_file: Path to a PDB-format file that still contains SEQRES records.
        chain_id: Single-character chain identifier to extract (column 12).

    Returns:
        The concatenated one-letter codes, in file order, of every residue
        listed in the SEQRES records of ``chain_id``. Residue names that
        ``three_to_one`` does not recognise become 'X'. An empty string is
        returned when the file has no SEQRES record for that chain.
    """
    letters = []
    with open(pdb_file, "r") as f:
        for line in f:
            if not line.startswith("SEQRES"):
                continue
            # SEQRES records: chain id is at column 12 (index 11). Slice rather
            # than index so a truncated record is skipped instead of raising
            # IndexError.
            if line[11:12] != chain_id:
                continue
            # Residue names occupy fixed columns 20-70. Reading by column keeps
            # the first residue when the chain ID is blank (whitespace
            # splitting would shift the fields) and ignores anything that
            # follows column 70.
            letters.extend(three_to_one(res) for res in line[19:70].split())
    return "".join(letters)

# Read the intended chain-A sequence from the untouched copy of the PDB file,
# which still carries its SEQRES records.
full_seq = get_seqres_sequence(f"{pdb_id}_orig.pdb", chain_id="A")
print(f"✅ Full SEQRES sequence for chain A (raw): {len(full_seq)} residues.")
# For this example, our UniProt aa_range is 1-303.
# A 304-residue SEQRES sequence is treated as having one extra leading residue
# and is trimmed, but only when its first two residues differ.
# NOTE(review): why the first two residues must differ is not evident from
# this cell — confirm the intent of that condition.
has_extra_leading_residue = len(full_seq) == 304 and full_seq[0] != full_seq[1]
if has_extra_leading_residue:
    full_seq = full_seq[1:]
    print("✅ Full SEQRES sequence trimmed to match intended aa_range.")
full_length = len(full_seq)
print(f"✅ Final intended sequence length: {full_length} residues.")

# -------------------------------
# Step 5: Build Template Sequence from Observed ATOM Records
# -------------------------------
# Parse the cleaned PDB (named {pdb_id}.pdb) to get observed residues.
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure(pdb_id, f"{pdb_id}.pdb")
chain = next(structure[0].get_chains())  # Assume chain A (takes the first chain of model 0)

# Build a dictionary mapping residue number -> one-letter residue (from ATOM records)
observed_dict = {
    r.id[1]: three_to_one(r.get_resname())
    for r in chain.get_residues()
    if r.id[0] == ' '
}

# Check for emptiness first: min()/max() raise ValueError on an empty dict,
# which would otherwise hide the intended error message below.
if not observed_dict:
    raise Exception("No observed residues found in the cleaned PDB.")

# If the numbering starts at 0, shift by +1.
if min(observed_dict) == 0:
    observed_dict = {k+1: v for k, v in observed_dict.items()}

observed_start = min(observed_dict)
observed_end = max(observed_dict)
print(f"✅ Observed ATOM records span from residue {observed_start} to {observed_end}")

# Build the template sequence: for positions 1 to full_length, use observed residue if present, else a gap ("-").
template_seq = "".join(observed_dict.get(i, "-") for i in range(1, full_length+1))
assert len(template_seq) == full_length, f"Template sequence length {len(template_seq)} != {full_length}"
print("✅ Template sequence constructed.")

# -------------------------------
# (We now have:)
#  - observed_start (e.g. 1) and observed_end (e.g. 277 if 277 residues observed)
#  - full_length (303, based on the intended UniProt region)
#  - template_seq for full_length positions (with gaps for missing residues)
#  - full_seq from SEQRES (303 residues)
# -------------------------------

# Adjusted Step 6: Write the PIR Alignment File with Correct Template Header
# Two PIR entries are written: the template (observed residues, gaps where
# residues are missing) followed by the target (full intended sequence).
# Each entry is a ">P1;" code line, a colon-separated header line, and the
# sequence terminated by "*".
pir_lines = [
    f">P1;{pdb_id}",
    f"structureX:{pdb_id}:{observed_start}:A:{observed_end}:A::::",
    f"{template_seq}*",
    ">P1;target",
    f"sequence:target:1:A:{full_length}:A::::",
    f"{full_seq}*",
]
with open("alignment.ali", "w") as f:
    f.write("\n".join(pir_lines) + "\n")
print(f"✅ Alignment written with template range {observed_start}-{observed_end} and target range 1-{full_length}.")

# -------------------------------
# Step 7: Run Modeller to Rebuild Missing Residues
# -------------------------------
modeller_script = """
from modeller import *
from modeller.automodel import *

log.verbose()
env = environ()
env.io.hetatm = True
env.io.atom_files_directory = ['.']

a = automodel(env,
              alnfile='alignment.ali',
              knowns='4bgq',
              sequence='target',
              assess_methods=(assess.DOPE, assess.GA341))
a.starting_model = 1
a.ending_model = 1
a.make()
"""

with open("rebuild_missing_residues.py", "w") as f:
    f.write(modeller_script)

print("✅ Modeller script written. Running it...")
!mod10.4 rebuild_missing_residues.py


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting pdb-tools
  Downloading pdb_tools-2.5.0-py3-none-any.whl.metadata (6.6 kB)
Collecting py3Dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdb_tools-2.5.0-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading py3Dmol-2.4.2-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: py3Dmol, pdb-tools, biopython
Successfully installed biopython-1.85 pdb-tools-2.5.0 py3Dmol-2.4.2
/content/modeller-10.4
[H[2JInstallation of MODELLER 10.4

This script will install MODELLER 10.4 into a specified d