# 04 Docking automation

# 4.00 Preview data

In [1]:
import pandas as pd

# Define the file path
file_path = '03_updated_alphafold_entries.xlsx'

# Load the first sheet of the Excel file
df = pd.read_excel(file_path, sheet_name=0)

# Display the entire DataFrame
print(df.to_string())  # Converts the entire DataFrame to a string for full display

    SL UniProt                   Gene  pSite Consensus_motif_(R-P-X-[S/T]-[A/G/P/S])                                                             Protein                                                                                                                                                                                                                                Protein function in relation to CDKL5                                    Model system (e.g. animal, cell line, etc.)   Full length or kinase domain           In vitro or in vivo                                                                                                                                                                                                                                                                                              Method of identification        Citation Consensus Motifs Found Consensus Motif Range Extraction Method AlphaFold_PDB_ID                                         

# 4.01 AF_PDB chain A2B
with automatic detect failed entries 

In [2]:
!pwd
!ls
!ls 03_alphafold_structures/

/project/ealexov/compbio/shamrat/250519_energy/03_haddock
00_cdkl5_targets.xlsx
01_updated_consensus_motifs_psite_position.xlsx
02_updated_consensus_motifs_with_pdb.xlsx
03_alphafold_structures
03_updated_alphafold_entries.xlsx
250401_01_docking_data_processing.ipynb
250401_02_docking_automation_test.ipynb
AF-A0MZ66-F1.pdb  AF-P51608-F1.pdb  AF-Q14241-F1.pdb  AF-Q8TAP9-F1.pdb
AF-O76039-F1.pdb  AF-P56524-F1.pdb  AF-Q15555-F1.pdb  AF-Q8TDM6-F1.pdb
AF-P26358-F1.pdb  AF-P78347-F1.pdb  AF-Q15878-F1.pdb  AF-Q92974-F1.pdb
AF-P46940-F1.pdb  AF-P78352-F1.pdb  AF-Q66K74-F1.pdb  AF-Q9HCJ2-F1.pdb
AF-P48436-F1.pdb  AF-P84022-F1.pdb  AF-Q86YP4-F1.pdb  AF-Q9P2Y4-F1.pdb
AF-P49418-F1.pdb  AF-Q13501-F1.pdb  AF-Q8TAP8-F1.pdb  AF-Q9UPN4-F1.pdb


In [3]:
import os
import pandas as pd
import time

def change_chain_id(pdb_file, old_chain_id, new_chain_id, output_file):
    """Changes the chain identifier in a PDB file."""
    try:
        with open(pdb_file, "r") as input_f:
            lines = input_f.readlines()
    except FileNotFoundError:
        print(f"Error: {pdb_file} not found.")
        return False  # Return False to indicate failure

    modified_lines = []
    for line in lines:
        if line.startswith(("ATOM", "HETATM")):
            if line[21].strip() == old_chain_id:
                modified_lines.append(line[:21] + new_chain_id + line[22:])
            else:
                modified_lines.append(line)
        else:
            modified_lines.append(line)

    try:
        with open(output_file, "w") as output_f:
            output_f.writelines(modified_lines)
        print(f"Processed: {output_file}")
        return True  # Return True to indicate success
    except Exception as e:
        print(f"Error writing {output_file}: {e}")
        return False

def download_alphafold_structure(uniprot_id, output_path):
    """Downloads AlphaFold structure using UniProt ID."""
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
    try:
        os.system(f"wget -O {output_path} {url}")  # Using wget to download the file
        time.sleep(2)  # Pause to ensure the download completes
        if os.path.exists(output_path):
            print(f"Downloaded structure for {uniprot_id}: {output_path}")
            return True
        else:
            print(f"Failed to download structure for {uniprot_id}.")
            return False
    except Exception as e:
        print(f"Error downloading structure for {uniprot_id}: {e}")
        return False

# Load Excel data
file_path = "03_updated_alphafold_entries.xlsx"
df = pd.read_excel(file_path)

failed_entries = []

# Process all PDB files
for index, row in df.iterrows():
    uniprot_id = row["UniProt"]
    pdb_filename = row["AlphaFold_PDB_File_Path"]

    if pd.isna(pdb_filename) or pd.isna(uniprot_id):
        print(f"Skipping entry {index}: Missing PDB file or UniProt ID")
        continue

    input_pdb_path = os.path.join(pdb_filename)
    output_pdb_path = input_pdb_path.replace(".pdb", "_chainB.pdb")

    if os.path.exists(input_pdb_path):
        success = change_chain_id(input_pdb_path, "A", "B", output_pdb_path)
        if not success:
            failed_entries.append((uniprot_id, input_pdb_path))
    else:
        print(f"File not found: {input_pdb_path}")
        failed_entries.append((uniprot_id, input_pdb_path))

# Retry failed entries by downloading and refining
if failed_entries:
    print("\nRetrying failed entries...\n")
    for uniprot_id, pdb_path in failed_entries:
        print(f"Retrying UniProt ID: {uniprot_id}")
        success = download_alphafold_structure(uniprot_id, pdb_path)
        if success:
            output_pdb_path = pdb_path.replace(".pdb", "_chainB.pdb")
            change_chain_id(pdb_path, "A", "B", output_pdb_path)


Processed: 03_alphafold_structures/AF-O76039-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q92974-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-P49418-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q9UPN4-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q8TDM6-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q8TDM6-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q14241-F1_chainB.pdb
File not found: Download failed
Processed: 03_alphafold_structures/AF-Q66K74-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q66K74-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q66K74-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q66K74-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q15555-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q8TAP9-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-Q8TAP9-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-P26358-F1_chainB.pdb
Processed: 03_alphafold_structures/AF-P56524-F1_chainB.pdb
Processed: 03_alphafold_

--2025-05-20 16:24:25--  http://failed/
Resolving failed (failed)... failed: Name or service not known.
wget: unable to resolve host address ‘failed’
--2025-05-20 16:24:25--  https://alphafold.ebi.ac.uk/files/AF-Q96L91-F1-model_v4.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-05-20 16:24:25 ERROR 404: Not Found.



Failed to download structure for Q96L91.


# 4.02 [success] Generate *.tbl file

## 4.2.1 preview input

In [4]:
import pandas as pd

# Define the file path
file_path = '03_updated_alphafold_entries.xlsx'

# Load the first sheet of the Excel file
df = pd.read_excel(file_path, sheet_name=0)

# Display the entire DataFrame
print(df.to_string())  # Converts the entire DataFrame to a string for full display

    SL UniProt                   Gene  pSite Consensus_motif_(R-P-X-[S/T]-[A/G/P/S])                                                             Protein                                                                                                                                                                                                                                Protein function in relation to CDKL5                                    Model system (e.g. animal, cell line, etc.)   Full length or kinase domain           In vitro or in vivo                                                                                                                                                                                                                                                                                              Method of identification        Citation Consensus Motifs Found Consensus Motif Range Extraction Method AlphaFold_PDB_ID                                         

## 4.2.2 [success] gen tbl file

In [8]:
import pandas as pd
import os

def generate_ambig_tbl(output_path, cdkl5_range, partner_range, cdkl5_pdb, partner_pdb):
    """Generates an ambiguous restraint file for HADDOCK."""
    try:
        with open(output_path, "w") as f:
            f.write("! cl1-cl2-act-act\n")
            f.write("! HADDOCK AIR restraints\n")
            f.write("!\n")
            
            for res1 in range(cdkl5_range[0], cdkl5_range[1] + 1):
                for res2 in range(partner_range[0], partner_range[1] + 1):
                    f.write(f"assign ( resid {res1} and segid A ) ( resid {res2} and segid B ) 2.0 2.0 0.0\n")
                    f.write(f"assign ( resid {res2} and segid B ) ( resid {res1} and segid A ) 2.0 2.0 0.0\n")
        return True
    except Exception as e:
        print(f"Error: {e}")
        return False

def main(input_excel, output_log):
    df = pd.read_excel(input_excel)
    results = []
    output_dir = "04_tbl_files"
    os.makedirs(output_dir, exist_ok=True)
    
    cdkl5_info = df[df['SL'] == 1].iloc[0]
    cdkl5_range = tuple(map(int, cdkl5_info['Consensus Motif Range'].split('-')))
    cdkl5_pdb = "target.B99990001_with_cryst.pdb"
    
    for _, row in df.iterrows():
        if row['SL'] == 1 or not isinstance(row['Consensus Motif Range'], str):
            continue
        
        partner_range = tuple(map(int, row['Consensus Motif Range'].split('-')))
        partner_pdb = f"{row['AlphaFold_PDB_ID']}_chainB.pdb"
        output_path = os.path.join(output_dir, f"CDKL5_{row['UniProt']}_{row['Gene']}_{partner_range[0]}-{partner_range[1]}.tbl")
        
        success = generate_ambig_tbl(output_path, cdkl5_range, partner_range, cdkl5_pdb, partner_pdb)
        results.append([row['SL'], row['UniProt'], row['Gene'], partner_range, "Success" if success else "Failed"])
    
    log_df = pd.DataFrame(results, columns=['SL', 'UniProt', 'Gene', 'Motif Range', 'Status'])
    log_df.to_excel(output_log, index=False)
    print(f"Processing complete. Log saved to {output_log}")

if __name__ == "__main__":
    main("03_updated_alphafold_entries.xlsx", "04_tbl_files/docking_log.xlsx")

Processing complete. Log saved to 04_tbl_files/docking_log.xlsx


## 4.2.3 verify motif ranges in tbl file

In [9]:
import os
import re

def parse_tbl_file(tbl_file_path):
    """Parses a HADDOCK .tbl file and extracts protein names and motif ranges."""
    cdkl5_residues = set()
    partner_residues = set()

    try:
        with open(tbl_file_path, 'r') as f:
            for line in f:
                if line.startswith('assign'):
                    match = re.search(r'resid (\d+) and segid ([A-Z])', line)
                    if match:
                        resid = int(match.group(1))
                        segid = match.group(2)
                        if segid == 'A':
                            cdkl5_residues.add(resid)
                        elif segid == 'B':
                            partner_residues.add(resid)

        cdkl5_range = (min(cdkl5_residues), max(cdkl5_residues)) if cdkl5_residues else None
        partner_range = (min(partner_residues), max(partner_residues)) if partner_residues else None

        return cdkl5_range, partner_range, "CDKL5", "Partner Protein"  # Assuming A=CDKL5 and B=Partner
    except FileNotFoundError:
        return None, None, None, None
    except Exception as e:
        print(f"Error parsing {tbl_file_path}: {e}")
        return None, None, None, None

def process_tbl_files(tbl_folder):
    """Processes all .tbl files in a folder and prints the extracted information."""
    for filename in os.listdir(tbl_folder):
        if filename.endswith('.tbl'):
            tbl_file_path = os.path.join(tbl_folder, filename)
            cdkl5_range, partner_range, cdkl5_protein, partner_protein = parse_tbl_file(tbl_file_path)

            if cdkl5_range and partner_range:
                print(f"File: {filename}")
                print(f"First Protein: {cdkl5_protein}, Motif Range: {cdkl5_range}")
                print(f"Interaction Protein: {partner_protein}, Motif Range: {partner_range}")
                print("-" * 20)
            else:
                print(f"Error processing {filename}")
                print("-" * 20)

if __name__ == "__main__":
    tbl_folder = "04_tbl_files"  # Replace with the path to your .tbl files folder
    process_tbl_files(tbl_folder)

File: CDKL5_P48436_SOX9_197-202.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (197, 202)
--------------------
File: CDKL5_P49418_AMPH1_290-294.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (290, 294)
--------------------
File: CDKL5_P56524_HDAC4_630-635.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (630, 635)
--------------------
File: CDKL5_P78347_GTF2I_671-675.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (671, 675)
--------------------
File: CDKL5_Q13501_SQSTM1_p62_270-275.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (270, 275)
--------------------
File: CDKL5_Q14241_ELOA_282-286.tbl
First Protein: CDKL5, Motif Range: (169, 174)
Interaction Protein: Partner Protein, Motif Range: (282, 286)
--------------------
F

# 4.03 Generate CFG files
in cfg file folder name issue


In [10]:
import os
import glob

# Paths
tbl_folder = "04_tbl_files"
cfg_folder = "05_cfg_files"
alphafold_folder = "03_alphafold_structures"

# Ensure cfg output folder exists
os.makedirs(cfg_folder, exist_ok=True)

# Scan for all tbl files
tbl_files = glob.glob(os.path.join(tbl_folder, "CDKL5_*.tbl"))

for tbl_file in tbl_files:
    # Extract partner info from tbl file name
    tbl_name = os.path.basename(tbl_file).replace(".tbl", "")  # Remove .tbl extension
    parts = tbl_name.split("_")  # Split filename by underscores

    if len(parts) < 4:
        print(f"Skipping malformed filename: {tbl_name}")
        continue

    # Extract components dynamically
    partner_uniprot = parts[1]  # Always the second part
    motif_range = parts[-1]  # Always the last part
    partner_gene = "_".join(parts[2:-1])  # Everything in between

    # Define output cfg file name
    cfg_filename = f"CDKL5_{partner_uniprot}_{partner_gene}_{motif_range}.cfg"
    cfg_path = os.path.join(cfg_folder, cfg_filename)

    # Define run directory to include motif range
    run_dir = f"{partner_uniprot}_{partner_gene}_{motif_range}"

    # Define the PDB file paths
    cdkl5_pdb = os.path.join(alphafold_folder, "target.B99990001_with_cryst.pdb")
    partner_pdb = os.path.join(alphafold_folder, f"AF-{partner_uniprot}-F1_chainB.pdb")

    # Generate .cfg file content
    cfg_content = f"""
# ====================================================================
# Protein-protein docking configuration for {partner_gene} ({partner_uniprot})

# Directory for docking
run_dir = "{run_dir}"

# Compute mode
mode = "local"
ncores = 60

# Molecules to be docked
molecules = [
  "{cdkl5_pdb}",
  "{partner_pdb}"
]

# ====================================================================
[topoaa]
autohis=true

[rigidbody]
tolerance = 20
sampling = 20
ambig_fname="{tbl_file}"

[caprieval]

[seletop]
select = 20

[flexref]
tolerance = 20
previous_ambig = true

[caprieval]

[emref]
previous_ambig = true

[caprieval]

[clustfcc]

[seletopclusts]
top_models = 4

[caprieval]

# ====================================================================
"""

    # Write the cfg file
    with open(cfg_path, "w") as cfg_file:
        cfg_file.write(cfg_content)

    print(f"Generated: {cfg_path}")

print("\nAll .cfg files generated in 05_cfg_files/")


Generated: 05_cfg_files/CDKL5_P48436_SOX9_197-202.cfg
Generated: 05_cfg_files/CDKL5_P49418_AMPH1_290-294.cfg
Generated: 05_cfg_files/CDKL5_P56524_HDAC4_630-635.cfg
Generated: 05_cfg_files/CDKL5_P78347_GTF2I_671-675.cfg
Generated: 05_cfg_files/CDKL5_Q13501_SQSTM1_p62_270-275.cfg
Generated: 05_cfg_files/CDKL5_Q14241_ELOA_282-286.cfg
Generated: 05_cfg_files/CDKL5_Q15555_EB2_MAPRE2_220-224.cfg
Generated: 05_cfg_files/CDKL5_Q15878_CACNA1E_Cav2.3_11-15.cfg
Generated: 05_cfg_files/CDKL5_Q15878_CACNA1E_Cav2.3_561-565.cfg
Generated: 05_cfg_files/CDKL5_Q66K74_MAP1S_635-639.cfg
Generated: 05_cfg_files/CDKL5_Q66K74_MAP1S_897-901.cfg
Generated: 05_cfg_files/CDKL5_Q86YP4_GATAD2A_97-101.cfg
Generated: 05_cfg_files/CDKL5_Q8TAP8_PPP1R35_49-53.cfg
Generated: 05_cfg_files/CDKL5_Q8TAP9_TTDN1_37-41.cfg
Generated: 05_cfg_files/CDKL5_Q8TAP9_TTDN1_7-11.cfg
Generated: 05_cfg_files/CDKL5_Q8TDM6_DLG5_1112-1116.cfg
Generated: 05_cfg_files/CDKL5_Q8TDM6_DLG5_981-985.cfg
Generated: 05_cfg_files/CDKL5_Q92974_ARHGEF2_

## 4.04 Run haddock serial onpalmetto
06_run_haddock_serial_jobs.sh 

## 4.05 organize cluster1_model1

In [1]:
import os
import shutil

# Define the base working directory
base_dir = "/project/ealexov/compbio/shamrat/250519_energy/03_haddock"
output_dir = os.path.join(base_dir, "06_cluster1_models")

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# List all entries in the base directory
for folder_name in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder_name)

    # Check if it's a directory and matches the folder name pattern
    if os.path.isdir(folder_path) and "_" in folder_name:
        pdb_file = os.path.join(folder_path, "09_seletopclusts", "cluster_1_model_1.pdb")
        
        if os.path.exists(pdb_file):
            new_filename = f"{folder_name}_cluster_1_model_1.pdb"
            destination_path = os.path.join(output_dir, new_filename)
            
            shutil.copyfile(pdb_file, destination_path)
            print(f"Copied: {new_filename}")
        else:
            print(f"Missing: {pdb_file}")


Missing: /project/ealexov/compbio/shamrat/250519_energy/03_haddock/.ipynb_checkpoints/09_seletopclusts/cluster_1_model_1.pdb
Missing: /project/ealexov/compbio/shamrat/250519_energy/03_haddock/03_alphafold_structures/09_seletopclusts/cluster_1_model_1.pdb
Missing: /project/ealexov/compbio/shamrat/250519_energy/03_haddock/04_tbl_files/09_seletopclusts/cluster_1_model_1.pdb
Missing: /project/ealexov/compbio/shamrat/250519_energy/03_haddock/05_cfg_files/09_seletopclusts/cluster_1_model_1.pdb
Missing: /project/ealexov/compbio/shamrat/250519_energy/03_haddock/06_cluster1_models/09_seletopclusts/cluster_1_model_1.pdb
Copied: P48436_SOX9_197-202_cluster_1_model_1.pdb
Copied: P49418_AMPH1_290-294_cluster_1_model_1.pdb
Copied: P56524_HDAC4_630-635_cluster_1_model_1.pdb
Copied: P78347_GTF2I_671-675_cluster_1_model_1.pdb
Copied: Q13501_SQSTM1_p62_270-275_cluster_1_model_1.pdb
Copied: Q14241_ELOA_282-286_cluster_1_model_1.pdb
Copied: Q15555_EB2_MAPRE2_220-224_cluster_1_model_1.pdb
Copied: Q15878_CA