In [2]:
import pandas as pd
import xml.etree.ElementTree as ET
import os
from urllib.request import urlretrieve as download
from glob import glob as g
from Bio.Blast import NCBIWWW, NCBIXML
from mypdb import PDB_file as mypdb
from Bio.Blast.Applications import NcbipsiblastCommandline
from time import time as t
from tqdm.notebook import tqdm


import glob
from collections import defaultdict

import re
import MDAnalysis as mda


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


### Get general PDB information from the InterPro TSV file: `structure-matching-IPR011009.tsv`

In [3]:
# InterPro export listing every structure matched to IPR011009 (tab-separated, header row).
structure_path = 'structure-matching-IPR011009.tsv'

# Read the table and upper-case the accession column in one chained expression.
pdb_data = pd.read_csv(
    structure_path,
    sep="\t",
    header=0,
    engine='python',
).assign(Accession=lambda frame: frame['Accession'].str.upper())

## Download PDBs

In [3]:
"""
The PDB files are downloaded with a pool of worker threads. Because transient failures were observed,
each download is attempted up to 3 times before it is reported as failed.
After the run there should be no empty .pdb files (i.e. 0 KB files) in ./PDBs.
The full download took about 13 minutes on my machine; anything under 30 minutes should be considered normal.
"""

import os
import requests
import time
import concurrent.futures
import multiprocessing

def download2(code, pdir=None, max_retries=3):
    """Download one PDB entry from RCSB, retrying on request errors.

    The body is streamed into ``<code>.pdb.part`` and only renamed to
    ``<code>.pdb`` once it is complete and non-empty, so an interrupted
    transfer never leaves a truncated ``.pdb`` file behind.

    Args:
        code: PDB accession used to build the download URL and file name.
        pdir: Target directory. Falls back to ``./PDBs`` when omitted
            (previously ``None`` crashed in ``os.path.join``).
        max_retries: Maximum number of attempts.

    Returns:
        The path of the saved ``.pdb`` file, or ``None`` when the entry does
        not exist (HTTP 404) or every attempt failed.
    """
    base_url = "https://files.rcsb.org/download"
    pdb_url = f"{base_url}/{code}.pdb"
    pdir = pdir if pdir else "./PDBs"
    os.makedirs(pdir, exist_ok=True)
    f_p = os.path.join(pdir, f"{code}.pdb")
    # Temporary name: its stem ("<code>.pdb") never collides with an accession.
    tmp_p = f"{f_p}.part"

    for attempt in range(max_retries):
        try:
            # The context manager releases the streamed connection on every exit path.
            with requests.get(pdb_url, stream=True, timeout=10) as response:
                if response.status_code == 404:
                    print(f"{code} does not exist (404 Not Found)")
                    return None
                response.raise_for_status()

                with open(tmp_p, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Check file size to prevent empty files
            if os.path.getsize(tmp_p) == 0:
                print(f"{code}.pdb download failed (empty file), retrying {attempt+1}/{max_retries}...")
                os.remove(tmp_p)
                continue  # Retry

            # Publish the finished file under its final name in one step.
            os.replace(tmp_p, f_p)
            print(f"{code}.pdb downloaded successfully")
            return f_p
        except requests.exceptions.RequestException as e:
            print(f"{code}.pdb download failed, retrying {attempt+1}/{max_retries}... Error: {e}")
            # Drop whatever was written before the failure.
            if os.path.exists(tmp_p):
                os.remove(tmp_p)
            # No point in waiting after the last attempt.
            if attempt + 1 < max_retries:
                time.sleep(2)

    print(f"{code}.pdb download ultimately failed")
    return None


def download_pdbs(pdb_list, pdir=None):
    """Download every entry of ``pdb_list`` that is not already in ``pdir``.

    An entry counts as already downloaded only when a non-empty
    ``<code>.pdb`` file exists. Empty leftovers and unrelated files are
    ignored, so a re-run repairs earlier broken downloads.

    Args:
        pdb_list: Iterable of PDB accession codes.
        pdir: Target directory; defaults to ``./PDBs``. Created if missing.
    """
    default_dir = "./PDBs"
    pdir = os.path.abspath(pdir if pdir else default_dir)
    os.makedirs(pdir, exist_ok=True)

    # Get already downloaded PDB files to avoid duplicate downloads
    existing_files = set()
    for name in os.listdir(pdir):
        stem, ext = os.path.splitext(name)
        if ext.lower() != ".pdb":
            continue
        try:
            # Other workers may delete files in this directory while we scan it.
            if os.path.getsize(os.path.join(pdir, name)) > 0:
                existing_files.add(stem)
        except OSError:
            continue

    for code in pdb_list:
        if code not in existing_files:
            file_path = download2(code, pdir=pdir)
            if file_path:
                # Remember it so a repeated code in this list is not fetched twice.
                existing_files.add(code)
                print(f"{code} downloaded successfully")
            else:
                print(f"{code} download failed")


def parallel_download(pdb_list, pdir=None):
    """Download PDB files in parallel using a thread pool.

    The accession list is de-duplicated (order preserved), split into
    chunks, and each chunk is handed to ``download_pdbs`` on a worker
    thread.

    Args:
        pdb_list: Iterable of PDB accession codes.
        pdir: Target directory, forwarded to ``download_pdbs``. Previously
            this argument was accepted but never passed on.
    """
    # Two threads must never write the same file, so drop repeated accessions.
    unique_codes = list(dict.fromkeys(pdb_list))

    num_workers = min(20, multiprocessing.cpu_count()*2)  # Limit the number of threads
    chunk_size = max(10, len(unique_codes) // num_workers)  # Each thread handles at least 10 PDB files

    splited_pdb_lists = [
        unique_codes[i:i+chunk_size] for i in range(0, len(unique_codes), chunk_size)
    ]

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        pending = {
            executor.submit(download_pdbs, chunk, pdir=pdir): chunk
            for chunk in splited_pdb_lists
        }
        # Collect each result so a crash inside a worker is reported, not silently lost.
        for future in concurrent.futures.as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                chunk = pending[future]
                print(f"Worker for {len(chunk)} PDB(s) starting at {chunk[0]} crashed: {exc!r}")

In [None]:
# Collect every accession from the InterPro table as a plain Python list ...
pdbs_ids = pdb_data['Accession'].to_list()

# ... and fetch them with the thread pool defined above.
parallel_download(pdbs_ids)

8ATL does not exist (404 Not Found)
8ATL download failed
7OZB does not exist (404 Not Found)
7OZB download failed
6BCU does not exist (404 Not Found)
6BCU download failed
8PYI does not exist (404 Not Found)
8PYI download failed
6Q38 does not exist (404 Not Found)
6Q38 download failed
6TLU does not exist (404 Not Found)
6TLU download failed
7BL1 does not exist (404 Not Found)
7BL1 download failed
9INW does not exist (404 Not Found)
9INW download failed
8ATN does not exist (404 Not Found)
8ATN download failed
6BCX does not exist (404 Not Found)
6BCX download failed
8PYJ does not exist (404 Not Found)
8PYJ download failed
7OZD does not exist (404 Not Found)
6Q4Q does not exist (404 Not Found)
7OZD download failed
6Q4Q download failed
6YKG does not exist (404 Not Found)
6YKG download failed
9INX does not exist (404 Not Found)
9INX download failed
7EGB does not exist (404 Not Found)
7EGB download failed
8BH3 does not exist (404 Not Found)
8BH3 download failed
8PYK does not exist (404 Not Fo

### Count download results

In [4]:
import os
import pandas as pd

# Use a forward slash: ".\PDBs" contains the invalid escape "\P" (SyntaxWarning)
# and only resolves to a directory on Windows. "./PDBs" works on every OS and
# matches the default directory used by download_pdbs.
folder_path = "./PDBs"

# File stems (names without extension) of everything stored in the download folder.
file_names = [
    os.path.splitext(f)[0]
    for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))
]
pdb_raw = pd.DataFrame({"PDBs": file_names})

# Flag each accession that has a matching file on disk.
pdb_data['Downloaded'] = pdb_data['Accession'].str.upper().isin(pdb_raw['PDBs'])

counts = pdb_data['Downloaded'].value_counts().to_dict()
# .get avoids a KeyError when every download succeeded (or every one failed).
print(f"Downloaded: {counts.get(True, 0)}, Failed: {counts.get(False, 0)}")

  folder_path = ".\PDBs"


Downloaded: 8054, Failed: 169


### Save failed downloads to `fail_list.csv`

In [5]:
# Rows whose structure file is missing from the download folder.
not_downloaded = pdb_data['Downloaded'].eq(False)
fail_list = pdb_data.loc[not_downloaded]

# Persist them so the failed accessions can be inspected or retried later.
fail_list.to_csv('fail_list.csv')

## Stripping the downloaded PDBs to the chains of interest

In [6]:
""" 
Use multiprocessing to accelerate stripping process.
The main code is in `strip_pdb.py`
"""
import sys
import subprocess

# Launch strip_pdb.py with the interpreter that is running this kernel.
python_executable = sys.executable
cmd = [python_executable, "strip_pdb.py"]

# Capture both streams as text instead of letting them go to the terminal.
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Show the script's standard output first, then its standard error.
for stream_text in (result.stdout, result.stderr):
    print(stream_text)

Processed Results/activation_segments/unaligned\1A06_A.pdb
Processed Results/activation_segments/unaligned\1A9U_A.pdb
Processed Results/activation_segments/unaligned\1ZTF_A.pdb
Processed Results/activation_segments/unaligned\2H96_A.pdb
Processed Results/activation_segments/unaligned\3BE9_A.pdb
Processed Results/activation_segments/unaligned\1OPL_A.pdb
Processed Results/activation_segments/unaligned\3ION_A.pdb
Processed Results/activation_segments/unaligned\1AD5_A.pdb
Processed Results/activation_segments/unaligned\3NW7_A.pdb
Processed Results/activation_segments/unaligned\2H96_B.pdb
Processed Results/activation_segments/unaligned\1ZTH_A.pdb
Processed Results/activation_segments/unaligned\3EQP_A.pdb
Processed Results/activation_segments/unaligned\2QLU_A.pdb
Processed Results/activation_segments/unaligned\2WXM_A.pdb
Processed Results/activation_segments/unaligned\3IOP_A.pdb
Processed Results/activation_segments/unaligned\3EQP_B.pdb
Processed Results/activation_segments/unaligned\1ZTH_B.p