# Preprocess PED data PDB files

In this code, PDB files from the PED database (url : https://proteinensemble.org/) are downloaded using API. An excel list with the PED entry of only experimentally obtained proteins isused to filter the downloads. The files are downloaded as targs using the API, after these are converted to PDB files. After, the proteins that are not single chain topologies and contain more than 200 frames are filtered out. Consequently, the hydrogen atoms are removed and residues are renumbered and only the lines not starting with 'ATOM' are removed.

Data is saved in the CT_code/PDB_files_preparation/output/pdbs_traj/single_chain' and can be copied to the protein_CT/input folder by running the last block of code.

## Import libraries

In [2]:
import pandas as pd
import requests
import os
import mdtraj
import tarfile
import shutil

## Function to renumber residue numbers

In [2]:
#Function to renumber all incorrectly numbered residues from trajectory folder. 
def renumber_residues(folder_path, start_residue_number):
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdb'):
            input_pdb = os.path.join(folder_path, filename)
    
            with open(input_pdb, 'r') as infile:
                lines = infile.readlines()
            
            # Check the residue number in the first ATOM/HETATM line
            first_residue_number = None
            for line in lines:
                if line.startswith(("ATOM", "HETATM")):
                    first_residue_number = int(line[22:26].strip())
                    break
            
            # Only proceed with renumbering if the first residue is 9999 (-1) or 0
            if first_residue_number != 1:
                with open(input_pdb, 'w') as outfile:
                    current_residue_number = start_residue_number
                    last_residue_id = None
                    first_residue = True

                    for line in lines:
                        # Only renumber residue information in ATOM/HETATM records
                        if line.startswith(("ATOM", "HETATM")):
                            residue_id = line[22:26].strip()

                            # Check if it is on a new residue
                            if residue_id != last_residue_id:
                                last_residue_id = residue_id
                                # Increment only after the first residue
                                if not first_residue:
                                    current_residue_number += 1
                                first_residue = False

                            # Replace residue number in the line
                            new_line = (
                                line[:22] +
                                "{:4}".format(current_residue_number) +  
                                line[26:]
                            )
                            outfile.write(new_line)
                        else:
                            # Write non-ATOM/HETATM lines as is
                            outfile.write(line)
        
                print("Residues renumbered successfully in {}".format(input_pdb))
            else:
                print("Skipped renumbering for {} (first residue is already 1)".format(filename))


## Download PDB files with API

Only proteins that are experimentally obtained are used. The PED entries of these proteins are kept in the Excel list.

In [3]:
# Read in excel file with all PED's and create list
df_PED_entry = pd.read_excel('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/PED_data/Excel_experimental_PED.xlsx')
PED_list = df_PED_entry.iloc[:, 0].tolist()
# Remove duplicates and sort
ped_names = sorted(set(PED_list))

In [4]:
# This URL is used to load in all entries of a specific PED
url_ped_entries = "https://deposition.proteinensemble.org/api/v1/entries/{ped_name}"

# This URL is used to get all the ensemble-sample data of the specific PED
url_ensemble = "https://deposition.proteinensemble.org/api/v1/entries/{ped_name}/ensembles/{ensemble}/ensemble-pdb?response_format=csv"

In [5]:
# Make directory to output.targzs
if not os.path.isdir('output/PED_data/targzs'): os.makedirs('output/PED_data/targzs')

# Loop through the PED_names to get all the ensembles for this PED
for ped_name in ped_names:
    # Construct the API URL to get all ensembles for this PED
    api_url = url_ped_entries.format(ped_name=ped_name)
    print('api_url: ', api_url)

    # Get all ensemble for each PED
    response = requests.get(api_url)
    if response.status_code == 200:
        data = response.json()  
        ensembles = data.get('ensembles')  
        ensembles = [ensemble.get('ensemble_id') for ensemble in ensembles]
        print('ensembles:', ensembles)

    # Loop through all ensembles to get ensemble sample data of all ensembles
        for ensemble in ensembles:
            
            # Save all ensembles in the format: PEDxxxx_exxx.targz in the output folder
            targz_name = '{}_{}.targz'.format(ped_name, ensemble)
            targz_path = 'output/PED_data/targzs/{targz_name}'.format(targz_name=targz_name)
            if not os.path.isfile(targz_path):
                    
                # Specifiy URL to get ensemble data from
                ensemble_api_url = url_ensemble.format(ped_name=ped_name, ensemble=ensemble)
                response = requests.get(ensemble_api_url)
                # Check if the request was successful
                if response.status_code == 200:
                        # If successful, save the content to a file
                        with open(targz_path, "wb") as file:
                            file.write(response.content)
                        print("file downloaded successfully!")
                    
                else:
                    print("Failed to retrieve data for PED: {ped_names}, Ensemble: {ensemble}".format(ped_names=ped_names, ensemble=ensemble))

api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00001
ensembles: ['e001', 'e002', 'e003']
file downloaded successfully!
file downloaded successfully!
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00002
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00003
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00004
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00005
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00006
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries/PED00007
ensembles: ['e001']
file downloaded successfully!
api_url:  https://deposition.proteinensemble.org/api/v1/entries

## Unzip targzs and save as pdbs in pdb folder

In [3]:
# Make a sorted list of PEDXXXXX_eXXX.targz in targzs folder
path_targzs = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/PED_data/targzs'
list_targzs = sorted([f for f in os.listdir(path_targzs) if os.path.isfile(os.path.join(path_targzs, f)) and not f.startswith('.')])

In [27]:
# Create unqiue folder per targz file, and rename
for targz in list_targzs:
    
    if not os.path.isdir('output/PED_data/pdbs/{}'.format(targz[:-6])): 
        os.makedirs('output/PED_data/pdbs/{}'.format(targz[:-6]))
        
        os.system('tar -xvf output/PED_data/targzs/{targz} -C output/PED_data/pdbs/{folder_name}'.format(
            targz=targz, folder_name=targz[:-6]))
        
        folder = 'output/PED_data/pdbs/{}'.format(targz[:-6])
        file_list = sorted(os.listdir(folder))  # Get the list BEFORE renaming
        
        for i, file in enumerate(file_list):
            old_name = '{}/{}'.format(folder, file)
            
            if i == 0:
                new_name = '{}/{}.pdb'.format(folder, targz[:-6])
            else:
                new_name = '{}/{}.{}.pdb'.format(folder, targz[:-6], i)
            
            os.rename(old_name, new_name)

In [28]:
# Make list of the PED in folder pdbs and loop through this list
root = 'output/PED_data/pdbs'
list_pdbs = []
for path, subdirs, files in os.walk(root):
    for name in files:
        if name.endswith('.pdb'):
            list_pdbs.append(os.path.join(path, name))

list_pdbs.sort()
list_pdbs

['output/PED_data/pdbs/PED00001_e001/PED00001_e001.pdb',
 'output/PED_data/pdbs/PED00001_e002/PED00001_e002.pdb',
 'output/PED_data/pdbs/PED00001_e003/PED00001_e003.pdb',
 'output/PED_data/pdbs/PED00002_e001/PED00002_e001.pdb',
 'output/PED_data/pdbs/PED00003_e001/PED00003_e001.pdb',
 'output/PED_data/pdbs/PED00004_e001/PED00004_e001.pdb',
 'output/PED_data/pdbs/PED00005_e001/PED00005_e001.pdb',
 'output/PED_data/pdbs/PED00006_e001/PED00006_e001.pdb',
 'output/PED_data/pdbs/PED00007_e001/PED00007_e001.pdb',
 'output/PED_data/pdbs/PED00008_e001/PED00008_e001.pdb',
 'output/PED_data/pdbs/PED00009_e001/PED00009_e001.pdb',
 'output/PED_data/pdbs/PED00010_e001/PED00010_e001.pdb',
 'output/PED_data/pdbs/PED00011_e001/PED00011_e001.pdb',
 'output/PED_data/pdbs/PED00012_e001/PED00012_e001.pdb',
 'output/PED_data/pdbs/PED00013_e001/PED00013_e001.pdb',
 'output/PED_data/pdbs/PED00014_e001/PED00014_e001.pdb',
 'output/PED_data/pdbs/PED00014_e002/PED00014_e002.pdb',
 'output/PED_data/pdbs/PED00014

## Filter on single chain and < 200 frames

In [29]:
exception_list = []
# Make a new folder for PED.pdb trajectories
if not os.path.isdir('output/pdbs_traj/PED_single_frame'):
    os.makedirs('output/pdbs_traj/PED_single_frame')

# Load each pdb_traj file
for pdb in list_pdbs:
    try:
        # Define the base name for saving frames
        base_name = os.path.basename(pdb)[:-4]

        # Check if the first frame already exists
        if not os.path.isfile(f'output/pdbs_traj/PED_single_chain/{base_name}.0.pdb'):
            print(f"Processing file: {base_name}")

            # Load the trajectory
            traj = mdtraj.load(pdb)

            # Check if the trajectory has less than 200 frames
            if traj.n_frames < 200:
                # Ensure it's a single-chain topology before saving frames
                if traj.topology.n_chains == 1:
                    for i, frame in enumerate(traj):
                        frame.save(f'output/pdbs_traj/PED_single_chain/{base_name}.{i}.pdb')
                    print(f"Saved {traj.n_frames} frames from {base_name} to output/pdbs_traj/PED_single_chain")
                else:
                    print(f"Skipping {base_name}: Not a single-chain topology.")
            else:
                print(f"Skipping {base_name}: Contains {traj.n_frames} frames (exceeds limit).")
    except Exception as e:
        print(f"File {os.path.basename(pdb)} is skipped due to error: {e}")
        exception_list.append(pdb)

Processing file: PED00001_e001
Saved 11 frames from PED00001_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00001_e002
Saved 10 frames from PED00001_e002 to output/pdbs_traj/PED_single_chain
Processing file: PED00001_e003
Saved 11 frames from PED00001_e003 to output/pdbs_traj/PED_single_chain
Processing file: PED00002_e001
File PED00002_e001.pdb is skipped due to error: PDB Error: All MODELs must contain the same number of ATOMs
Processing file: PED00003_e001
Skipping PED00003_e001: Contains 575 frames (exceeds limit).
Processing file: PED00004_e001
Saved 130 frames from PED00004_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00005_e001
File PED00005_e001.pdb is skipped due to error: PDB Error: All MODELs must contain the same number of ATOMs
Processing file: PED00006_e001
Skipping PED00006_e001: Contains 576 frames (exceeds limit).
Processing file: PED00007_e001
Saved 17 frames from PED00007_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00



Saved 21 frames from PED00239_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00240_e001
Skipping PED00240_e001: Not a single-chain topology.
Processing file: PED00241_e001
Skipping PED00241_e001: Not a single-chain topology.
Processing file: PED00242_e001
Saved 20 frames from PED00242_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00243_e001
Saved 20 frames from PED00243_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00244_e001
Skipping PED00244_e001: Not a single-chain topology.
Processing file: PED00245_e001
Saved 20 frames from PED00245_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00246_e001
Skipping PED00246_e001: Not a single-chain topology.
Processing file: PED00247_e001
Saved 20 frames from PED00247_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00248_e001
Saved 20 frames from PED00248_e001 to output/pdbs_traj/PED_single_chain
Processing file: PED00249_e001
Saved 20 frames from PED00249_e001 to output/p

## Only save lines starting with 'ATOM' and skip hydrogen atoms

In [4]:
# Go through folder and delete all lines from folders that don't start with \"ATOM\"\n",
files = sorted([f for f in os.listdir('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_frame') if os.path.isfile(os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_frame',f))and not f.startswith('.')])
for file_name in files:
    file_path = os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_frame', file_name)
    with open(file_path, 'r') as file:
        lines = [
            line for line in file
            if line.startswith("ATOM") and not line[12:16].strip().startswith("H")
        ]
    
    with open(file_path, 'w') as file:
        file.writelines(lines)

## Renumber residues

In [31]:
# Renumber the residue numbers in pdb files
folder_path = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_frame'
start_residue_number = 1
renumber_residues(folder_path, start_residue_number)

Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_chain/PED00258_e001.25.pdb
Skipped renumbering for PED00213_e001.116.pdb (first residue is already 1)
Skipped renumbering for PED00366_e001.10.pdb (first residue is already 1)
Skipped renumbering for PED00022_e006.80.pdb (first residue is already 1)
Skipped renumbering for PED00238_e003.33.pdb (first residue is already 1)
Skipped renumbering for PED00192_e001.18.pdb (first residue is already 1)
Skipped renumbering for PED00302_e001.9.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_single_chain/PED00318_e001.3.pdb
Skipped renumbering for PED00142_e003.46.pdb (first residue is already 1)
Skipped renumbering for PED00212_e001.49.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/PED_singl

## Copy files to Protein_CT/input/pdb/

Only run this piece of code when you want to export PDB files to protein_CT/input_files/pdb/

In [39]:
#If folder does not exist, create folder
if not os.path.isdir('/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'): os.makedirs('/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb')

#Copy pdb files to /Protein_CT/input_files/pdb
for num,files in enumerate(os.listdir('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/single_chain')):
    if files.endswith(('cif','pdb')):
        trajectory_PDB_files_preparation_path = os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/single_chain', files)
        trajectory_protein_CT_path = '/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'
        shutil.copy(trajectory_PDB_files_preparation_path, trajectory_protein_CT_path)