# Preprocess SCOPe data PDB files 

In this code, PDB files from the SCOPe database (url : https://scop.berkeley.edu/ are downloaded and preprocessed before it can be used for analyses. The proteins that are not single chain topologies and contain more than 200 frames are filtered out. Consequently, the hydrogen atoms are removed and residues are renumbered and only the lines not starting with 'ATOM' are removed.

Data is saved in the CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_chain' and can be copied to the protein_CT/input folder by running the last block of code.

## Importing libraries

In [6]:
import pandas as pd
import requests
import os
import mdtraj
import tarfile
import shutil
from requests import get

## Function to renumber residues

In [6]:
def renumber_residues(folder_path, start_residue_number):
    
#Function to renumber all incorrectly numbered residues from trajectory folder. 
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdb'):
            input_pdb = os.path.join(folder_path, filename)
    
            with open(input_pdb, 'r') as infile:
                lines = infile.readlines()
            
            # Check the residue number in the first ATOM/HETATM line
            first_residue_number = None
            for line in lines:
                if line.startswith(("ATOM", "HETATM")):
                    first_residue_number = int(line[22:26].strip())
                    break
            
            # Only proceed with renumbering if the first residue is 9999 (-1) or 0
            if first_residue_number != 1:
                with open(input_pdb, 'w') as outfile:
                    current_residue_number = start_residue_number
                    last_residue_id = None
                    first_residue = True  # Flag to handle the first residue separately

                    for line in lines:
                        # Only renumber residue information in ATOM/HETATM records
                        if line.startswith(("ATOM", "HETATM")):
                            residue_id = line[22:26].strip()

                            # Check if we're on a new residue
                            if residue_id != last_residue_id:
                                last_residue_id = residue_id
                                # Increment only after the first residue
                                if not first_residue:
                                    current_residue_number += 1
                                first_residue = False  # Set flag to False after first residue

                            # Replace residue number in the line
                            new_line = (
                                line[:22] +
                                "{:4}".format(current_residue_number) +  # Format the new residue number
                                line[26:]
                            )
                            outfile.write(new_line)
                        else:
                            # Write non-ATOM/HETATM lines as is
                            outfile.write(line)
        
                print("Residues renumbered successfully in {}".format(input_pdb))
            else:
                print("Skipped renumbering for {} (first residue is already 1)".format(filename))


## Download each pdb file from web URL

From the SCOPe database, one protein from every number of folds is taken, which can be filtered using the following linkes. To see all protein classes: https://scop.berkeley.edu/statistics/ver=2.08
To download the corresponding sid numbers to folds: https://scop.berkeley.edu/astral/subsets/ver=2.08

In [4]:
# Make a list of all sids
file_path = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/SCOPe_data/astral-scopedom-seqres-gd-sel-gs-sc-cf-2.08.id"

# Read the file and store each value as an element in a list
with open(file_path, "r") as file:
    sid_list = [line.strip() for line in file]

pdb_list = [item[1:-2] for item in sid_list]

In [7]:
# Go through folder and delete all lines from folders that don't start with \"ATOM\"\n" and skip hydrogen atoms
files = sorted([f for f in os.listdir('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/SCOPe_data/SCOPe/') if os.path.isfile(os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/SCOPe_data/SCOPe/',f)) and not f.startswith('.')])

for file_name in files:
    file_path = os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/SCOPe_data/SCOPe/', file_name)
    with open(file_path, 'r') as file:
        lines = [
            line for line in file
            if line.startswith("ATOM") and not line[12:16].strip().startswith("H")
        ]
    with open(file_path, 'w') as file:
        file.writelines(lines)

In [10]:
# Make list of the PED in folder pdbs and loop through this list
root = 'output/SCOPe_data/SCOPe'
list_SCOPe = []
for path, subdirs, files in os.walk(root):
    for name in files:
        if name.endswith('.pdb'):
            list_SCOPe.append(os.path.join(path, name))

list_SCOPe.sort()
list_SCOPe

['output/pdbs_traj/SCOPe/.ipynb_checkpoints/16VP-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1AD2-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1AEP-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1AF7-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1B7C-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1BGF-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1F5N-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1GZS-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1IX9-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1JX4-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1O70-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1O7J-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1OAI-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1PFV-checkpoint.pdb',
 'output/pdbs_traj/SCOPe/.ipynb_checkpoints/1PUC-checkpoint.pdb',
 'output/p

## Filter on single chain and < 200 frames

In [11]:
exception_list = []

# Make a new folder for PED.pdb trajectories
output_dir = 'output/pdbs_traj/SCOPe_single_frame'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Process each PDB trajectory file
for pdb in list_SCOPe:
    try:
        # Define the base name for saving frames
        base_name = os.path.basename(pdb)[:-4] + "_e001"

        # Check if the first frame already exists to avoid redundant processing
        if not os.path.isfile(f'{output_dir}/{base_name}.0.pdb'):
            print(f"Processing file: {base_name}")

            # Load the trajectory
            traj = mdtraj.load(pdb)

            # Check if the trajectory has less than 200 frames
            if traj.n_frames < 200:
                # Ensure it's a single-chain topology before saving frames
                if traj.topology.n_chains == 1:
                    current_segment = []
                    frame_index = 0

                    # Read the PDB content
                    with open(pdb, 'r') as file:
                        pdb_content = file.readlines()

                    # Split the PDB file into frames based on atom serial resets
                    for line in pdb_content:
                        if line.startswith("ATOM") or line.startswith("HETATM"):
                            # Extract the atom serial number
                            serial_number = int(line[6:11].strip())
                            if serial_number == 1 and current_segment:
                                # Save the completed segment to a new file
                                output_file = os.path.join(output_dir, f"{base_name}.{frame_index}.pdb")
                                with open(output_file, 'w') as frame_file:
                                    frame_file.writelines(current_segment)
                                frame_index += 1
                                current_segment = []

                        current_segment.append(line)

                    # Save the last segment if it exists
                    if current_segment:
                        output_file = os.path.join(output_dir, f"{base_name}.{frame_index}.pdb")
                        with open(output_file, 'w') as frame_file:
                            frame_file.writelines(current_segment)

                    print(f"Saved frames from {base_name} to {output_dir}")
                else:
                    print(f"Skipping {base_name}: Not a single-chain topology.")
            else:
                print(f"Skipping {base_name}: Contains {traj.n_frames} frames (exceeds limit).")
    except Exception as e:
        print(f"File {os.path.basename(pdb)} is skipped due to error: {e}")
        exception_list.append(pdb)

print("Processing completed.")
if exception_list:
    print(f"The following files were skipped due to errors: {exception_list}")

Processing file: 16VP-checkpoint_e001
Skipping 16VP-checkpoint_e001: Not a single-chain topology.
Processing file: 1AD2-checkpoint_e001
Saved frames from 1AD2-checkpoint_e001 to output/pdbs_traj/SCOPe_single_frame
Processing file: 1AEP-checkpoint_e001
Saved frames from 1AEP-checkpoint_e001 to output/pdbs_traj/SCOPe_single_frame
Processing file: 1AF7-checkpoint_e001
Skipping 1AF7-checkpoint_e001: Not a single-chain topology.
Processing file: 1B7C-checkpoint_e001
Skipping 1B7C-checkpoint_e001: Not a single-chain topology.
Processing file: 1BGF-checkpoint_e001
Skipping 1BGF-checkpoint_e001: Not a single-chain topology.
Processing file: 1F5N-checkpoint_e001
Saved frames from 1F5N-checkpoint_e001 to output/pdbs_traj/SCOPe_single_frame
Processing file: 1GZS-checkpoint_e001
Skipping 1GZS-checkpoint_e001: Not a single-chain topology.
Processing file: 1IX9-checkpoint_e001
Skipping 1IX9-checkpoint_e001: Not a single-chain topology.
Processing file: 1JX4-checkpoint_e001
Skipping 1JX4-checkpoint_e

## Only save lines starting with 'ATOM' and skip hydrogen atoms

In [None]:
# Go through folder and delete all lines from folders that don't start with \"ATOM\"\n",
files = sorted([f for f in os.listdir('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame') if os.path.isfile(os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame',f))and not f.startswith('.')])
for file_name in files:
    file_path = os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame', file_name)
    with open(file_path, 'r') as file:
        lines = [
            line for line in file
            if line.startswith("ATOM") and not line[12:16].strip().startswith("H")
        ]
    
    with open(file_path, 'w') as file:
        file.writelines(lines)

## Renumber residues

In [2]:
# Renumber the residue numbers in pdb files usning function
folder_path = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame'
start_residue_number = 1
renumber_residues(folder_path, start_residue_number)

NameError: name 'renumber_residues' is not defined

## Copy files to Protein_CT/input/pdb/

Only run this piece of code when you want to export PBD files to protein_CT/input_files/pdb/

In [13]:
#If folder does not exist, create folder
if not os.path.isdir('/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'): os.makedirs('/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb')

#Copy pdb files to /Protein_CT/input_files/pdb
for num,files in enumerate(os.listdir('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame')):
    if files.endswith(('cif','pdb')):
        trajectory_PDB_files_preparation_path = os.path.join('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/SCOPe_single_frame', files)
        trajectory_protein_CT_path = '/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'
        shutil.copy(trajectory_PDB_files_preparation_path, trajectory_protein_CT_path)