# Preprocess ACPRO data into PDB files

PDB files from the ACPRO database (URL: https://www.ats.amherst.edu/protein/) are downloaded as .gz files. They are unzipped, 
hydrogen atoms are removed, and all lines that do not start with 'ATOM' are removed. Afterwards, residues are renumbered and only single-chain topology files are used.
ACPRO_df is created with folding and unfolding rate data which can also be found on the ACPRO site. This is needed later in the project.

Data is saved in the CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame folder and can be copied to the Protein_CT/input_files/pdb folder by running the export code block near the end of this notebook.

## Importing and downloading ACPRO data

In [1]:
import pandas as pd
import requests
import os
import gzip
import mdtraj
import tarfile
import shutil
from requests import get
import numpy as np

## Function to renumber residues

In [4]:
def renumber_residues(folder_path, start_residue_number):
    """Renumber the residues of every ``.pdb`` file in *folder_path* in place.

    A file is left untouched when its first ATOM/HETATM record already carries
    ``start_residue_number`` or when it contains no ATOM/HETATM record at all.
    Otherwise the residues are renumbered consecutively, beginning at
    ``start_residue_number``: the counter advances each time the
    residue-number field (columns 23-26) differs from the one of the previous
    atom record. All other lines are copied unchanged.

    Args:
        folder_path: Directory whose ``.pdb`` files are rewritten in place.
        start_residue_number: Number given to the first residue of each file.

    Raises:
        ValueError: If the residue-number field of the first atom record of a
            file is not an integer.
    """
    atom_records = ("ATOM", "HETATM")

    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdb'):
            continue
        input_pdb = os.path.join(folder_path, filename)

        with open(input_pdb, 'r') as infile:
            lines = infile.readlines()

        # Residue number of the first ATOM/HETATM line (None if there is none)
        first_residue_number = next(
            (int(line[22:26].strip()) for line in lines if line.startswith(atom_records)),
            None,
        )

        if first_residue_number is None:
            print("Skipped renumbering for {} (no ATOM/HETATM records)".format(filename))
            continue

        # Compare with the requested start value, not with a hard-coded 1
        if first_residue_number == start_residue_number:
            print("Skipped renumbering for {} (first residue is already {})".format(
                filename, start_residue_number))
            continue

        # Build the new content in memory first, so a failure while parsing
        # cannot leave a truncated file behind.
        renumbered_lines = []
        current_residue_number = start_residue_number
        last_residue_id = None
        for line in lines:
            if line.startswith(atom_records):
                residue_id = line[22:26].strip()
                # A changed residue field marks the next residue; the very
                # first atom record keeps the start number.
                if last_residue_id is not None and residue_id != last_residue_id:
                    current_residue_number += 1
                last_residue_id = residue_id
                # Overwrite columns 23-26 with the right-aligned new number
                line = line[:22] + "{:4}".format(current_residue_number) + line[26:]
            renumbered_lines.append(line)

        with open(input_pdb, 'w') as outfile:
            outfile.writelines(renumbered_lines)

        print("Residues renumbered successfully in {}".format(input_pdb))


## Make a list of all PDB IDs from the ACPRO database to download the PDB files from the Protein Data Bank

PDB files are manually downloaded from https://www.rcsb.org/downloads using the list of PDB IDs. The files are downloaded as .gz archives; the code in the preprocessing section further below unzips them.

In [7]:
# Location of the ACPRO entry list (.csv)
file_path = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_data_list.csv"

# Load the comma-separated table into a DataFrame
full_df = pd.read_csv(file_path)

# Unique PDB IDs taken from the 'PDB Id' column
ACPRO_list = set(full_df['PDB Id'])
ACPRO_list

{'1A6N',
 '1ADW',
 '1AON',
 '1APS',
 '1ARR',
 '1AU7',
 '1AUE',
 '1AVZ',
 '1AYI',
 '1B9C',
 '1BA5',
 '1BD8',
 '1BF4',
 '1BFE',
 '1BNI',
 '1BNZ',
 '1BRS',
 '1C9O',
 '1CDC',
 '1CSP',
 '1CUN',
 '1DIV',
 '1E0G',
 '1E0L',
 '1E0M',
 '1E41',
 '1E65',
 '1EAL',
 '1EHB',
 '1ENH',
 '1FEX',
 '1FKF',
 '1FMK',
 '1FNF',
 '1FTG',
 '1G6P',
 '1GM1',
 '1GXT',
 '1HCD',
 '1HEL',
 '1HMK',
 '1HRC',
 '1I1B',
 '1IDY',
 '1IMQ',
 '1J5U',
 '1JMQ',
 '1JO8',
 '1JON',
 '1JOO',
 '1K0S',
 '1K85',
 '1K8M',
 '1L2Y',
 '1L8W',
 '1LMB',
 '1LOP',
 '1M9S',
 '1MJC',
 '1N88',
 '1NPS',
 '1NTI',
 '1O6X',
 '1PBA',
 '1PGB',
 '1PHP',
 '1PIN',
 '1PNJ',
 '1POH',
 '1PRB',
 '1PRS',
 '1PSF',
 '1QOP',
 '1QTU',
 '1RA9',
 '1RFA',
 '1RIS',
 '1RYK',
 '1SCE',
 '1SHG',
 '1SPR',
 '1SRL',
 '1SS1',
 '1ST7',
 '1TEN',
 '1TIT',
 '1TTF',
 '1U5P',
 '1UBQ',
 '1URN',
 '1UZC',
 '1V9E',
 '1VII',
 '1W4J',
 '1WIT',
 '1YCC',
 '1YEA',
 '1YZA',
 '256B',
 '2A3D',
 '2A5E',
 '2ACY',
 '2BLM',
 '2CI2',
 '2CRO',
 '2EAL',
 '2FDQ',
 '2HPR',
 '2HQI',
 '2LZM',
 '2PDD',
 

## Preprocessing PDB files

In [23]:
# Folder with the downloaded ACPRO archives
path_targzs = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_gz'

# Alphabetically sorted names of the regular, non-hidden files in that folder
list_targzs = sorted(
    entry
    for entry in os.listdir(path_targzs)
    if not entry.startswith('.') and os.path.isfile(os.path.join(path_targzs, entry))
)

In [24]:
# Compressed downloads are read from input_folder; the unzipped .pdb files
# are written to output_folder (created on demand).
input_folder = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_gz"
output_folder = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_pdb"
os.makedirs(output_folder, exist_ok=True)

for file_name in os.listdir(input_folder):
    # Anything that is not a gzipped PDB file is ignored
    if not file_name.endswith(".pdb.gz"):
        continue

    input_path = os.path.join(input_folder, file_name)
    # Stripping the trailing ".gz" gives the name of the unzipped file
    output_path = os.path.join(output_folder, file_name[:-3])

    # Stream the decompressed bytes straight into the target file
    with gzip.open(input_path, 'rb') as gz_file, \
            open(output_path, 'wb') as output_file:
        shutil.copyfileobj(gz_file, output_file)

In [26]:
# Collect the path of every .pdb file found below `root` (sub-folders
# included) and keep the paths in sorted order
root = 'output/ACPRO_data/ACPRO_pdb'
list_KPRO = sorted(
    os.path.join(dirpath, fname)
    for dirpath, _dirnames, fnames in os.walk(root)
    for fname in fnames
    if fname.endswith('.pdb')
)
list_KPRO

['output/ACPRO_data/ACPRO_pdb/1a6n.pdb',
 'output/ACPRO_data/ACPRO_pdb/1adw.pdb',
 'output/ACPRO_data/ACPRO_pdb/1aon.pdb',
 'output/ACPRO_data/ACPRO_pdb/1aps.pdb',
 'output/ACPRO_data/ACPRO_pdb/1arr.pdb',
 'output/ACPRO_data/ACPRO_pdb/1au7.pdb',
 'output/ACPRO_data/ACPRO_pdb/1aue.pdb',
 'output/ACPRO_data/ACPRO_pdb/1avz.pdb',
 'output/ACPRO_data/ACPRO_pdb/1ayi.pdb',
 'output/ACPRO_data/ACPRO_pdb/1b9c.pdb',
 'output/ACPRO_data/ACPRO_pdb/1ba5.pdb',
 'output/ACPRO_data/ACPRO_pdb/1bd8.pdb',
 'output/ACPRO_data/ACPRO_pdb/1bf4.pdb',
 'output/ACPRO_data/ACPRO_pdb/1bfe.pdb',
 'output/ACPRO_data/ACPRO_pdb/1bni.pdb',
 'output/ACPRO_data/ACPRO_pdb/1bnz.pdb',
 'output/ACPRO_data/ACPRO_pdb/1brs.pdb',
 'output/ACPRO_data/ACPRO_pdb/1c9o.pdb',
 'output/ACPRO_data/ACPRO_pdb/1cdc.pdb',
 'output/ACPRO_data/ACPRO_pdb/1csp.pdb',
 'output/ACPRO_data/ACPRO_pdb/1cun.pdb',
 'output/ACPRO_data/ACPRO_pdb/1div.pdb',
 'output/ACPRO_data/ACPRO_pdb/1e0g.pdb',
 'output/ACPRO_data/ACPRO_pdb/1e0l.pdb',
 'output/ACPRO_d

## Filter on single chain and < 200 frames

In [27]:
def _split_pdb_frames(pdb_lines):
    """Group PDB lines into frames, starting a new frame at each atom-serial reset.

    A new frame begins when an ATOM/HETATM record with serial number 1 is met
    and the lines collected so far already contain at least one atom record.
    Lines that precede the first atom record therefore stay attached to the
    first frame instead of being emitted as a frame without atoms.

    Returns a list of frames, each frame being a list of lines.
    """
    frames = []
    segment = []
    segment_has_atoms = False
    for line in pdb_lines:
        if line.startswith(("ATOM", "HETATM")):
            # Atom serial number, columns 7-11
            serial_number = int(line[6:11].strip())
            if serial_number == 1 and segment_has_atoms:
                frames.append(segment)
                segment = []
            segment_has_atoms = True
        segment.append(line)

    # Whatever is left over forms the last frame
    if segment:
        frames.append(segment)
    return frames


exception_list = []
# Folder that receives one PDB file per frame
output_dir = 'output/pdbs_traj/ACPRO_single_frame'
os.makedirs(output_dir, exist_ok=True)

# Process each PDB file
for pdb in list_KPRO:
    # Frame files are named <file name without .pdb>_e001.<frame index>.pdb
    base_name = os.path.basename(pdb)[:-4] + "_e001"
    try:
        # An existing frame 0 means this file was handled in an earlier run
        if os.path.isfile(f'{output_dir}/{base_name}.0.pdb'):
            continue
        print(f"Processing file: {base_name}")

        # Load with mdtraj to obtain the frame count and the chain count
        traj = mdtraj.load(pdb)

        # Only files with fewer than 200 frames are kept
        if traj.n_frames >= 200:
            print(f"Skipping {base_name}: Contains {traj.n_frames} frames (exceeds limit).")
            continue

        # Only single-chain topologies are kept
        if traj.topology.n_chains != 1:
            print(f"Skipping {base_name}: Not a single-chain topology.")
            continue

        with open(pdb, 'r') as file:
            pdb_content = file.readlines()

        # Split completely before writing anything: a parse error then leaves
        # no partial set of frame files behind (an existing frame 0 would make
        # the check above skip this file on the next run).
        frames = _split_pdb_frames(pdb_content)
        for frame_index, frame_lines in enumerate(frames):
            output_file = os.path.join(output_dir, f"{base_name}.{frame_index}.pdb")
            with open(output_file, 'w') as frame_file:
                frame_file.writelines(frame_lines)

        print(f"Saved frames from {base_name} to {output_dir}")
    except Exception as e:
        # Best effort per file: report the problem, remember the file, go on
        print(f"File {os.path.basename(pdb)} is skipped due to error: {e}")
        exception_list.append(pdb)

print("Processing completed.")
if exception_list:
    print(f"The following files were skipped due to errors: {exception_list}")

Processing file: 1a6n_e001
Saved frames from 1a6n_e001 to output/pdbs_traj/ACPRO_single_frame
Processing file: 1adw_e001
Skipping 1adw_e001: Not a single-chain topology.
Processing file: 1aon_e001
Skipping 1aon_e001: Not a single-chain topology.
Processing file: 1aps_e001
Saved frames from 1aps_e001 to output/pdbs_traj/ACPRO_single_frame
Processing file: 1arr_e001
Skipping 1arr_e001: Not a single-chain topology.
Processing file: 1au7_e001
Skipping 1au7_e001: Not a single-chain topology.
Processing file: 1aue_e001
Skipping 1aue_e001: Not a single-chain topology.
Processing file: 1avz_e001
Skipping 1avz_e001: Not a single-chain topology.
Processing file: 1ayi_e001
Saved frames from 1ayi_e001 to output/pdbs_traj/ACPRO_single_frame
Processing file: 1b9c_e001
Skipping 1b9c_e001: Not a single-chain topology.
Processing file: 1ba5_e001
Saved frames from 1ba5_e001 to output/pdbs_traj/ACPRO_single_frame
Processing file: 1bd8_e001
Saved frames from 1bd8_e001 to output/pdbs_traj/ACPRO_single_fram

## Only save lines starting with 'ATOM' and skip hydrogen atoms

In [None]:
def _is_hydrogen_record(line):
    """Return True when a PDB ATOM line describes a hydrogen atom.

    The element symbol (columns 77-78) decides when it is present and
    alphabetic. Otherwise the atom name (columns 13-16) is inspected, with
    leading digits ignored so that legacy names such as "1HB " or "2HD1" are
    recognised as hydrogens as well.
    """
    element = line[76:78].strip().upper()
    if element.isalpha():
        return element == "H"
    atom_name = line[12:16].strip().lstrip("0123456789").upper()
    return atom_name.startswith("H")


# Go through the folder and, in every file, keep only the lines that start
# with "ATOM" and do not describe a hydrogen atom. Files are rewritten in place.
# NOTE(review): this cell modifies the files in ACPRO_pdb that the
# frame-splitting cell reads; presumably it has to be run before that cell — confirm.
pdb_dir = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_pdb/'
files = sorted(
    f for f in os.listdir(pdb_dir)
    if os.path.isfile(os.path.join(pdb_dir, f)) and not f.startswith('.')
)

for file_name in files:
    file_path = os.path.join(pdb_dir, file_name)
    with open(file_path, 'r') as file:
        lines = [
            line for line in file
            if line.startswith("ATOM") and not _is_hydrogen_record(line)
        ]

    with open(file_path, 'w') as file:
        file.writelines(lines)

## Renumber the residues

In [28]:
# Renumber the residues of all single-frame PDB files, counting from 1
start_residue_number = 1
folder_path = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame'
renumber_residues(folder_path=folder_path, start_residue_number=start_residue_number)

Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame/1e41_e001.21.pdb
Skipped renumbering for 1l2y_e001.4.pdb (first residue is already 1)
Skipped renumbering for 1imq_e001.0.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame/1gm1_e001.22.pdb
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame/1fmk_e001.0.pdb
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame/1uzc_e001.16.pdb
Skipped renumbering for 1pba_e001.13.pdb (first residue is already 1)
Skipped renumbering for 1n88_e001.17.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame/1uzc_e001.9.pdb
Resi

## Save pdb files to Protein_CT/input_files/pdb

Only run this piece of code when you want to export the PDB files to Protein_CT/input_files/pdb/.

In [38]:
# Source folder (single-frame PDB files) and destination folder (Protein_CT input)
single_frame_dir = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/ACPRO_single_frame'
trajectory_protein_CT_path = '/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'

# If the destination folder does not exist, create it
os.makedirs(trajectory_protein_CT_path, exist_ok=True)

# Copy the .pdb/.cif files to /Protein_CT/input_files/pdb
for structure_name in os.listdir(single_frame_dir):
    if not structure_name.endswith(('cif', 'pdb')):
        continue
    trajectory_PDB_files_preparation_path = os.path.join(single_frame_dir, structure_name)
    # shutil.copy only handles files; skip a directory that happens to match
    if not os.path.isfile(trajectory_PDB_files_preparation_path):
        continue
    shutil.copy(trajectory_PDB_files_preparation_path, trajectory_protein_CT_path)

## Saving data of every protein in dataframe

In [9]:
# Load the ACPRO spreadsheet
ACPRO_df = pd.read_excel('/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/ACPRO_data/ACPRO_excel_list.xlsx')

# Add 'log(kf)' and 'log_kf_error', then discard the columns listed below.
# NOTE(review): the two "log" columns are plain copies of 'ln_kf' and
# 'ln_kf_error' (no base conversion is applied) — confirm this is intended.
ACPRO_df = ACPRO_df.assign(**{
    'log(kf)': ACPRO_df['ln_kf'],
    'log_kf_error': ACPRO_df['ln_kf_error'],
}).drop(columns=[
    'Chain',
    'Name',
    'Protein Length',
    'Structural Class',
    'pH',
    'Author',
    'Lab Website',
    'Fragment Start',
    'Unnamed: 14',
])
ACPRO_df

Unnamed: 0,PDB,Folding_Type,CO,ln_kf,ln_kf_error,Temperature(celsius),log(kf),log_kf_error
0,1A6N,Multi,140099,1.13,0.06,5.00,1.13,0.06
1,1ADW,Multi,153685,0.69,,15.00,0.69,
2,1AON,Multi,432627,0.18,0.08,15.00,0.18,0.08
3,1APS,Two,207283,-1.58,0.18,25.00,-1.58,0.18
4,1ARR,Two,271281,9.20,,25.00,9.20,
...,...,...,...,...,...,...,...,...
121,2VIK,Multi,165842,11.90,0.60,25.00,11.90,0.60
122,2VKN,Multi,134658,2.11,0.23,25.00,2.11,0.23
123,2WXC,Two,450322,11.20,,9.85,11.20,
124,3CHY,Multi,11312,1.00,0.10,25.00,1.00,0.10


## Save df as csv

In [10]:
# ACPRO_df.to_csv('/Users/murielhammond/RP1/CT_code/CT_analysis/Dataframes/ACPRO_folding_rate_v0')