# Preprocess KPRO data into PDB files

In this notebook, PDB files for the proteins in the KPRO database (url: https://folding.biofold.org/k-pro/api/docs/#/) are manually downloaded as .gz files and unzipped.
Hydrogen atoms are removed from the files and only the lines starting with 'ATOM' are kept. Afterwards, only single-chain topology files are kept and their residues are renumbered.
KPRO_df is created with the folding and unfolding rate data obtained from the website.

Data is saved in 'CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame' and can be copied to the Protein_CT/input_files/pdb folder by running the copy block near the end of this notebook.

## Importing and downloading KPRO data

In [4]:
import pandas as pd
import requests
import os
import gzip
import mdtraj
import tarfile
import shutil
from requests import get

## Function to renumber residues

In [2]:
def renumber_residues(folder_path, start_residue_number):
    """Renumber residues sequentially in every ``.pdb`` file of a folder.

    Each file directly inside ``folder_path`` whose name ends in ``.pdb`` is
    inspected. When the residue number of its first ATOM/HETATM record differs
    from ``start_residue_number``, the file is rewritten in place with residues
    numbered consecutively from ``start_residue_number``. A new residue is
    assumed whenever the residue-number field (columns 23-26) changes between
    consecutive ATOM/HETATM records. All other lines are copied unchanged.

    Files whose first residue already equals ``start_residue_number`` are left
    untouched, even if later residues contain gaps. Files without any
    ATOM/HETATM record are left untouched as well.

    Args:
        folder_path: Directory containing the PDB files (not searched
            recursively).
        start_residue_number: Number given to the first residue of each
            renumbered file.

    Returns:
        None. Files are modified in place and progress is printed.

    Raises:
        ValueError: If the residue-number field of the first ATOM/HETATM
            record of a file is not an integer.
    """
    record_types = ("ATOM", "HETATM")

    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdb'):
            continue
        input_pdb = os.path.join(folder_path, filename)

        with open(input_pdb, 'r') as infile:
            lines = infile.readlines()

        # Residue number of the first ATOM/HETATM record decides whether the
        # file needs renumbering at all.
        first_residue_number = None
        for line in lines:
            if line.startswith(record_types):
                first_residue_number = int(line[22:26].strip())
                break

        if first_residue_number is None:
            print("Skipped renumbering for {} (no ATOM/HETATM records)".format(filename))
            continue

        # Compare against the requested start value rather than a fixed 1, so
        # that a start value other than 1 is honoured.
        if first_residue_number == start_residue_number:
            print("Skipped renumbering for {} (first residue is already {})".format(
                filename, start_residue_number))
            continue

        # Build the new content in memory first, so the original file is only
        # overwritten once the complete result is available.
        current_residue_number = start_residue_number
        last_residue_id = None
        renumbered_lines = []
        for line in lines:
            if not line.startswith(record_types):
                # Non-ATOM/HETATM lines are kept as they are.
                renumbered_lines.append(line)
                continue

            residue_id = line[22:26].strip()
            if residue_id != last_residue_id:
                # The counter advances on every residue change except the
                # very first residue, which receives the start number itself.
                if last_residue_id is not None:
                    current_residue_number += 1
                last_residue_id = residue_id

            renumbered_lines.append(
                line[:22] + "{:4}".format(current_residue_number) + line[26:]
            )

        with open(input_pdb, 'w') as outfile:
            outfile.writelines(renumbered_lines)

        print("Residues renumbered successfully in {}".format(input_pdb))


## Make a list of all PDB IDs in the KPRO database, used to download the PDB files from the Protein Data Bank

In [5]:
# Location of the tab-separated KPRO table
file_path = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/KPRO_data/KPRO_list.tsv"

# Load the complete KPRO table into a DataFrame
full_df = pd.read_csv(file_path, sep='\t')

# Unique entries of the 'PDB_wild' column; duplicates are collapsed through a
# set, so the resulting order is arbitrary
KRPO_list = list(set(full_df['PDB_wild'].tolist()))
KRPO_list

['1wit',
 '1shf',
 '1fex',
 '1e0g',
 '1pnj',
 '1ryk',
 '1k0s',
 '1sha',
 '1l2y',
 '2hbb',
 '1c9o',
 '1coa',
 '2jws',
 '1fmk',
 '1uzc',
 '1e41',
 '2abd',
 '2ptl',
 '1n88',
 '1ss1',
 '1sb0',
 '1imq',
 '1jmq',
 '1ris',
 '2wqg',
 '2a3d',
 '1g6p',
 '2vh7',
 '1jo8',
 '1shg',
 '1rfa',
 '1yyj',
 '1opd',
 '1w4j',
 '2j5a',
 '2vwf',
 '1m9s',
 '1e0l',
 '1qtu',
 '1pgb',
 '1ubq',
 '1cyo',
 '2vxd',
 '1mjc',
 '1idy',
 '1prs',
 '1div',
 '1w4e',
 '2fs1',
 'AF-P14621',
 '2vik',
 '1rlq',
 '1bnz',
 '1ten',
 '1k8m',
 '1st7',
 '1azu',
 '1fkb',
 '1dkt',
 '2wxc',
 '1ba5',
 '1bdc',
 '3kz3',
 '1o6x',
 '1csp']

## Preprocessing PDB files

### The .gz files are manually downloaded from the website; now unzip them and save them as PDB files in KPRO_pdb
PDB files are manually downloaded as .gz files from https://www.rcsb.org/downloads using the list of PDB IDs. The code below unzips these files.

In [5]:
# Folder holding the manually downloaded archives
path_targzs = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/KPRO_data/KPRO_gz'

# Sorted names of the regular, non-hidden files in that folder
list_targzs = sorted(
    entry
    for entry in os.listdir(path_targzs)
    if os.path.isfile(os.path.join(path_targzs, entry)) and not entry.startswith('.')
)

In [6]:
input_folder = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/KPRO_data/KPRO_gz"
output_folder = "/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/KPRO_data/KPRO_pdb"
os.makedirs(output_folder, exist_ok=True)

# Decompress every '*.pdb.gz' archive into output_folder under the same name
# without the trailing '.gz'
for file_name in os.listdir(input_folder):
    if not file_name.endswith(".pdb.gz"):
        continue

    input_path = os.path.join(input_folder, file_name)
    output_file_name = file_name[:-len(".gz")]
    output_path = os.path.join(output_folder, output_file_name)

    # Stream the decompressed bytes straight into the target file
    with gzip.open(input_path, 'rb') as gz_file, open(output_path, 'wb') as output_file:
        shutil.copyfileobj(gz_file, output_file)

## Only save lines starting with 'ATOM' and skip hydrogen atoms

In [11]:
# Rewrite every PDB file in KPRO_pdb in place so that it only contains ATOM
# records of non-hydrogen atoms.
pdb_dir = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/KPRO_data/KPRO_pdb/'
files = sorted(
    f for f in os.listdir(pdb_dir)
    if os.path.isfile(os.path.join(pdb_dir, f)) and not f.startswith('.')
)


def _is_hydrogen(atom_line):
    """Return True if an ATOM record describes a hydrogen or deuterium atom.

    The element symbol (columns 77-78) is used when it is present. Otherwise
    the atom name (columns 13-16) is used, ignoring leading digits so that
    names such as '1HB' are recognised as hydrogens too.
    """
    element = atom_line[76:78].strip().upper()
    if element.isalpha():
        return element in ("H", "D")
    # No usable element column: fall back to the atom name
    atom_name = atom_line[12:16].strip().lstrip("0123456789")
    return atom_name.startswith("H")


for file_name in files:
    file_path = os.path.join(pdb_dir, file_name)

    # Read everything first; the same path is overwritten afterwards
    with open(file_path, 'r') as file:
        lines = [
            line for line in file
            if line.startswith("ATOM") and not _is_hydrogen(line)
        ]

    with open(file_path, 'w') as file:
        file.writelines(lines)

In [12]:
# Collect the paths of all '.pdb' files found under the KPRO_pdb folder
# (including sub-folders) and sort them
root = 'output/KPRO_data/KPRO_pdb'
list_KPRO = []
for path, subdirs, files in os.walk(root):
    list_KPRO.extend(
        os.path.join(path, pdb_name)
        for pdb_name in files
        if pdb_name.endswith('.pdb')
    )

list_KPRO.sort()
list_KPRO

['output/KPRO_data/KPRO_pdb/1azu.pdb',
 'output/KPRO_data/KPRO_pdb/1ba5.pdb',
 'output/KPRO_data/KPRO_pdb/1bdc.pdb',
 'output/KPRO_data/KPRO_pdb/1bnz.pdb',
 'output/KPRO_data/KPRO_pdb/1c9o.pdb',
 'output/KPRO_data/KPRO_pdb/1coa.pdb',
 'output/KPRO_data/KPRO_pdb/1csp.pdb',
 'output/KPRO_data/KPRO_pdb/1cyo.pdb',
 'output/KPRO_data/KPRO_pdb/1div.pdb',
 'output/KPRO_data/KPRO_pdb/1dkt.pdb',
 'output/KPRO_data/KPRO_pdb/1e0g.pdb',
 'output/KPRO_data/KPRO_pdb/1e0l.pdb',
 'output/KPRO_data/KPRO_pdb/1e41.pdb',
 'output/KPRO_data/KPRO_pdb/1fex.pdb',
 'output/KPRO_data/KPRO_pdb/1fkb.pdb',
 'output/KPRO_data/KPRO_pdb/1fmk.pdb',
 'output/KPRO_data/KPRO_pdb/1g6p.pdb',
 'output/KPRO_data/KPRO_pdb/1idy.pdb',
 'output/KPRO_data/KPRO_pdb/1imq.pdb',
 'output/KPRO_data/KPRO_pdb/1jmq.pdb',
 'output/KPRO_data/KPRO_pdb/1jo8.pdb',
 'output/KPRO_data/KPRO_pdb/1k0s.pdb',
 'output/KPRO_data/KPRO_pdb/1k8m.pdb',
 'output/KPRO_data/KPRO_pdb/1l2y.pdb',
 'output/KPRO_data/KPRO_pdb/1m9s.pdb',
 'output/KPRO_data/KPRO_p

## Filter on single chain and < 200 frames

In [None]:
exception_list = []

# Destination folder for the per-frame PDB files
output_dir = 'output/pdbs_traj/KPRO_single_frame'
os.makedirs(output_dir, exist_ok=True)

# Split every eligible PDB file into one file per frame
for pdb in list_KPRO:
    try:
        # Frame files are named '<pdb name>_e001.<frame index>.pdb'
        base_name = os.path.basename(pdb)[:-4] + "_e001"

        # Frame 0 already on disk: nothing to do for this structure
        if os.path.isfile(f'{output_dir}/{base_name}.0.pdb'):
            continue

        print(f"Processing file: {base_name}")

        # mdtraj is only used to count frames and chains
        traj = mdtraj.load(pdb)

        if traj.n_frames >= 200:
            print(f"Skipping {base_name}: Contains {traj.n_frames} frames (exceeds limit).")
            continue

        if traj.topology.n_chains != 1:
            print(f"Skipping {base_name}: Not a single-chain topology.")
            continue

        with open(pdb, 'r') as file:
            pdb_content = file.readlines()

        # A new frame starts whenever the atom serial number is 1 again
        frame_lines = []
        frame_index = 0
        for line in pdb_content:
            starts_new_frame = (
                line.startswith(("ATOM", "HETATM"))
                and int(line[6:11].strip()) == 1
                and bool(frame_lines)
            )
            if starts_new_frame:
                # Flush the finished frame before collecting the next one
                frame_path = os.path.join(output_dir, f"{base_name}.{frame_index}.pdb")
                with open(frame_path, 'w') as frame_file:
                    frame_file.writelines(frame_lines)
                frame_index += 1
                frame_lines = []

            frame_lines.append(line)

        # Write whatever remains as the final frame
        if frame_lines:
            frame_path = os.path.join(output_dir, f"{base_name}.{frame_index}.pdb")
            with open(frame_path, 'w') as frame_file:
                frame_file.writelines(frame_lines)

        print(f"Saved frames from {base_name} to {output_dir}")
    except Exception as e:
        # Record the failing file and carry on with the remaining ones
        print(f"File {os.path.basename(pdb)} is skipped due to error: {e}")
        exception_list.append(pdb)

print("Processing completed.")
if exception_list:
    print(f"The following files were skipped due to errors: {exception_list}")

### Renumber the residues

In [14]:
# Renumber the residues of the single-frame PDB files, starting at 1
folder_path = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame'
start_residue_number = 1
renumber_residues(
    folder_path=folder_path,
    start_residue_number=start_residue_number,
)

Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame/1e41_e001.21.pdb
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame/2j5a_e001.0.pdb
Skipped renumbering for 1l2y_e001.4.pdb (first residue is already 1)
Skipped renumbering for 1imq_e001.0.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame/1fmk_e001.0.pdb
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame/1uzc_e001.16.pdb
Skipped renumbering for 1n88_e001.17.pdb (first residue is already 1)
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame/1uzc_e001.9.pdb
Residues renumbered successfully in /Users/murielhammond/RP1/CT_code/PDB_files_p

### Save pdb files to Protein_CT/input_files/pdb

Only run this piece of code when you want to export PDB files to protein_CT/input_files/pdb/

In [22]:
# Folder with the single-frame structures and the Protein_CT input folder
single_frame_dir = '/Users/murielhammond/RP1/CT_code/PDB_files_preparation/output/pdbs_traj/KPRO_single_frame'
trajectory_protein_CT_path = '/Users/murielhammond/RP1/CT_code/Protein_CT/input_files/pdb'

# Create the destination folder when it is missing
os.makedirs(trajectory_protein_CT_path, exist_ok=True)

# Copy every file whose name ends in 'cif' or 'pdb' into the destination folder
for entry in os.listdir(single_frame_dir):
    if not entry.endswith(('cif', 'pdb')):
        continue
    trajectory_PDB_files_preparation_path = os.path.join(single_frame_dir, entry)
    shutil.copy(trajectory_PDB_files_preparation_path, trajectory_protein_CT_path)

## Saving folding and unfolding rate data of every protein

In [6]:
# Columns taken over from the full KPRO table
selected_columns = [
    'PROTEIN', 'PDB_wild', 'ln(kf)_H2O', 'ln(ku)_H2O', 'T',
    "dln(ku)_H2O", "dln(kf)_H2O",
]

# Shorter names for part of the columns; the others keep their original name
short_names = {
    "PROTEIN": "protid",
    "PDB_wild": "PDB",
    "dln(ku)_H2O": "dln(ku)",
    "dln(kf)_H2O": "dln(kf)",
}

# Selecting and renaming both return new objects, so full_df stays unchanged
KPRO_df = full_df[selected_columns].rename(columns=short_names)
KPRO_df

Unnamed: 0,protid,PDB,ln(kf)_H2O,ln(ku)_H2O,T,dln(ku),dln(kf)
0,Azurin,1azu,4.84,-16.96,25.0,,
1,Azurin,1azu,4.60,-15.59,25.0,1.37,-0.24
2,Azurin,1azu,4.39,-11.75,25.0,5.21,-0.45
3,Azurin,1azu,4.23,-13.53,25.0,3.43,-0.61
4,Azurin,1azu,4.43,-14.54,25.0,2.42,-0.41
...,...,...,...,...,...,...,...
1524,mAcP,AF-P14621,-1.47,-7.45,28.0,2.19,-0.05
1525,mAcP,AF-P14621,-1.52,-7.35,28.0,2.29,-0.10
1526,mAcP,AF-P14621,-1.98,-6.95,28.0,2.69,-0.56
1527,mAcP,AF-P14621,-1.67,-6.44,28.0,3.20,-0.25


### Save df as csv

In [8]:
# KPRO_df.to_csv('/Users/murielhammond/RP1/CT_code/CT_analysis/Dataframes/KPRO_df_free_energy_v0')