In [1]:
import os, re, csv, glob, subprocess, shutil, sys
sys.path.append("/pscratch/sd/k/kysun/apo-holo-project/EvoStruct/utils/")

In [2]:
from extract_pdb_info import *
from fix_protein import *

## Create the csv file given a PDBBind Data Input

Note: the parser should generalize to the PDBbind general set, but it has only been tested on the refined set.

In [17]:
def parse_and_write_to_csv(fp, output_csv):
    """Parse a PDBbind-style index file and write its data rows to a CSV file.

    Every data line is matched against a nine-group regular expression
    (PDB code, resolution, release year, -logKd/Ki, affinity type, sign,
    affinity value, affinity unit, ligand name) and written as one CSV row
    beneath a fixed header row.

    The file is parsed one line at a time so that a match can never straddle
    two lines (``\\s+`` also matches newlines). Blank lines and lines starting
    with ``#`` are skipped. Any other line that yields no match is not written;
    instead of being dropped silently, all such lines are reported through a
    single ``UserWarning``.

    Args:
        fp: Path of the index file to read.
        output_csv: Path of the CSV file to create (overwritten if it exists).

    Returns:
        None.
    """
    import warnings

    # Regular expression to match the data lines
    data_pattern = re.compile(r'(\w+)\s+([\d.]+)\s+(\d+)\s+([\d.]+)\s+([a-zA-Z]+)([<>=]+)([\d.]+)([npfmuM]+)\s+//\s+\S+\s+\(([\w\-\&\+\/]+)\)')

    with open(fp, 'r') as f:
        lines = f.read().splitlines()

    unparsed = []

    # Open the CSV file for writing
    with open(output_csv, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the header
        csvwriter.writerow(['PDB Code', 'Resolution', 'Release Year', '-logKd/Ki', 'Kd/Ki', 'Sign', 'Kd/Ki Value', 'Kd/Ki Unit', 'Ligand Name'])

        for line in lines:
            stripped = line.strip()
            # Comment/header lines and blank lines carry no data.
            if not stripped or stripped.startswith('#'):
                continue

            matches = data_pattern.findall(line)
            if not matches:
                # Remember the line so the caller learns that data was skipped.
                unparsed.append(line)
                continue

            for match in matches:
                csvwriter.writerow(list(match))

    if unparsed:
        warnings.warn(
            f"{len(unparsed)} non-comment line(s) in {fp!r} did not match the "
            f"expected index format and were skipped; first: {unparsed[0]!r}",
            stacklevel=2,
        )


# Call the function
# Input: the refined-set index file of the PDBBind_2020 download (absolute path).
# `fp` is reused by the later missing-data check cell.
fp = "/pscratch/sd/k/kysun/apo-holo-project/data_curation/PDBBind_2020/index/INDEX_refined_data.2020"
# Relative path: the CSV is created in the notebook's current working directory.
output_csv = "refined_data.csv"
parse_and_write_to_csv(fp, output_csv)


# Reached only if the call above did not raise.
print(f'CSV file "{output_csv}" created successfully.')

CSV file "refined_data.csv" created successfully.


In [29]:
# simple check to make sure there's no missing data
import pandas as pd
refined_data = pd.read_csv('refined_data.csv')

# Take the codes from the first column by position rather than by label:
# the CSV header written above is 'PDB Code', so looking up 'PDB code'
# raises KeyError on a fresh run.
pdb_codes = list(refined_data.iloc[:, 0].values)
# Set membership keeps the scan linear in the number of index lines.
known_codes = set(pdb_codes)

with open(fp, "r") as f:
    lines = f.read().splitlines()

# Print every index line whose first four characters are not a parsed code.
# The '#' header lines of the index file are expected to show up here; any
# other printed line is a record the parser failed to capture.
for line in lines:
    if line[:4] not in known_codes:
        print(line)

# List of the protein-ligand complexes in the PDBbind refined set v.2020
# 5316 protein-ligand complexes in total, which are ranked by binding data
# Latest update: July 2021
# PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name


## Download the original PDB File and separate files based on different ligands of interest

Note: 
1. one ligand for each subdirectory under the protein folder
2. include all chains within a certain distance cutoff of the ligand
3. for the protein chains that are included, include all HETATM molecules within a distance cutoff of those chains, to capture interfacial water
4. in each protein file, the REMARK section should contain: SEQRES; fixed residues; mutated residues; fixed atoms; SSBOND record
5. in the metadata, identify each chain by the canonical UniProt sequence of its reference, if it has one
6. a final sanity check: compare the sequence to the actual UniProt sequence [extracted]

* pdb code
* protein.pdb [downloaded from RCSB]
* protein.cif [downloaded from RCSB]
* {pdbid}\_{ligid}\_{chain}_rcsb.pdb [extracted from the RCSB file with the necessary fields]
* {pdbid}\_{ligid}\_{chain}_protein.pdb [fixed protein file, with the fixing information in the file header]
* {pdbid}\_{ligid}\_{chain}_hetatm.pdb [heteroatoms that are near the included chains]

Default: keep all nearby heteroatoms, including crystal artifacts, and skip the N/C termini when fixing.

## To Eric

1. The dataframe needs to contain a PDB ID and its ligand ID. I used the code above to generate the CSV, but feel free to change it to handle nonstandard ligand names. Some cases that need additional attention are: a. expired PDB codes; b. expired ligand codes; c. ligand codes that denote a polymer/polysaccharide; d. ...

2. The alignment data and difference data are all about the chains that we used - these can be stored separately as metadata. Should we include the PDBFixer info in this metadata as well? Currently it is only in the PDB files.

In [None]:
# `pickle` is needed for the dump at the end of this cell; importing it here
# keeps the cell runnable on a fresh kernel (the original only imported it in
# a later cell, so the dump could fail with NameError after the batch run).
import os
import pickle

import pandas as pd
refined_data = pd.read_csv('refined_data.csv')

# All inputs/outputs of the batch run live in this directory.
utils_dir = "/pscratch/sd/k/kysun/apo-holo-project/EvoStruct/utils"

# taken from the example in the py file
refined_data_fp = os.path.join(utils_dir, "refined_data.csv")
total_alignment_df, total_differences_df, failed_pdbs = batch_process_wf(refined_data_fp)

# Persist the two result tables as CSV and the failure list as a pickle.
total_alignment_df.to_csv(os.path.join(utils_dir, "alignment_data.csv"), index=False)
total_differences_df.to_csv(os.path.join(utils_dir, "differences_data.csv"), index=False)
with open(os.path.join(utils_dir, "failed_pdbs.pkl"), "wb") as f:
    pickle.dump(failed_pdbs, f)

In [20]:
import pickle

# Load the failure list saved by the batch-processing cell.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# workflow produced itself. The `with` block guarantees the handle is closed.
with open('failed_pdbs.pkl', 'rb') as f:
    failed_pdbs = pickle.load(f)

In [21]:
# Display the entries loaded from failed_pdbs.pkl (the third value returned by batch_process_wf).
failed_pdbs

['6ghj', '5oxk']