In [42]:
from rdkit import Chem
from rdkit import Chem
import os
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import Descriptors


def calculate_descriptors(smiles_list):
    """Calculates RDKit descriptors for a list of SMILES strings.

    Every input entry produces exactly one output row. Entries that are not
    strings (for example ``None`` or ``NaN`` taken from a DataFrame column)
    and strings RDKit cannot parse both yield a row filled with ``None``.

    Args:
        smiles_list (list): A list of SMILES strings.

    Returns:
        pd.DataFrame: A DataFrame containing the calculated descriptors, one
            row per input entry (in input order) and one column per name in
            ``descriptor_names``.
    """

    descriptor_names = [
        'MolLogP', 'MolMR', 'ExactMolWt', 'HeavyAtomCount', 'NumHAcceptors', 'NumHDonors',
        'NumHeteroatoms', 'NumRotatableBonds', 'NumAromaticRings', 'NumAliphaticRings',
        'RingCount', 'TPSA', 'LabuteASA', 'Kappa1', 'Kappa2', 'Kappa3',
        'Chi0', 'Chi1', 'Chi0n', 'Chi1n', 'Chi2n', 'Chi3n', 'Chi4n',
        'Chi0v', 'Chi1v', 'Chi2v', 'Chi3v', 'Chi4v',
        'PEOE_VSA1', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5',
        'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'PEOE_VSA10',
        'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
        'SMR_VSA1', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6',
        'SMR_VSA7', 'SMR_VSA9', 'SMR_VSA10',
        'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5',
        'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA10', 'SlogP_VSA11',
        'SlogP_VSA12',
        'EState_VSA1', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5',
        'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'EState_VSA10',
        'VSA_EState1', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5',
        'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'VSA_EState10'
    ]

    descriptors = []
    for smiles in smiles_list:
        # Only strings are handed to RDKit; anything else (None, NaN, numbers)
        # is treated like an unparsable SMILES instead of raising.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol:
            desc_values = [getattr(Descriptors, desc)(mol) for desc in descriptor_names]
            descriptors.append(desc_values)
        else:
            descriptors.append([None] * len(descriptor_names))  # For invalid SMILES

    return pd.DataFrame(descriptors, columns=descriptor_names)

def fix_pdb_residue_names(pdb_block, resname="LIG"):
    """Overwrite the residue name of every ATOM/HETATM record in a PDB block.

    The residue name occupies exactly three characters of each record (string
    slice ``17:20``). ``resname`` is left-justified and padded to three
    characters; a longer name is cut to its first three characters so the
    fixed-width columns after it are never shifted.

    Args:
        pdb_block (str): PDB text, one record per line.
        resname (str): Residue name to write into each ATOM/HETATM record.

    Returns:
        str: The rewritten records joined with ``"\\n"`` (no trailing newline).
            Lines that are not ATOM/HETATM records are passed through unchanged.
    """
    # Build the 3-character field once; it is the same for every record.
    field = f"{str(resname)[:3]:<3}"

    fixed_lines = []
    for line in pdb_block.splitlines():
        if line.startswith(("HETATM", "ATOM  ")):
            # Columns 18-20: Residue name
            line = f"{line[:17]}{field}{line[20:]}"
        fixed_lines.append(line)
    return "\n".join(fixed_lines)

from rdkit import Chem
from rdkit.Chem import AllChem

def combine_protein_ligand(protein_pdb_path, ligand_sdf_path, output_pdb_path):
    """Write one PDB file containing the protein followed by the ligand.

    The ligand is read from a MOL/SDF file with hydrogens kept, converted to
    PDB records in memory, and every ligand ATOM/HETATM record is given the
    residue name ``LIG``. Lines starting with ``END`` are dropped from both
    the protein and the ligand records, and a single ``END`` record is
    written at the very end of the output.

    Args:
        protein_pdb_path (str): Path of the protein PDB file to read.
        ligand_sdf_path (str): Path of the ligand MOL/SDF file to read.
        output_pdb_path (str): Path of the combined PDB file to write.

    Returns:
        str: ``output_pdb_path``.

    Raises:
        ValueError: If RDKit cannot read the ligand file.
    """
    # Load and convert ligand to PDB with residue name LIG
    mol = Chem.MolFromMolFile(ligand_sdf_path, removeHs=False)
    if mol is None:
        raise ValueError(f"Failed to read ligand SDF : {ligand_sdf_path}")

    # Convert in memory instead of going through a fixed "temp_ligand.pdb" in
    # the working directory: that file was never cleaned up and would be
    # overwritten by any other call running at the same time.
    ligand_block = fix_pdb_residue_names(Chem.MolToPDBBlock(mol), resname="LIG")

    # Skip the ligand block's own END record so the combined file carries
    # exactly one END, written below.
    ligand_lines = [
        line + "\n"
        for line in ligand_block.splitlines()
        if not line.startswith("END")
    ]

    # Read protein PDB (exclude END line if present)
    with open(protein_pdb_path, "r") as f:
        protein_lines = [line for line in f if not line.startswith("END")]

    # A protein file without a final newline would otherwise glue its last
    # record onto the first ligand record.
    if protein_lines and not protein_lines[-1].endswith("\n"):
        protein_lines[-1] += "\n"

    # Write combined PDB
    with open(output_pdb_path, "w") as out:
        out.writelines(protein_lines)
        out.writelines(ligand_lines)
        out.write("END\n")

    return output_pdb_path

import os
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Ensure these are defined or imported:
# combine_protein_ligand(), calculate_descriptors()

import os
import pandas as pd
import time

import os
import pandas as pd
import time
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors

def _process_complex(idx_row):
    """Build one protein-ligand complex, run dpocket on it and save its descriptors.

    Reads the module-level globals ``complex_dir``, ``descriptor_dir`` and
    ``lig_code``, which must be defined before this function is called.
    The dpocket input and output use fixed file names in the current working
    directory, so calls running at the same time would overwrite each other's
    files.

    Args:
        idx_row (tuple): ``(idx, row)`` pair; ``row`` must provide the keys
            ``'smiles'``, ``'standardized_protein_pdb'`` and
            ``'standardized_ligand_sdf'``.

    Returns:
        tuple: ``(complex_pdb_path, descriptor_csv_path, merged)`` where
            ``merged`` is the ligand descriptors concatenated column-wise with
            the dpocket table. If any step fails the error is logged and
            ``merged`` holds the ligand descriptors plus NaN placeholder
            pocket columns instead.
    """
    import shutil

    idx, row = idx_row
    smiles = row['smiles']
    protein_pdb_path = row['standardized_protein_pdb']
    ligand_sdf_path = row['standardized_ligand_sdf']

    complex_path = os.path.join(complex_dir, f"{idx}.pdb")
    desc_path = os.path.join(descriptor_dir, f"{idx}.csv")
    dpocket_output_dir = "./dpout"
    dpocket_output_file = "dpout_explicitp.txt"
    dpocket_input_file = "dp_input.txt"

    try:
        # 1. Combine protein and ligand into one PDB
        complex_file = combine_protein_ligand(protein_pdb_path, ligand_sdf_path, complex_path)

        # 2. Prepare input and run dpocket
        with open(dpocket_input_file, "w") as f:
            f.write(f"{complex_file}\t{lig_code}\n")

        # Remove and recreate dpocket output dir
        if os.path.exists(dpocket_output_dir):
            shutil.rmtree(dpocket_output_dir)
        os.makedirs(dpocket_output_dir)

        # Drop any table left over from a previous run: if dpocket fails now,
        # the read below must fail too instead of silently returning the
        # previous complex's pocket data.
        if os.path.exists(dpocket_output_file):
            os.remove(dpocket_output_file)

        # Run dpocket
        exit_status = os.system(f"dpocket -f {dpocket_input_file}")
        if exit_status != 0:
            tqdm.write(f"[WARN] idx={idx}: dpocket exited with status {exit_status}")

        # 4. Read dpocket output (raw string: '\s' is not a valid str escape)
        pocket_df = pd.read_csv(dpocket_output_file, sep=r'\s+')
        pocket_df['pdb'] = pocket_df['pdb'].str.replace('.pdb', '', regex=False)

        # 5. Compute ligand descriptors
        ligand_df = calculate_descriptors([smiles])

        # 6. Merge and save
        merged = pd.concat([ligand_df, pocket_df], axis=1)
        merged.to_csv(desc_path, index=False)
        return complex_file, desc_path, merged

    except Exception as e:
        # Log error index
        tqdm.write(f"[ERROR] idx={idx}: {str(e)}")

        # Fill everything with NaNs to match expected output shape
        ligand_df = calculate_descriptors([smiles])

        # NOTE(review): the placeholder count (41) and the generic
        # 'pocket_<i>' names are hard-coded and are not the column names of a
        # real dpocket table - confirm before stacking these rows with
        # successful ones.
        num_pocket_cols = 41  # Replace with real count if exact
        pocket_cols = [f'pocket_{i}' for i in range(num_pocket_cols)]
        pocket_nan_df = pd.DataFrame([[np.nan]*num_pocket_cols], columns=pocket_cols)

        merged = pd.concat([ligand_df, pocket_nan_df], axis=1)
        merged.to_csv(desc_path, index=False)

        return complex_path, desc_path, merged

def generate_all_complexes(df, complex_dir):
    """Write one combined protein-ligand PDB per row of ``df``.

    Args:
        df (pd.DataFrame): Must provide the columns
            ``'standardized_protein_pdb'`` and ``'standardized_ligand_sdf'``.
        complex_dir (str): Directory for the output files; created if missing.
            Each file is named ``<row index>.pdb``.

    Returns:
        list: Paths of the written PDB files, in row order.
    """
    os.makedirs(complex_dir, exist_ok=True)

    rows = tqdm(df.iterrows(), total=len(df), desc="Generating complexes", dynamic_ncols=True)
    written = []
    for row_id, record in rows:
        target = os.path.join(complex_dir, f"{row_id}.pdb")
        combine_protein_ligand(
            record['standardized_protein_pdb'],
            record['standardized_ligand_sdf'],
            target,
        )
        written.append(target)
    return written

def write_dpocket_input(complex_paths, lig_code="LIG", output_file="dp_input.txt"):
    """Write the input list for dpocket: one ``<path><TAB><lig_code>`` line per complex.

    Args:
        complex_paths (iterable): Complex PDB paths; falsy entries
            (``None``, ``""``) are skipped.
        lig_code (str): Ligand code written next to every path.
        output_file (str): File to (over)write.
    """
    with open(output_file, "w") as handle:
        handle.writelines(
            f"{entry}\t{lig_code}\n" for entry in complex_paths if entry
        )

def run_dpocket_batch(dp_input_file="dp_input.txt", output_dir="./"):
    """Run the ``dpocket`` executable on a prepared input list.

    The command is started without a shell and the file name is passed as its
    own argument, so paths containing spaces or shell metacharacters work.
    A non-zero exit status is reported instead of being discarded.

    Args:
        dp_input_file (str): Path of the dpocket input list.
        output_dir (str): Not used by this function; kept so existing calls
            that pass it keep working.

    Raises:
        FileNotFoundError: If the ``dpocket`` executable cannot be found.
    """
    import subprocess

    completed = subprocess.run(["dpocket", "-f", str(dp_input_file)])
    if completed.returncode != 0:
        tqdm.write(f"[WARN] dpocket exited with status {completed.returncode}")

    
def parse_dpocket_outputs(df, dpocket_output_file="dpout_explicitp.txt"):
    """Load a dpocket result table and keep the rows that belong to ``df``.

    The ``pdb`` column of the table is reduced to a ``complex_id`` (file name
    without directory and without ``.pdb``), converted to ``int`` when every
    id allows it, and used to select the rows whose id appears in
    ``df.index``.

    Args:
        df (pd.DataFrame): Frame whose index holds the complex ids to keep.
        dpocket_output_file (str): Whitespace-separated table written by dpocket.

    Returns:
        pd.DataFrame: ``complex_id`` as the first column followed by the
            remaining table columns. If the table cannot be read, the error is
            logged and an empty frame with only a ``complex_id`` column is
            returned.

    Raises:
        KeyError: If the table was read but has no ``pdb`` column.
    """
    try:
        # Raw string: '\s' is not a valid escape sequence in a normal str literal.
        pocket_df = pd.read_csv(dpocket_output_file, sep=r'\s+')
    except Exception as e:
        tqdm.write(f"[ERROR] couldn't read {dpocket_output_file}: {str(e)}")
        # Return early: an empty frame has no "pdb" column, so carrying on
        # would only fail with a KeyError a few lines further down.
        return pd.DataFrame(columns=["complex_id"])

    # Normalize "pdb" column to match index
    pocket_df["complex_id"] = pocket_df["pdb"].apply(lambda x: os.path.basename(x).replace(".pdb", ""))
    pocket_df = pocket_df.drop(columns=["pdb"])

    # Convert to int if possible
    try:
        pocket_df["complex_id"] = pocket_df["complex_id"].astype(int)
    except (ValueError, TypeError):
        tqdm.write("[WARN] complex_id is not integer-based")

    # Match back to df
    pocket_df = pocket_df.set_index("complex_id")
    pocket_df = pocket_df.loc[df.index.intersection(pocket_df.index)]  # align with your df

    # The selection above can take its index name from df.index; pin it so
    # the id column is always called "complex_id" after reset_index().
    pocket_df.index.name = "complex_id"
    return pocket_df.reset_index()

def compute_all_ligand_descriptors(df):
    """Compute ligand descriptors for every entry of ``df["smiles"]``.

    Args:
        df (pd.DataFrame): Frame with a ``'smiles'`` column.

    Returns:
        pd.DataFrame: Result of ``calculate_descriptors`` on that column's values.
    """
    smiles_values = df["smiles"].tolist()
    return calculate_descriptors(smiles_values)


def merge_all_outputs(ligand_df, pocket_dfs):
    """Pair each ligand row with its pocket table and stack the results.

    Row ``i`` of ``ligand_df`` is placed side by side with the ``i``-th frame
    of ``pocket_dfs``; the per-ligand frames are then concatenated vertically
    with a fresh 0..n-1 index. Pairing stops at the shorter of the two inputs.

    Args:
        ligand_df (pd.DataFrame): One row of ligand descriptors per ligand.
        pocket_dfs (iterable): One pocket DataFrame per ligand, in the same order.

    Returns:
        pd.DataFrame: The stacked frame. When there is nothing to pair, an
            empty frame carrying the ligand columns is returned instead of
            letting ``pd.concat`` raise on an empty list.
    """
    merged = []
    for (_, lig_row), pocket in zip(ligand_df.iterrows(), pocket_dfs):
        lig_df = pd.DataFrame([lig_row]).reset_index(drop=True)
        merged.append(pd.concat([lig_df, pocket.reset_index(drop=True)], axis=1))

    if not merged:
        return pd.DataFrame(columns=list(ligand_df.columns))
    return pd.concat(merged, ignore_index=True)

    
    
# Ligand code written next to each complex in the dpocket input list.
lig_code = "LIG"

# First group of column names to drop (kept as an ordered list).
# NOTE(review): these look like dpocket bookkeeping / normalised columns - confirm.
columns_to_drop_set1 = [
    "pdb",
    "lig",
    "overlap",
    "PP-crit",
    "PP-dst",
    "crit4",
    "crit5",
    "crit6",
    "crit6_continue",
    "nb_AS_norm",
    "apol_as_prop_norm",
    "mean_loc_hyd_dens_norm",
    "polarity_score_norm",
    "as_density_norm",
    "as_max_dst_norm",
    "drug_score",
]

# Second group of column names to drop (a set, unlike the list above).
columns_to_drop_set2 = {
    # pocket-level columns
    "pock_vol", "nb_AS", "mean_as_ray", "mean_as_solv_acc", "apol_as_prop",
    "mean_loc_hyd_dens", "hydrophobicity_score", "volume_score",
    "polarity_score", "charge_score", "flex", "prop_polar_atm",
    "as_density", "as_max_dst", "convex_hull_volume",
    "surf_pol_vdw14", "surf_pol_vdw22", "surf_apol_vdw14", "surf_apol_vdw22",
    "n_abpa",
    # three-letter residue codes
    "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL",
    # remaining column
    "pKd",
}

# Input

In [43]:
# Root directory for the outputs written by this notebook.
base_dir = "../data/standardized_clean"
# complex_dir and descriptor_dir are also read as module-level globals by
# _process_complex(), so they must be defined before that function is called.
complex_dir = os.path.join(base_dir, "complex")
descriptor_dir = os.path.join(base_dir, "descriptor")


# Only the first 10 rows are kept, so every later cell works on a 10-row subset.
df = pd.read_parquet("../data/standardized/standardized_input.parquet")[:10]

# Run

In [44]:
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Setup output dirs
# exist_ok=True keeps this cell safe to re-run when the directories already exist.
os.makedirs(complex_dir, exist_ok=True)
os.makedirs(descriptor_dir, exist_ok=True)

In [45]:
# 1. Write one combined protein+ligand PDB per row of df.
complex_paths = generate_all_complexes(df, complex_dir)
# 2. List every complex in the dpocket input file (defaults: "dp_input.txt", ligand code "LIG").
write_dpocket_input(complex_paths)
# 3. Run dpocket once over that list.
run_dpocket_batch()




Generating complexes:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A


Generating complexes:  30%|███       | 3/10 [00:00<00:00, 23.08it/s][A[A[A


Generating complexes:  60%|██████    | 6/10 [00:00<00:00, 18.90it/s][A[A[A


Generating complexes: 100%|██████████| 10/10 [00:00<00:00, 23.10it/s][A[A[A


<dpocket>s 1/10 - ../data/standardized_clean/complex/0.pdb:Having 27 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/0_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 706 objects in memory
<dpocket>s 2/10 - ../data/standardized_clean/complex/1.pdb:Having 706 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/1_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 864 objects in memory
<dpocket>s 3/10 - ../data/standardized_clean/complex/2.pdb:Having 864 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/2_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 1297 objects in memory
<dpocket>s 4/10 - ../data/standardized_clean/complex/3.pdb:Having 1297 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/3_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 2151 objects in memory
<dpocket>s 5/10 - ../data/standardized_clean/complex/4.pdb:Having 2151 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/4_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 2841 objects in memory
<dpocket>s 6/10 - ../data/standardized_clean/complex/5.pdb:Having 2841 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/5_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 3092 objects in memory
<dpocket>s 7/10 - ../data/standardized_clean/complex/6.pdb:Having 3092 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/6_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 3326 objects in memory
<dpocket>s 8/10 - ../data/standardized_clean/complex/7.pdb:Having 3326 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/7_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 3444 objects in memory
<dpocket>s 9/10 - ../data/standardized_clean/complex/8.pdb:Having 3444 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/8_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 3902 objects in memory
<dpocket>s 10/10 - ../data/standardized_clean/complex/9.pdb:Having 3902 objects in memory


mkdir: cannot create directory ‘../data/standardized_clean/complex/9_out/pockets’: File exists


sorting atoms and vertices or something like this 
sort finished
Having 4382 objects in memory



In [46]:
# Ligand descriptors: one row per entry of df["smiles"], in df order.
ligand_df = compute_all_ligand_descriptors(df)
# Pocket descriptors parsed from the dpocket table, restricted to the ids in df.index.
pocket_df = parse_dpocket_outputs(df, dpocket_output_file="dpout_explicitp.txt")

# Save

In [47]:
# Pair ligand and pocket rows by complex id instead of by row position.
# parse_dpocket_outputs() only returns the complexes present in the dpocket
# table, so a positional concat would shift pocket data onto the wrong ligand
# as soon as one complex is missing from that table.
ligand_by_id = ligand_df.copy()
ligand_by_id.index = df.index  # calculate_descriptors() emits one row per df row, in df order
id_column = pocket_df.columns[0]  # the id column that parse_dpocket_outputs() puts first
merged_df = ligand_by_id.join(pocket_df.set_index(id_column), how="left")
# Keep the id as a regular column right after the ligand descriptors, which is
# where the positional concat used to place it; ligands without a pocket row
# keep their id and get NaN pocket values.
merged_df.insert(len(ligand_by_id.columns), id_column, merged_df.index)
merged_df = merged_df.reset_index(drop=True)
merged_df.to_csv("../data/standardized_clean/all_descriptors.csv", index=False)

In [48]:
# Last expression of the cell: renders the merged ligand + pocket table as the cell output.
merged_df

Unnamed: 0,MolLogP,MolMR,ExactMolWt,HeavyAtomCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumAromaticRings,NumAliphaticRings,...,LEU,LYS,MET,PHE,PRO,SER,THR,TRP,TYR,VAL
0,-0.4312,30.2269,169.998025,10,3,3,7,3,0,0,...,0,2,0,0,1,0,0,0,1,0
1,0.45789,111.4763,432.146741,30,6,4,10,9,2,0,...,1,1,0,0,0,6,1,1,1,1
2,-4.2719,47.6709,232.069536,16,6,5,8,1,0,2,...,0,0,1,0,0,1,0,3,1,0
3,-6.6696,89.1354,434.142188,28,10,10,14,8,0,2,...,0,4,0,1,1,1,1,1,2,0
4,-1.863,73.6551,347.063084,23,10,5,13,4,2,1,...,2,2,0,1,0,0,3,0,2,1
5,2.143,48.3974,174.042927,13,3,0,4,1,2,0,...,2,0,1,2,0,1,1,0,3,1
6,0.0901,73.8596,285.180884,20,3,3,6,6,0,1,...,0,0,0,0,1,2,0,1,1,1
7,-1.746,84.5654,427.029415,27,12,6,17,6,2,1,...,1,1,1,1,0,1,1,2,0,1
8,-4.4883,53.2913,235.128848,16,5,6,7,2,0,1,...,0,1,0,0,0,0,1,2,3,2
9,-0.0696,37.8761,180.01876,11,3,2,6,4,0,0,...,2,1,0,2,1,0,1,0,3,1
