# Data curation
This notebook curates all available protein-ligand binding data that come with experimental affinity values.
Curation here means converting the experimental data and the structures into a shared format, before the structures themselves are standardized.

# Functions

In [None]:
import os
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
import math

def dg_to_kd(delta_g_kcal, temp_k=298.15):
    """Convert a binding free energy into a pKd value.

    Uses dG = RT * ln(Kd), hence pKd = -log10(Kd) = -dG / (RT * ln 10).
    Despite the function name, the returned number is the negative base-10
    logarithm of the dissociation constant, not Kd in molar.

    Args:
        delta_g_kcal: Binding free energy in kcal/mol, or None when unknown.
        temp_k: Absolute temperature in kelvin.

    Returns:
        The pKd as a float, or None when ``delta_g_kcal`` is None.
    """
    if delta_g_kcal is None:
        return None
    R = 1.987e-3  # gas constant in kcal/(mol*K)
    # Algebraically identical to -log10(exp(dG / RT)), but without the
    # intermediate exp() that overflows for large positive dG.
    return -delta_g_kcal / (R * temp_k * math.log(10))

def _read_experimental_dg(dg_file):
    """Parse ``<ligand> <dG>`` lines into a dict; malformed lines are skipped."""
    dg_by_ligand = {}
    if not os.path.isfile(dg_file):
        return dg_by_ligand
    with open(dg_file, "r") as handle:
        for line in handle:
            fields = line.split()
            if len(fields) < 2:
                continue
            try:
                dg_by_ligand[fields[0]] = float(fields[1])
            except ValueError:
                continue
    return dg_by_ligand


def prepare_wang_data(base_dir):
    """Collect one row per ligand folder of a Wang-2015-style directory tree.

    Expected layout: ``<base_dir>/<target>/experimental_dG.txt`` and
    ``<base_dir>/<target>/structures/<ligand>/{ligand.sdf,protein.pdb}``.
    For every ligand folder the SMILES is read from ``ligand.sdf``, a
    ``complex.pdb`` is written next to the inputs via ``combine_pdb_sdf`` and
    the experimental dG (assumed to be in kcal/mol - TODO confirm) is
    converted with ``dg_to_kd``.

    Args:
        base_dir: Root directory that contains one sub-directory per target.

    Returns:
        pandas.DataFrame with the columns ``protein_name``, ``ligand``,
        ``smiles``, ``ligand_sdf_path``, ``protein_pdb_path``,
        ``complex_pdb_path`` and ``pKd (Wang, FEP)``.  Entries are None when
        the corresponding file or measurement is not available.
    """
    columns = [
        "protein_name",
        "ligand",
        "smiles",
        "ligand_sdf_path",
        "protein_pdb_path",
        "complex_pdb_path",
        "pKd (Wang, FEP)",
    ]
    data = []
    for target in tqdm(os.listdir(base_dir)):
        target_dir = os.path.join(base_dir, target)
        structures_dir = os.path.join(target_dir, "structures")
        if not os.path.isdir(structures_dir):
            continue

        dg_by_ligand = _read_experimental_dg(
            os.path.join(target_dir, "experimental_dG.txt")
        )

        for ligand_folder in os.listdir(structures_dir):
            ligand_path = os.path.join(structures_dir, ligand_folder)
            sdf_path = os.path.join(ligand_path, "ligand.sdf")
            pdb_path = os.path.join(ligand_path, "protein.pdb")
            has_sdf = os.path.isfile(sdf_path)
            has_pdb = os.path.isfile(pdb_path)

            smiles = None
            if has_sdf:
                supplier = Chem.SDMolSupplier(sdf_path)
                first_mol = next((m for m in supplier if m is not None), None)
                if first_mol is not None:
                    smiles = Chem.MolToSmiles(first_mol)

            # Only build the complex when both inputs exist; a failure for one
            # ligand must not abort the whole curation run.
            complex_path = None
            if has_sdf and has_pdb:
                candidate = os.path.join(ligand_path, "complex.pdb")
                try:
                    combine_pdb_sdf(pdb_path, sdf_path, candidate)
                    complex_path = candidate
                except (ValueError, OSError) as err:
                    print(f"Could not build complex for {ligand_path}: {err}")

            # Ligands without an experimental value keep an empty affinity
            # instead of raising inside dg_to_kd.
            dg_value = dg_by_ligand.get(ligand_folder)
            pkd = dg_to_kd(dg_value, temp_k=298.15) if dg_value is not None else None

            data.append({
                "protein_name": target,
                "ligand": ligand_folder,
                "smiles": smiles,
                "ligand_sdf_path": sdf_path if has_sdf else None,
                "protein_pdb_path": pdb_path if has_pdb else None,
                "complex_pdb_path": complex_path,
                "pKd (Wang, FEP)": pkd,
            })

    # Explicit columns keep the schema stable even when no ligand was found.
    return pd.DataFrame(data, columns=columns)
        

from rdkit import Chem
import os

from rdkit import Chem
def combine_pdb_sdf(protein_pdb_path, ligand_sdf_path, output_pdb_path):
    """Write a protein-ligand complex PDB from a protein PDB and a ligand SDF.

    The ligand atoms are tagged as HETATM records of residue ``LIG`` number 1
    in chain ``A`` and appended after the protein records.

    Args:
        protein_pdb_path: Path of the protein PDB file to copy.
        ligand_sdf_path: Path of the ligand SDF/MOL file.
        output_pdb_path: Path of the complex PDB file to create.

    Raises:
        ValueError: If RDKit cannot load the ligand.
        OSError: If the protein file cannot be read or the output written.
    """
    # Load ligand from SDF
    mol = Chem.MolFromMolFile(ligand_sdf_path, removeHs=False)
    if mol is None:
        raise ValueError("Invalid SDF file or molecule failed to load")

    # Set residue and chain info properly
    for atom in mol.GetAtoms():
        info = Chem.AtomPDBResidueInfo()
        info.SetName(" {:<2}".format(atom.GetSymbol()))  # Proper spacing
        info.SetResidueName("LIG")
        info.SetResidueNumber(1)
        info.SetChainId("A")  # <-- Important for dpocket
        info.SetIsHeteroAtom(True)
        atom.SetMonomerInfo(info)

    # Drop END records and blank lines from the ligand block so the combined
    # file ends with exactly one END record.
    # NOTE(review): ligand atom serials presumably restart at 1, so any CONECT
    # records kept here may collide with protein serials - confirm.
    ligand_lines = [
        line
        for line in Chem.MolToPDBBlock(mol).splitlines()
        if line.strip() and not line.startswith("END")
    ]

    # Read the protein before opening the output so that a missing input does
    # not leave an empty complex file behind.
    with open(protein_pdb_path, "r") as prot_file:
        protein_lines = [line for line in prot_file if not line.startswith("END")]

    with open(output_pdb_path, "w") as out:
        for line in protein_lines:
            # The last protein line may lack a trailing newline.
            out.write(line if line.endswith("\n") else line + "\n")
        out.writelines(line + "\n" for line in ligand_lines)
        out.write("END\n")


import pandas as pd
from pathlib import Path

def build_hqbind_pose_dataset(metadata_path="HQBind/hiqbind_metadata.csv", root="HQBind/raw_data_hiq_sm"):
    """Join the HiQBind metadata table with the structure files found on disk.

    Each metadata row maps to the folder
    ``<root>/<PDBID>/<PDBID>_<Ligand Name>_<Ligand Chain>_<Ligand Residue Number>``;
    a row is kept only when both the ``*_ligand.pdb`` and ``*_protein.pdb``
    files exist in that folder.

    Args:
        metadata_path: CSV file with the HiQBind metadata.
        root: Directory that holds the per-PDB structure folders.

    Returns:
        pandas.DataFrame with the copied metadata columns plus
        ``Ligand PDB Path`` and ``Protein PDB Path``.
    """
    copied_columns = (
        "PDBID",
        "Resolution",
        "Ligand Name",
        "Ligand Chain",
        "Ligand Residue Number",
        "Ligand SMILES",
        "Binding Affinity Measurement",
        "Binding Affinity Sign",
        "Binding Affinity Value",
        "Binding Affinity Unit",
        "Binding Affinity Source",
    )
    metadata = pd.read_csv(metadata_path)
    root_dir = Path(root)

    kept = []
    for _, entry in metadata.iterrows():
        code = entry["PDBID"]
        tag = "{}_{}_{}_{}".format(
            code,
            entry["Ligand Name"],
            entry["Ligand Chain"],
            entry["Ligand Residue Number"],
        )
        folder = root_dir / code / tag
        ligand_file = folder / f"{tag}_ligand.pdb"
        protein_file = folder / f"{tag}_protein.pdb"

        # Keep the entry only when the whole pose is available on disk.
        if ligand_file.exists() and protein_file.exists():
            record = {column: entry[column] for column in copied_columns}
            record["Ligand PDB Path"] = str(ligand_file)
            record["Protein PDB Path"] = str(protein_file)
            kept.append(record)

    return pd.DataFrame(kept)

def format_hqbind(df_pose):
    """Reduce a HiQBind pose table to the curated columns and add molar affinities.

    Args:
        df_pose: DataFrame as returned by ``build_hqbind_pose_dataset``.

    Returns:
        A new DataFrame (the input is not modified) with the selected
        columns plus ``Binding Affinity (M)`` (value converted to molar) and
        ``neg log10(M)``.  Both are NaN when the value is missing or the unit
        is unknown; ``neg log10(M)`` is also NaN for non-positive values.
    """
    unit_to_M = {
        "M": 1,
        "mM": 1e-3,
        "uM": 1e-6,
        "µM": 1e-6,  # micro sign
        "μM": 1e-6,  # Greek mu
        "nM": 1e-9,
        "pM": 1e-12,
    }

    # Work on an explicit copy so the new columns never write into a view of
    # the caller's frame.
    df_pose = df_pose[[
        "Ligand PDB Path",
        "Protein PDB Path",
        "Ligand SMILES",
        "Resolution",
        "Binding Affinity Measurement",
        "Binding Affinity Sign",
        "Binding Affinity Value",
        "Binding Affinity Unit",
    ]].copy()

    # Vectorized conversion; unlike a row-wise apply this also works on an
    # empty frame.
    values = pd.to_numeric(df_pose["Binding Affinity Value"], errors="coerce")
    factors = df_pose["Binding Affinity Unit"].map(unit_to_M)
    molar = (values * factors).astype(float)

    df_pose["Binding Affinity (M)"] = molar
    # log10 is only defined for strictly positive concentrations.
    df_pose["neg log10(M)"] = -np.log10(molar.where(molar > 0))
    return df_pose


from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from tqdm import tqdm

def process_binding_data(df):
    """Convert BindingNet v1 index rows to molar affinities and attach file paths.

    Only rows with ``standard_relation == "="`` are kept.  The input frame is
    not modified.

    Args:
        df: Index table with the columns ``standard_relation``,
            ``standard_units``, ``standard_value``, ``Molecule MW``,
            ``standard_type``, ``Protein Source``, ``Target ChEMBLID`` and
            ``Molecule ChEMBLID``.

    Returns:
        dict mapping each ``standard_type`` to a DataFrame (index reset) that
        has the extra columns ``Molar``, ``-log10(M)``, ``SDF_path`` and
        ``PDB_path``.
    """
    # Keep only standard_relation == "="
    df = df[df["standard_relation"] == "="].copy()

    molar_factor = {
        "M": 1.0,
        "mM": 1e-3,
        "uM": 1e-6,
        "µM": 1e-6,  # micro sign
        "μM": 1e-6,  # Greek mu
        "nM": 1e-9,
        "pM": 1e-12,
        "fM": 1e-15,
    }
    # Mass concentrations expressed in g/L: 1 ug/mL = 1e-3 g/L, 1 mg/mL = 1 g/L.
    grams_per_litre = {"ug.mL-1": 1e-3, "mg.mL-1": 1.0}

    values = pd.to_numeric(df["standard_value"], errors="coerce")
    units = df["standard_units"]
    mol_weight = pd.to_numeric(df["Molecule MW"], errors="coerce")

    from_molar = values * units.map(molar_factor)
    # The molecular weight (g/mol) is only needed for mass concentrations, so
    # a missing MW does not invalidate values that are already molar.
    from_mass = values * units.map(grams_per_litre) / mol_weight.where(mol_weight > 0)
    df["Molar"] = from_molar.fillna(from_mass).astype(float)

    # log10 is only defined for strictly positive concentrations.
    df["-log10(M)"] = -np.log10(df["Molar"].where(df["Molar"] > 0))

    source = df["Protein Source"].astype(str)
    target = df["Target ChEMBLID"].astype(str)
    molecule = df["Molecule ChEMBLID"].astype(str)
    stem = (
        "../data/raw/BindingNetv1/BindingNetv1/" + source
        + "/target_" + target
        + "/" + molecule
        + "/" + source + "_" + target + "_" + molecule
    )
    df["SDF_path"] = stem + ".sdf"
    df["PDB_path"] = stem + ".pdb"

    # Group by standard_type
    grouped = {std_type: group.reset_index(drop=True) for std_type, group in df.groupby("standard_type")}
    return grouped


def process_binding_data_bindingdnetv2(df):
    """Convert BindingNet v2 index rows to molar affinities (per-source file layout).

    Structure files are looked up as
    ``../data/raw/BindingNetv2/<level>/<Protein Source>/target_<Target>/<Molecule>/<Protein Source>_<Target>_<Molecule>.{sdf,pdb}``
    with ``<level>`` tried as ``high`` first and ``moderate`` second.  Rows
    without both files at either level are dropped.

    NOTE: a second function with the same name is defined further down in
    this file and replaces this one once the whole file has been executed.

    Args:
        df: Index table with the columns ``standard_relation``,
            ``standard_units``, ``standard_value``, ``Molecule MW``,
            ``standard_type``, ``Protein Source``, ``Target ChEMBLID`` and
            ``Molecule ChEMBLID``.  It is not modified.

    Returns:
        dict mapping each ``standard_type`` to a DataFrame (index reset) with
        the extra columns ``Molar``, ``-log10(M)``, ``high_low``,
        ``SDF_path`` and ``PDB_path``.
    """
    # Bound before any use: a later in-body import would make `os` a local
    # that is still unassigned when the file checks run.
    import os

    # Keep only standard_relation == "="
    df = df[df["standard_relation"] == "="].copy()

    molar_factor = {
        "M": 1.0,
        "mM": 1e-3,
        "uM": 1e-6,
        "µM": 1e-6,  # micro sign
        "μM": 1e-6,  # Greek mu
        "nM": 1e-9,
        "pM": 1e-12,
        "fM": 1e-15,
    }
    # Mass concentrations expressed in g/L: 1 ug/mL = 1e-3 g/L, 1 mg/mL = 1 g/L.
    grams_per_litre = {"ug.mL-1": 1e-3, "mg.mL-1": 1.0}

    values = pd.to_numeric(df["standard_value"], errors="coerce")
    units = df["standard_units"]
    mol_weight = pd.to_numeric(df["Molecule MW"], errors="coerce")

    from_molar = values * units.map(molar_factor)
    # The molecular weight (g/mol) is only needed for mass concentrations.
    from_mass = values * units.map(grams_per_litre) / mol_weight.where(mol_weight > 0)
    df["Molar"] = from_molar.fillna(from_mass).astype(float)
    # log10 is only defined for strictly positive concentrations.
    df["-log10(M)"] = -np.log10(df["Molar"].where(df["Molar"] > 0))

    # Resolve the quality level and both file paths in a single pass; building
    # plain lists also works when no row is left.
    levels, sdf_paths, pdb_paths = [], [], []
    id_triples = zip(df["Protein Source"], df["Target ChEMBLID"], df["Molecule ChEMBLID"])
    for source, target, molecule in tqdm(id_triples, total=len(df)):
        level = sdf_path = pdb_path = None
        for candidate in ("high", "moderate"):
            stem = (
                f"../data/raw/BindingNetv2/{candidate}/{source}/target_{target}/"
                f"{molecule}/{source}_{target}_{molecule}"
            )
            if os.path.exists(f"{stem}.sdf") and os.path.exists(f"{stem}.pdb"):
                level, sdf_path, pdb_path = candidate, f"{stem}.sdf", f"{stem}.pdb"
                break
        levels.append(level)
        sdf_paths.append(sdf_path)
        pdb_paths.append(pdb_path)

    df["high_low"] = levels
    df["SDF_path"] = sdf_paths
    df["PDB_path"] = pdb_paths
    df = df[df["high_low"].notna()].copy()

    # Group by standard_type
    grouped = {std_type: group.reset_index(drop=True) for std_type, group in df.groupby("standard_type")}
    return grouped

import os

def process_binding_data_bindingdnetv2(df):
    """Convert BindingNet v2 index rows to molar affinities and attach file paths.

    Structure files are looked up as
    ``../data/raw/BindingNetv2/<level>/target_<Target>/<Molecule>/{ligand.sdf,protein.pdb}``
    with ``<level>`` tried as ``high`` first and ``moderate`` second.  Rows
    without both files at either level are dropped.

    NOTE: this definition overrides the earlier function of the same name in
    this file.

    Args:
        df: Index table with the columns ``standard_relation``,
            ``standard_units``, ``standard_value``, ``Molecule MW``,
            ``standard_type``, ``Target ChEMBLID`` and ``Molecule ChEMBLID``.
            It is not modified.

    Returns:
        Tuple ``(grouped, df)``: ``grouped`` maps each ``standard_type`` to a
        DataFrame (index reset); ``df`` is the full filtered table.  Both
        carry the extra columns ``Molar``, ``-log10(M)``, ``high_low``,
        ``SDF_path`` and ``PDB_path``.
    """
    # Keep only standard_relation == "="
    df = df[df["standard_relation"] == "="].copy()

    molar_factor = {
        "M": 1.0,
        "mM": 1e-3,
        "uM": 1e-6,
        "µM": 1e-6,  # micro sign
        "μM": 1e-6,  # Greek mu
        "nM": 1e-9,
        "pM": 1e-12,
        "fM": 1e-15,
    }
    # Mass concentrations expressed in g/L: 1 ug/mL = 1e-3 g/L, 1 mg/mL = 1 g/L.
    grams_per_litre = {"ug.mL-1": 1e-3, "mg.mL-1": 1.0}

    values = pd.to_numeric(df["standard_value"], errors="coerce")
    units = df["standard_units"]
    mol_weight = pd.to_numeric(df["Molecule MW"], errors="coerce")

    from_molar = values * units.map(molar_factor)
    # The molecular weight (g/mol) is only needed for mass concentrations.
    from_mass = values * units.map(grams_per_litre) / mol_weight.where(mol_weight > 0)
    df["Molar"] = from_molar.fillna(from_mass).astype(float)
    # log10 is only defined for strictly positive concentrations.
    df["-log10(M)"] = -np.log10(df["Molar"].where(df["Molar"] > 0))

    # Resolve the quality level and both file paths in a single pass; building
    # plain lists also works when no row is left.
    levels, sdf_paths, pdb_paths = [], [], []
    id_pairs = zip(df["Target ChEMBLID"], df["Molecule ChEMBLID"])
    for target_id, mol_id in tqdm(id_pairs, total=len(df)):
        level = sdf_path = pdb_path = None
        for candidate in ("high", "moderate"):
            base = f"../data/raw/BindingNetv2/{candidate}/target_{target_id}/{mol_id}"
            if os.path.exists(f"{base}/ligand.sdf") and os.path.exists(f"{base}/protein.pdb"):
                level = candidate
                sdf_path = f"{base}/ligand.sdf"
                pdb_path = f"{base}/protein.pdb"
                break
        levels.append(level)
        sdf_paths.append(sdf_path)
        pdb_paths.append(pdb_path)

    df["high_low"] = levels
    df["SDF_path"] = sdf_paths
    df["PDB_path"] = pdb_paths
    df = df[df["high_low"].notna()].copy()

    # Group by standard_type
    grouped = {std_type: group.reset_index(drop=True) for std_type, group in df.groupby("standard_type")}
    return grouped, df


import os
import pandas as pd
from tqdm import tqdm
import re


def perpare_refined_PDBbind2020(base_dir='../data/raw/PDBbind2020/PDBbind2020/main'):
    """Parse the PDBbind 2020 refined-set index and locate its structure files.

    Reads ``<base_dir>/index/INDEX_refined_data.2020`` and keeps the entries
    whose binding string contains ``=`` and for which both
    ``<base_dir>/refined-set/<id>/<id>_protein.pdb`` and ``<id>_ligand.sdf``
    exist.  Malformed index lines are skipped.

    Args:
        base_dir: Root folder of the PDBbind download (contains ``index`` and
            ``refined-set``).

    Returns:
        pandas.DataFrame with the columns ``pdbid``, ``resolution``, ``year``,
        ``-log_affinity`` (as listed in the index), ``activity_type``,
        ``value``, ``unit``, ``protein_path``, ``ligand_path`` and
        ``-log10_affinity_M`` (recomputed from value and unit; NaN for an
        unknown unit or a non-positive value).
    """
    import numpy as np

    index_file = os.path.join(base_dir, 'index', 'INDEX_refined_data.2020')
    refined_dir = os.path.join(base_dir, 'refined-set')

    columns = [
        'pdbid', 'resolution', 'year', '-log_affinity', 'activity_type',
        'value', 'unit', 'protein_path', 'ligand_path',
    ]
    records = []

    with open(index_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            # Split by multiple spaces
            parts = re.split(r'\s{2,}', stripped)
            if len(parts) < 5:
                continue

            try:
                pdbid = parts[0]
                resolution = float(parts[1])
                year = int(parts[2])
                log_affinity = float(parts[3])
            except ValueError:
                continue  # Skip malformed rows

            # Parse binding string like "Ki=0.1uM", "Ki<=100nM"; partition
            # tolerates more than one "=" in the field.
            activity_part, separator, value_unit_part = parts[4].partition('=')
            if not separator:
                continue
            activity_type = activity_part.strip().replace("<", "").replace(">", "")
            value_unit_part = value_unit_part.strip().replace("<", "").replace(">", "")
            value_str = ''.join(ch for ch in value_unit_part if ch in '0123456789.-')
            unit = ''.join(ch for ch in value_unit_part if ch.isalpha())

            try:
                value = float(value_str)
            except ValueError:
                continue  # skip invalid value

            # File paths
            protein_path = os.path.join(refined_dir, pdbid, f"{pdbid}_protein.pdb")
            ligand_path = os.path.join(refined_dir, pdbid, f"{pdbid}_ligand.sdf")

            if not os.path.exists(protein_path) or not os.path.exists(ligand_path):
                continue

            records.append({
                'pdbid': pdbid,
                'resolution': resolution,
                'year': year,
                '-log_affinity': log_affinity,
                'activity_type': activity_type,
                'value': value,
                'unit': unit,
                'protein_path': protein_path,
                'ligand_path': ligand_path
            })

    # Explicit columns keep the schema stable when no entry was kept.
    df = pd.DataFrame(records, columns=columns)

    unit_multipliers = {
        'M': 1,
        'mM': 1e-3,
        'uM': 1e-6,
        'nM': 1e-9,
        'pM': 1e-12,
        'fM': 1e-15
    }

    # Vectorized so that an empty table is handled as well.
    molar = (df['value'] * df['unit'].map(unit_multipliers)).astype(float)
    df['-log10_affinity_M'] = -np.log10(molar.where(molar > 0))
    return df


import os
import pandas as pd
import re
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def parse_affinity_column(entry):
    """Extract ``<type>=<number><unit>`` affinities from one annotation string.

    Recognised types are Ki, Kd, IC50 and EC50.  For every match whose number
    parses as a float, the keys ``<type>_value`` and ``<type>_unit`` are set;
    a later match of the same type overwrites an earlier one.

    Args:
        entry: Annotation text, or a missing value.

    Returns:
        dict with the parsed values and units; empty when ``entry`` is
        missing or nothing matched.
    """
    parsed = {}
    if pd.isna(entry):
        return parsed

    affinity_regex = r"(Ki|Kd|IC50|EC50)=([0-9eE\.\+\-]+)([a-zA-Zμµ]*)"
    for hit in re.finditer(affinity_regex, entry):
        kind = hit.group(1)
        number_text = hit.group(2)
        unit_text = hit.group(3)
        try:
            number = float(number_text)
        except ValueError:
            # The character class also admits strings such as "1-2".
            continue
        parsed[f"{kind}_value"] = number
        parsed[f"{kind}_unit"] = unit_text.strip()
    return parsed

def convert_to_log_molar(value, unit):
    """Return ``-log10`` of a concentration after converting it to molar.

    Args:
        value: Concentration expressed in ``unit``.
        unit: One of M, mM, uM, μM, µM, nM, pM or fM.

    Returns:
        The negative base-10 logarithm of the molar concentration, or None
        when the unit is not recognised or the value is not positive.
    """
    molar_per_unit = {
        'M': 1,
        'mM': 1e-3,
        'uM': 1e-6,
        'μM': 1e-6,
        'µM': 1e-6,
        'nM': 1e-9,
        'pM': 1e-12,
        'fM': 1e-15
    }
    scale = molar_per_unit.get(unit)
    if scale is None:
        return None
    if value <= 0:
        return None
    molar = value * scale
    return -np.log10(molar)

def extract_smiles_from_pdb(pdb_file):
    """Read a PDB file with RDKit and return its SMILES.

    Args:
        pdb_file: Path of the PDB file.

    Returns:
        The SMILES string, or None when RDKit cannot parse or sanitize the
        molecule.
    """
    try:
        mol = Chem.MolFromPDBFile(pdb_file, removeHs=False)
        if mol is None:
            return None
        Chem.SanitizeMol(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort conversion; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

def process_biolip_dataset(biolip_path, ligand_dir, receptor_dir):
    """Build per-affinity-type tables from a BioLiP annotation file.

    For every annotation row the affinity strings of the four sources
    (literature, MOAD, PDBbind, BindingDB - in that priority order) are
    parsed.  For each affinity type the first source that yields a valid
    value is used.  Rows are kept only when the ligand file
    ``<pdb>_<ligand>_<chain>_<serial>.pdb`` and the receptor file
    ``<pdb><chain>.pdb`` exist and a SMILES can be derived from the ligand.

    Args:
        biolip_path: Tab-separated BioLiP annotation file without header.
        ligand_dir: Directory with the ligand PDB files.
        receptor_dir: Directory with the receptor PDB files.

    Returns:
        dict with the keys ``Ki``, ``Kd``, ``IC50`` and ``EC50``; each value
        is a DataFrame with the columns ``protein_path``, ``ligand_path``,
        ``smiles`` and ``-log10_affinity_M``.
    """
    col_names = [
        "pdb_id", "receptor_chain", "resolution", "binding_site_id", "ligand_id",
        "ligand_chain", "ligand_serial", "bs_residues_pdb", "bs_residues_renum",
        "catalytic_residues_pdb", "catalytic_residues_renum", "ec_number",
        "go_terms", "affinity_lit", "affinity_moad", "affinity_pdbbind",
        "affinity_bindingdb", "uniprot_id", "pubmed_id", "ligand_seq_number",
        "receptor_sequence"
    ]

    # Read everything as text and treat only empty fields as missing.  With
    # the pandas defaults an identifier spelled "NA" becomes NaN, and
    # numeric-looking identifiers/serials are converted to numbers, which
    # breaks the file names built from them below.
    df = pd.read_csv(
        biolip_path,
        sep="\t",
        header=None,
        names=col_names,
        dtype=str,
        keep_default_na=False,
        na_values=[""],
    )

    # Extract affinities
    sources = ["affinity_lit", "affinity_moad", "affinity_pdbbind", "affinity_bindingdb"]
    for src in sources:
        parsed = df[src].apply(parse_affinity_column)
        parsed_df = pd.DataFrame(parsed.tolist(), index=df.index).add_prefix(f"{src}_")
        df = pd.concat([df, parsed_df], axis=1)

    out_columns = ["protein_path", "ligand_path", "smiles", "-log10_affinity_M"]
    result_rows = {
        'Ki': [],
        'Kd': [],
        'IC50': [],
        'EC50': []
    }

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Cheap check first: which affinity types does this row provide?
        hits = {}
        for aff_type in result_rows:
            for source in sources:
                value = row.get(f"{source}_{aff_type}_value")
                unit = row.get(f"{source}_{aff_type}_unit")
                if pd.notna(value) and pd.notna(unit):
                    log_aff = convert_to_log_molar(value, unit)
                    if log_aff is not None:
                        hits[aff_type] = log_aff
                        break  # stop at first valid source
        if not hits:
            continue

        pdb_id = row["pdb_id"]
        ligand_fname = f"{pdb_id}_{row['ligand_id']}_{row['ligand_chain']}_{row['ligand_serial']}.pdb"
        receptor_fname = f"{pdb_id}{row['receptor_chain']}.pdb"

        ligand_path = os.path.join(ligand_dir, ligand_fname)
        receptor_path = os.path.join(receptor_dir, receptor_fname)

        if not os.path.exists(ligand_path) or not os.path.exists(receptor_path):
            continue

        # The structure is parsed only for rows that contribute a value.
        smiles = extract_ligands_from_pdb(ligand_path)
        if smiles is None:
            continue

        for aff_type, log_aff in hits.items():
            result_rows[aff_type].append({
                "protein_path": receptor_path,
                "ligand_path": ligand_path,
                "smiles": smiles,
                "-log10_affinity_M": log_aff
            })

    # Create dataframes; explicit columns keep empty tables well-formed.
    return {k: pd.DataFrame(v, columns=out_columns) for k, v in result_rows.items()}


def extract_ligands_from_pdb(file_path):
    """Return the SMILES of the ligand stored in an SDF or PDB file.

    For SDF files the first molecule RDKit can read is used.  Molecules are
    loaded with ``sanitize=False`` and ``removeHs=True``.

    Args:
        file_path: Path of a ``.sdf`` or ``.pdb`` file.

    Returns:
        The SMILES string, or None when the file does not exist, has another
        extension, or cannot be converted.
    """
    import os

    if not os.path.isfile(file_path):
        return None

    ext = os.path.splitext(file_path)[-1].lower()
    if ext not in (".sdf", ".pdb"):
        return None

    from rdkit import Chem

    try:
        if ext == ".sdf":
            supplier = Chem.SDMolSupplier(file_path, sanitize=False, removeHs=True)
            mol = next((m for m in supplier if m is not None), None)
        else:
            mol = Chem.MolFromPDBFile(file_path, sanitize=False, removeHs=True)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort conversion: any RDKit failure means "no SMILES".
        return None


## 1. Wang 2015 FEP - Ross structures (experimental ΔG converted to pKd via ΔG = RT·ln Kd)

In [None]:
# Curate the Wang 2015 FEP benchmark (Ross structures) into the shared schema.
base_dir = "../data/raw/Wang_2015_Ross_structures"
df = prepare_wang_data(base_dir).loc[
    :, ["protein_pdb_path","ligand_sdf_path", "smiles", "pKd (Wang, FEP)"]
]
print(df.columns.tolist())
# Placeholder column: no resolution value is attached to these rows.
df = df.assign(resolution=None)
df.to_parquet("../data/curated/exp/pKd_FEP_Wang_2015.parquet", index = False)
df.head(1)

## 2. Zariquiey structures extended from Wang 2015 (experimental ΔG converted to pKd via ΔG = RT·ln Kd)

In [None]:
# Curate the Zariquiey extension of the Wang 2015 set into the shared schema.
base_dir = "../data/raw/Wang_2015_Zariquiey_structures_extended"
df = prepare_wang_data(base_dir).loc[
    :, ["protein_pdb_path","ligand_sdf_path", "smiles", "pKd (Wang, FEP)"]
]
print(df.columns.tolist())
# Placeholder column: no resolution value is attached to these rows.
df = df.assign(resolution=None)
df.to_parquet("../data/curated/exp/pKd_FEP_Zariquiey_extended_Wang_2015.parquet", index = False)
df.head(1)

## 3. PDBbind2020

In [None]:
# Load the PDBbind2020 refined set and derive one SMILES per ligand file.
df = perpare_refined_PDBbind2020()
df['smiles'] = list(map(extract_ligands_from_pdb, tqdm(df["ligand_path"].tolist())))

In [None]:
# Show the available column names.
list(df.columns)

a. pKi

In [None]:
# Ki entries of PDBbind2020, exported with the shared column names.
df_Ki = df.loc[
    df["activity_type"] == "Ki",
    ["protein_path","ligand_path", "smiles", "-log10_affinity_M", 'resolution'],
].rename(columns={
    "protein_path": "protein_pdb_path",
    "ligand_path": "ligand_sdf_path",
    "-log10_affinity_M": "pKi",
})
df_Ki.to_parquet("../data/curated/exp/pKi_PDBbind2020.parquet", index = False)
df_Ki.head(1)

b. pKd

In [None]:
# Kd entries of PDBbind2020, exported with the shared column names.
df_Kd = df.loc[
    df["activity_type"] == "Kd",
    ["protein_path","ligand_path", "smiles", "-log10_affinity_M", 'resolution'],
].rename(columns={
    "protein_path": "protein_pdb_path",
    "ligand_path": "ligand_sdf_path",
    "-log10_affinity_M": "pKd",
})
df_Kd.to_parquet("../data/curated/exp/pKd_PDBbind2020.parquet", index = False)
df_Kd.head(1)

## 4. HiQBind

In [None]:
# Locate the HiQBind metadata and structures, then build the formatted pose table.
metadata_path="../data/raw/HiQBind//hiqbind_metadata.csv"
root_path="../data/raw/HiQBind//raw_data_hiq_sm"

df = format_hqbind(build_hqbind_pose_dataset(metadata_path, root_path))

In [None]:
# Show the available column names.
print(list(df.columns))

a. pKi

In [None]:
# Ki measurements of HiQBind, exported with the shared column names.
df_Ki = df.loc[
    df["Binding Affinity Measurement"] == "ki",
    ["Protein PDB Path","Ligand PDB Path", "Ligand SMILES", "neg log10(M)", "Resolution"],
].rename(columns={
    "Protein PDB Path": "protein_pdb_path",
    "Ligand PDB Path": "ligand_sdf_path",
    "Ligand SMILES": "smiles",
    "neg log10(M)": "pKi",
    "Resolution": "resolution",
})
df_Ki.to_parquet("../data/curated/exp/pKi_HiQBind.parquet", index = False)
df_Ki.head(1)

b. pKd

In [None]:
# Kd measurements of HiQBind, exported with the shared column names.
df_Kd = df.loc[
    df["Binding Affinity Measurement"] == "kd",
    ["Protein PDB Path","Ligand PDB Path", "Ligand SMILES", "neg log10(M)", "Resolution"],
].rename(columns={
    "Protein PDB Path": "protein_pdb_path",
    "Ligand PDB Path": "ligand_sdf_path",
    "Ligand SMILES": "smiles",
    "neg log10(M)": "pKd",
    "Resolution": "resolution",
})
df_Kd.to_parquet("../data/curated/exp/pKd_HiQBind.parquet", index = False)
df_Kd.head(1)

c. pIC50

In [None]:
# IC50 measurements of HiQBind, exported with the shared column names.
df_IC50 = df.loc[
    df["Binding Affinity Measurement"] == "ic50",
    ["Protein PDB Path","Ligand PDB Path", "Ligand SMILES", "neg log10(M)", "Resolution"],
].rename(columns={
    "Protein PDB Path": "protein_pdb_path",
    "Ligand PDB Path": "ligand_sdf_path",
    "Ligand SMILES": "smiles",
    "neg log10(M)": "pIC50",
    "Resolution": "resolution",
})
df_IC50.to_parquet("../data/curated/exp/pIC50_HiQBind.parquet", index = False)
df_IC50.head(1)

d. pEC50

In [None]:
# EC50 measurements of HiQBind, exported with the shared column names.
df_EC50 = df.loc[
    df["Binding Affinity Measurement"] == "ec50",
    ["Protein PDB Path","Ligand PDB Path", "Ligand SMILES", "neg log10(M)", "Resolution"],
].rename(columns={
    "Protein PDB Path": "protein_pdb_path",
    "Ligand PDB Path": "ligand_sdf_path",
    "Ligand SMILES": "smiles",
    "neg log10(M)": "pEC50",
    "Resolution": "resolution",
})
df_EC50.to_parquet("../data/curated/exp/pEC50_HiQBind.parquet", index = False)
df_EC50.head(1)

## 5. BioLip2

In [None]:
# Input locations of the BioLip2 annotation table and its structure files.
biolip_path = "../data/raw/BioLip2/BioLigInfo.txt"
ligand_dir = "../data/raw/BioLip2/biolip_downloads/biolip_redundant_all/ligand"
receptor_dir = "../data/raw/BioLip2/biolip_downloads/biolip_redundant_all/receptor"

# One table per affinity type (Ki, Kd, IC50, EC50).
datasets = process_biolip_dataset(
    biolip_path=biolip_path,
    ligand_dir=ligand_dir,
    receptor_dir=receptor_dir,
)

In [None]:
# Export the BioLip2 Ki table in the shared schema.  A renamed copy is used so
# the cached frame in `datasets` stays untouched and the cell can be re-run.
df = (
    datasets["Ki"]
    .rename(columns={
        "protein_path": "protein_pdb_path",
        "ligand_path": "ligand_sdf_path",
        "-log10_affinity_M": "pKi",
    })
    .assign(resolution=None)  # placeholder: resolution is not carried over
)
df.to_parquet("../data/curated/exp/pKi_BioLip2.parquet", index = False)
df.head(1)

In [None]:
# Export the BioLip2 Kd table in the shared schema.  A renamed copy is used so
# the cached frame in `datasets` stays untouched and the cell can be re-run.
df = (
    datasets["Kd"]
    .rename(columns={
        "protein_path": "protein_pdb_path",
        "ligand_path": "ligand_sdf_path",
        "-log10_affinity_M": "pKd",
    })
    .assign(resolution=None)  # placeholder: resolution is not carried over
)
df.to_parquet("../data/curated/exp/pKd_BioLip2.parquet", index = False)
df.head(1)

In [None]:
# Export the BioLip2 IC50 table in the shared schema.  A renamed copy is used
# so the cached frame in `datasets` stays untouched and the cell can be re-run.
df = (
    datasets["IC50"]
    .rename(columns={
        "protein_path": "protein_pdb_path",
        "ligand_path": "ligand_sdf_path",
        "-log10_affinity_M": "pIC50",
    })
    .assign(resolution=None)  # placeholder: resolution is not carried over
)
df.to_parquet("../data/curated/exp/pIC50_BioLip2.parquet", index = False)
df.head(1)

In [None]:
# Export the BioLip2 EC50 table in the shared schema.  A renamed copy is used
# so the cached frame in `datasets` stays untouched and the cell can be re-run.
df = (
    datasets["EC50"]
    .rename(columns={
        "protein_path": "protein_pdb_path",
        "ligand_path": "ligand_sdf_path",
        "-log10_affinity_M": "pEC50",
    })
    .assign(resolution=None)  # placeholder: resolution is not carried over
)
df.to_parquet("../data/curated/exp/pEC50_BioLip2.parquet", index = False)
df.head(1)

## 6. BindingNet v1

In [None]:
# Load the combined BindingNet index and keep the v1 entries only.
df = pd.read_csv("../data/raw/BindingNetv2/Index_for_BindingNetv1_and_BindingNetv2.csv")
df = df.loc[df["Dataset"].eq("BindingNet v1")]
print(df.columns.tolist())
# One table per standard_type; e.g. data_by_type["Ki"] holds the Ki rows.
data_by_type = process_binding_data(df)

In [None]:
# Export the BindingNet v1 Ki rows in the shared schema.
df_Ki = data_by_type["Ki"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_Ki.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pKi"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_Ki["resolution"] = None
df_Ki.to_parquet("../data/curated/exp/pKi_BindingNetv1.parquet", index = False)
df_Ki.head(1)

In [None]:
# Export the BindingNet v1 Kd rows in the shared schema.
df_Kd = data_by_type["Kd"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_Kd.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pKd"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_Kd["resolution"] = None
df_Kd.to_parquet("../data/curated/exp/pKd_BindingNetv1.parquet", index = False)
df_Kd.head(1)

In [None]:
# Export the BindingNet v1 IC50 rows in the shared schema.
df_IC50 = data_by_type["IC50"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
# The affinity column is labelled pIC50 so it matches the data and the file name.
df_IC50.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pIC50"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_IC50["resolution"] = None
df_IC50.to_parquet("../data/curated/exp/pIC50_BindingNetv1.parquet", index = False)
df_IC50.head(1)

In [None]:
# Export the BindingNet v1 EC50 rows in the shared schema.
df_EC50 = data_by_type["EC50"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_EC50.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pEC50"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_EC50["resolution"] = None
df_EC50.to_parquet("../data/curated/exp/pEC50_BindingNetv1.parquet", index = False)
df_EC50.head(1)

## 7. BindingNet v2
> Only the moderate- and high-quality subsets were downloaded (not the low-quality one), for storage reasons.

In [None]:
# Load the combined BindingNet index and keep the v2 entries only.
df = pd.read_csv("../data/raw/BindingNetv2/Index_for_BindingNetv1_and_BindingNetv2.csv")
df = df.loc[df["Dataset"].eq("BindingNet v2")]
print(df.columns.tolist())
# Returns the per-type tables together with the full filtered table.
data_by_type, df = process_binding_data_bindingdnetv2(df)

In [None]:
# Reload the raw v2 index rows for a quick visual check.
df = pd.read_csv("../data/raw/BindingNetv2/Index_for_BindingNetv1_and_BindingNetv2.csv")
df = df.loc[df["Dataset"].eq("BindingNet v2")]
df.head().style

In [None]:
# Export the BindingNet v2 Ki rows in the shared schema.
df_Ki = data_by_type["Ki"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_Ki.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pKi"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_Ki["resolution"] = None
df_Ki.to_parquet("../data/curated/exp/pKi_BindingNetv2.parquet", index = False)
df_Ki.head(1)

In [None]:
# Export the BindingNet v2 Kd rows in the shared schema.
df_Kd = data_by_type["Kd"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_Kd.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pKd"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_Kd["resolution"] = None
df_Kd.to_parquet("../data/curated/exp/pKd_BindingNetv2.parquet", index = False)
df_Kd.head(1)

In [None]:
# Export the BindingNet v2 IC50 rows in the shared schema.
df_IC50 = data_by_type["IC50"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_IC50.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pIC50"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_IC50["resolution"] = None
df_IC50.to_parquet("../data/curated/exp/pIC50_BindingNetv2.parquet", index = False)
df_IC50.head(1)

In [None]:
# Export the BindingNet v2 EC50 rows in the shared schema.
df_EC50 = data_by_type["EC50"][["PDB_path","SDF_path","Molecule SMILES", "-log10(M)"]].copy()
df_EC50.columns = ["protein_pdb_path","ligand_sdf_path", "smiles", "pEC50"]
# The placeholder belongs on the exported frame, not on the index table `df`.
df_EC50["resolution"] = None
df_EC50.to_parquet("../data/curated/exp/pEC50_BindingNetv2.parquet", index = False)
df_EC50.head(1)

## 8. SAIR Claude

- unzip with 
> 7z x file.zip.001

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import time
import pickle
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Import required libraries
from rdkit import Chem
from rdkit.Chem import rdmolfiles
from Bio.PDB import MMCIFParser, PDBIO, Select

class ProteinSelect(Select):
    """Selector that accepts residues whose id starts with a blank field."""

    def accept_residue(self, residue):
        # First element of the residue id; presumably the Bio.PDB hetero
        # flag, which is blank for standard residues - verify.
        leading_field = residue.get_id()[0]
        return leading_field == ' '

class LigandSelect(Select):
    """Selector that accepts residues whose id does not start with a blank field."""

    def accept_residue(self, residue):
        # First element of the residue id; presumably the Bio.PDB hetero
        # flag, which is non-blank for hetero residues - verify.
        leading_field = residue.get_id()[0]
        return leading_field != ' '

def check_files_exist_and_valid(protein_path, ligand_path, min_size_bytes=50):
    """Return True when both output files exist and look usable.

    A pair is accepted when both files exist, each is at least
    ``min_size_bytes`` long, the first protein line starts with ATOM, HETATM,
    MODEL or HEADER, and the first 100 characters of the ligand file contain
    at least 10 non-whitespace-trimmed characters.  Any error while checking
    counts as "not valid".
    """
    pdb_record_prefixes = ('ATOM', 'HETATM', 'MODEL', 'HEADER')
    try:
        for candidate in (protein_path, ligand_path):
            if not os.path.exists(candidate):
                return False

        if any(os.path.getsize(candidate) < min_size_bytes
               for candidate in (protein_path, ligand_path)):
            return False

        # Quick content validation
        with open(protein_path, 'r') as protein_handle:
            opening_record = protein_handle.readline().strip()
        if not opening_record.startswith(pdb_record_prefixes):
            return False

        with open(ligand_path, 'r') as ligand_handle:
            ligand_head = ligand_handle.read(100)
        return len(ligand_head.strip()) >= 10
    except Exception:
        return False

def process_single_cif(cif_path, protein_dir, ligand_dir):
    """Split one mmCIF complex into a protein PDB and a ligand SDF.

    The output files are named after the CIF file and written to
    ``protein_dir`` and ``ligand_dir``.  Existing valid outputs are reused.

    Args:
        cif_path: Path of the input ``.cif`` file.
        protein_dir: Directory for the protein ``.pdb`` output.
        ligand_dir: Directory for the ligand ``.sdf`` output.

    Returns:
        dict with ``protein_path``, ``ligand_path``, ``status`` (one of
        ``already_exists``, ``converted_successfully``, ``validation_failed``,
        ``rdkit_failed`` or ``error: <message>``) and the boolean ``success``.
    """
    cif_filename = os.path.basename(cif_path)
    pdb_path = os.path.join(protein_dir, cif_filename.replace(".cif", ".pdb"))
    sdf_path = os.path.join(ligand_dir, cif_filename.replace(".cif", ".sdf"))
    # Intermediate file used to hand the ligand from Biopython to RDKit.
    ligand_temp_pdb = sdf_path.replace(".sdf", "_temp.pdb")

    def outcome(status, success):
        return {
            'protein_path': pdb_path,
            'ligand_path': sdf_path,
            'status': status,
            'success': success
        }

    def remove_quietly(path):
        # Clean-up is best effort: a missing or locked file is not an error.
        try:
            os.remove(path)
        except OSError:
            pass

    # Skip if files already exist and are valid
    if check_files_exist_and_valid(pdb_path, sdf_path):
        return outcome('already_exists', True)

    # Clean up any partially created files
    for path in (pdb_path, sdf_path):
        if os.path.exists(path) and os.path.getsize(path) < 50:
            remove_quietly(path)

    try:
        # Parse CIF file
        parser = MMCIFParser(QUIET=True)
        structure = parser.get_structure("complex", cif_path)

        # Write protein PDB
        io = PDBIO()
        io.set_structure(structure)
        io.save(pdb_path, select=ProteinSelect())

        # Write ligand to temporary PDB first
        io.save(ligand_temp_pdb, select=LigandSelect())

        # Convert ligand PDB to SDF using RDKit
        mol = rdmolfiles.MolFromPDBFile(ligand_temp_pdb, removeHs=False)
        if mol is None:
            return outcome('rdkit_failed', False)

        writer = Chem.SDWriter(sdf_path)
        try:
            writer.write(mol)
        finally:
            writer.close()

        # Final validation
        if check_files_exist_and_valid(pdb_path, sdf_path):
            return outcome('converted_successfully', True)
        return outcome('validation_failed', False)

    except Exception as e:
        # Clean up any partial files
        for path in (pdb_path, sdf_path):
            remove_quietly(path)
        return outcome(f'error: {str(e)[:30]}', False)

    finally:
        # The temporary ligand PDB is removed on every path, including errors.
        remove_quietly(ligand_temp_pdb)

class SimpleSAIRProcessor:
    """Convert SAIR CIF models into protein PDB / ligand SDF pairs, row by row.

    The processor reads a CSV with a ``path`` column, keeps the rows whose CIF
    file is present in ``structure_dir``, converts each of them with
    ``process_single_cif`` and periodically pickles the per-row results so
    that an interrupted run can be resumed.

    Args:
        csv_path: CSV file with one row per entry. It must contain ``path``;
            the final export additionally needs ``SMILES``, ``potency`` and
            ``assay``.
        structure_dir: Directory scanned for ``*.cif`` files.
        output_dir: Destination root. ``protein/`` and ``ligand/``
            sub-directories are created on construction.
    """

    def __init__(self, csv_path, structure_dir, output_dir):
        self.csv_path = csv_path
        self.structure_dir = structure_dir
        self.output_dir = output_dir

        # Converted files are split by kind into two sub-directories
        self.protein_dir = os.path.join(output_dir, "protein")
        self.ligand_dir = os.path.join(output_dir, "ligand")

        Path(self.protein_dir).mkdir(parents=True, exist_ok=True)
        Path(self.ligand_dir).mkdir(parents=True, exist_ok=True)

        self.checkpoint_file = f"{output_dir}/simple_checkpoint.pkl"

        print("🔬 Simple SAIR Processor initialized")
        print(f"📁 Output: {output_dir}")

    def get_available_cif_files(self):
        """Return the set of ``*.cif`` file names found in ``structure_dir``.

        A missing directory (or a path that is not a directory) yields an
        empty set instead of raising.
        """
        print("📂 Scanning structure directory for CIF files...")

        if os.path.isdir(self.structure_dir):
            cif_files = {
                name for name in os.listdir(self.structure_dir)
                if name.endswith('.cif')
            }
        else:
            cif_files = set()

        print(f"Found {len(cif_files):,} CIF files in structure directory")
        return cif_files

    def filter_dataframe_by_available_files(self):
        """Load the CSV and keep only rows whose CIF file exists on disk.

        Returns:
            A copy of the CSV rows whose ``path`` base name is present in
            ``structure_dir``. Rows with a missing/non-string ``path`` are
            dropped rather than raising.
        """
        print("📊 Loading experimental data...")
        df_exp = pd.read_csv(self.csv_path)
        print(f"Original dataset: {len(df_exp):,} entries")

        available_cifs = self.get_available_cif_files()

        print("🔍 Filtering data by available CIF files...")

        def cif_exists(path):
            # Empty CSV cells are read back as NaN (a float), which
            # os.path.basename cannot handle.
            if not isinstance(path, str):
                return False
            return os.path.basename(path) in available_cifs

        keep_mask = df_exp['path'].apply(cif_exists).astype(bool)
        df_filtered = df_exp[keep_mask].copy()

        print(f"Filtered dataset: {len(df_filtered):,} entries")
        print(f"Removed {len(df_exp) - len(df_filtered):,} entries without CIF files")

        return df_filtered

    def check_existing_processed(self, df):
        """Count the rows of ``df`` whose PDB and SDF outputs already exist.

        Output names are derived from the CIF base name by swapping the
        ``.cif`` suffix; validity is delegated to
        ``check_files_exist_and_valid``.

        Returns:
            Number of rows with both output files present and valid.
        """
        print("🔍 Checking existing processed files...")

        existing_count = 0
        for raw_path in df["path"]:
            cif_filename = os.path.basename(raw_path)
            pdb_path = os.path.join(self.protein_dir, cif_filename.replace(".cif", ".pdb"))
            sdf_path = os.path.join(self.ligand_dir, cif_filename.replace(".cif", ".sdf"))

            if check_files_exist_and_valid(pdb_path, sdf_path):
                existing_count += 1

        total = len(df)
        # Guard the percentage against an empty frame
        percent = existing_count / total * 100 if total else 0.0
        print(f"Already processed: {existing_count:,}/{total:,} ({percent:.1f}%)")
        return existing_count

    def _load_checkpoint(self):
        """Return the per-row results stored in the checkpoint, or ``{}``.

        NOTE: the checkpoint is a pickle; only load files written by this
        processor, never data from an untrusted source.
        """
        if not os.path.exists(self.checkpoint_file):
            return {}

        try:
            with open(self.checkpoint_file, 'rb') as f:
                checkpoint_data = pickle.load(f)
            results = checkpoint_data.get('results', {})
            if not isinstance(results, dict):
                raise TypeError("checkpoint 'results' is not a dict")
        except Exception as exc:  # corrupt/truncated/foreign file: restart cleanly
            print(f"⚠️ Could not load checkpoint, starting fresh ({type(exc).__name__}: {exc})")
            return {}

        print(f"📋 Loaded checkpoint: {len(results):,} files in progress")
        return results

    def process_all_simple(self, save_every=1000):
        """Convert every available entry, resuming from the checkpoint if any.

        Args:
            save_every: Write the checkpoint each time the processed count is
                a multiple of this value; ``0``/``None`` disables the periodic
                save (the final save still happens).

        Returns:
            ``(df_final, split_dfs)`` as produced by
            ``create_final_dataframe``, or ``(None, None)`` when no CSV row
            has a matching CIF file.
        """
        print("🚀 Starting simple row-by-row processing...")

        df_filtered = self.filter_dataframe_by_available_files()

        if len(df_filtered) == 0:
            print("❌ No matching CIF files found!")
            return None, None

        # Informational only: rows that are already on disk are still passed
        # through the loop below, so seeding the success counter with this
        # number would count them twice.
        self.check_existing_processed(df_filtered)

        results = self._load_checkpoint()
        processed_count = len(results)
        success_count = sum(1 for r in results.values() if r.get('success'))
        newly_processed = 0
        start_time = time.time()

        print(f"⚡ Processing {len(df_filtered):,} entries...")
        print("🔄 Progress will be shown every 1000 files")

        try:
            with tqdm(total=len(df_filtered), desc="Processing",
                      initial=processed_count, unit="files") as pbar:

                for idx, raw_path in df_filtered["path"].items():
                    # Rows restored from the checkpoint are not redone
                    if idx in results:
                        continue

                    cif_path = os.path.join(self.structure_dir, os.path.basename(raw_path))

                    result = process_single_cif(cif_path, self.protein_dir, self.ligand_dir)
                    results[idx] = result

                    if result['success']:
                        success_count += 1

                    processed_count += 1
                    newly_processed += 1
                    pbar.update(1)

                    if processed_count % 100 == 0:
                        pbar.set_postfix({
                            'Success': f"{success_count:,}",
                            'Rate': f"{success_count/processed_count*100:.1f}%"
                        })

                    if save_every and processed_count % save_every == 0:
                        self.save_checkpoint(results, processed_count)
        finally:
            # Also runs on interruption so completed work is never lost
            self.save_checkpoint(results, processed_count)

        elapsed_time = time.time() - start_time
        # Throughput only covers work done in this run, not restored rows
        files_per_second = newly_processed / elapsed_time if elapsed_time > 0 else 0.0
        print(f"\n🎉 Processing complete!")
        print(f"  Total entries processed: {processed_count:,}")
        print(f"  Successfully converted: {success_count:,}")
        print(f"  Success rate: {success_count/len(df_filtered)*100:.1f}%")
        print(f"  Processing time: {elapsed_time/60:.1f} minutes")
        print(f"  Rate: {files_per_second:.1f} files/second")

        return self.create_final_dataframe(df_filtered, results)

    def save_checkpoint(self, results, processed_count):
        """Pickle the current results to ``self.checkpoint_file``.

        The data is written to a sibling ``.tmp`` file first and then moved
        into place, so an interruption mid-write cannot leave a truncated
        checkpoint behind.
        """
        checkpoint_data = {
            'results': results,
            'processed_count': processed_count,
            'timestamp': time.time()
        }

        tmp_path = f"{self.checkpoint_file}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(checkpoint_data, f)
        os.replace(tmp_path, self.checkpoint_file)

    def create_final_dataframe(self, df_filtered, results):
        """Attach conversion results to the rows and export the CSV datasets.

        Adds ``protein_pdb_path``, ``ligand_sdf_path`` and
        ``conversion_success`` columns to ``df_filtered`` (in place), keeps
        the successful rows, writes one CSV per assay type plus a complete
        CSV into ``output_dir``.

        Args:
            df_filtered: Rows to annotate; its index must match the keys of
                ``results``.
            results: Mapping of row index to the result dict returned by the
                conversion (``success``, ``protein_path``, ``ligand_path``).

        Returns:
            ``(df_final, split_dfs)`` where ``df_final`` holds all successful
            rows and ``split_dfs`` maps each non-empty assay type to its
            subset (with ``potency`` renamed to ``potency (<assay>)``).
        """
        print("\n📋 Creating final datasets...")

        protein_paths = []
        ligand_paths = []
        success_flags = []

        # Only the index is needed, so avoid building a Series per row
        for idx in df_filtered.index:
            outcome = results.get(idx)
            if outcome is not None and outcome['success']:
                protein_paths.append(outcome['protein_path'])
                ligand_paths.append(outcome['ligand_path'])
                success_flags.append(True)
            else:
                protein_paths.append('')
                ligand_paths.append('')
                success_flags.append(False)

        df_filtered['protein_pdb_path'] = protein_paths
        df_filtered['ligand_sdf_path'] = ligand_paths
        df_filtered['conversion_success'] = success_flags

        df_final = df_filtered[df_filtered['conversion_success']].copy()
        df_final = df_final[["protein_pdb_path", "ligand_sdf_path", "SMILES", "potency", "assay"]]

        print(f"✅ Final dataset: {len(df_final):,} valid entries")

        assay_types = ["biochem", "na", "cell", "homogenate"]
        split_dfs = {}

        for assay in assay_types:
            df_assay = df_final[df_final["assay"] == assay].copy()
            if len(df_assay) == 0:
                continue

            df_assay = df_assay.rename(columns={"potency": f"potency ({assay})"})
            split_dfs[assay] = df_assay

            output_csv = os.path.join(self.output_dir, f"{assay}_data.csv")
            df_assay.to_csv(output_csv, index=False)
            print(f"💾 Saved {len(df_assay):,} {assay} entries")

        complete_csv = os.path.join(self.output_dir, "complete_processed_data.csv")
        df_final.to_csv(complete_csv, index=False)
        print(f"💾 Saved complete dataset")

        return df_final, split_dfs

# Convenience function for Jupyter
def run_simple_processing(csv_path, structure_dir, output_dir):
    """Build a ``SimpleSAIRProcessor`` for the given paths and run it.

    Returns whatever ``process_all_simple`` returns (called with its default
    arguments).
    """
    return SimpleSAIRProcessor(csv_path, structure_dir, output_dir).process_all_simple()

def quick_check_available_files(csv_path, structure_dir):
    """Return how many CSV rows have a matching CIF file in ``structure_dir``.

    ``SimpleSAIRProcessor`` creates ``protein/`` and ``ligand/``
    sub-directories in its output directory on construction, so a throw-away
    temporary directory is used as output instead of a hard-coded ``/tmp``
    (which is not portable and would be left littered with empty folders).

    Args:
        csv_path: CSV file with a ``path`` column.
        structure_dir: Directory scanned for ``*.cif`` files.

    Returns:
        Number of rows remaining after filtering by available CIF files.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        processor = SimpleSAIRProcessor(csv_path, structure_dir, scratch_dir)
        return len(processor.filter_dataframe_by_available_files())

# SAIR

In [None]:
# -----------------------------------------------------------------------------
# SAIR CIF processing, step 1: configure paths and count the usable entries
# -----------------------------------------------------------------------------

# CSV table, directory scanned for CIF files, and destination root for outputs
CSV_PATH = "../data/raw/SAIR/best_models.csv"
STRUCTURE_DIR = "../data/raw/SAIR/structures/"
OUTPUT_DIR = "../data/raw/SAIR_split/"

print("🔬 Simple SAIR CIF Processor", "=" * 40, sep="\n")

# Number of CSV rows whose CIF file is present in STRUCTURE_DIR
available_count = quick_check_available_files(CSV_PATH, STRUCTURE_DIR)
print(f"📊 Files available for processing: {available_count:,}")


In [None]:

# -----------------------------------------------------------------------------
# SAIR CIF processing, step 2: run the conversion
# -----------------------------------------------------------------------------

print("🚀 Starting simple processing...")

# Returns (None, None) when no CSV row has a matching CIF file
df_final, split_dfs = run_simple_processing(CSV_PATH, STRUCTURE_DIR, OUTPUT_DIR)

print("✅ Processing complete!")


In [None]:

# -----------------------------------------------------------------------------
# SAIR CIF processing, step 3: summarise the results
# -----------------------------------------------------------------------------

if df_final is None:
    print("❌ No files were processed")
else:
    print("📊 FINAL RESULTS")
    print("=" * 40)
    print(f"Total processed entries: {len(df_final):,}")

    # One line per assay type that produced at least one entry
    for assay_type, df in (split_dfs or {}).items():
        print(f"  {assay_type}: {len(df):,} entries")

    print("\n📋 Sample of processed data:")
    print(df_final.head())


In [None]:

# -----------------------------------------------------------------------------
# SAIR CIF processing, step 4: inspect the checkpoint (can run mid-processing)
# -----------------------------------------------------------------------------

import pickle
import os
from datetime import datetime

checkpoint_file = f"{OUTPUT_DIR}/simple_checkpoint.pkl"

if not os.path.exists(checkpoint_file):
    print("No checkpoint file found")
else:
    with open(checkpoint_file, 'rb') as f:
        data = pickle.load(f)

    # Fields written by the processor's checkpoint; defaults cover missing keys
    results = data.get('results', {})
    processed_count = data.get('processed_count', 0)
    timestamp = data.get('timestamp', 0)

    successful = len([r for r in results.values() if r['success']])

    print(f"📊 Current Progress ({datetime.fromtimestamp(timestamp)})")
    print(f"  Processed: {processed_count:,}")
    print(f"  Successful: {successful:,}")
    if processed_count > 0:
        print(f"  Success rate: {successful/processed_count*100:.1f}%")


In [None]:

# -----------------------------------------------------------------------------
# SAIR CIF processing, step 5: spot-check a few converted files
# -----------------------------------------------------------------------------

if df_final is not None and len(df_final) > 0:

    # Up to five randomly drawn rows
    sample_size = min(5, len(df_final))
    sample_files = df_final.sample(sample_size)

    print(f"🔍 Validating {sample_size} random files:")

    path_pairs = zip(sample_files['protein_pdb_path'], sample_files['ligand_sdf_path'])
    for protein_path, ligand_path in path_pairs:
        if check_files_exist_and_valid(protein_path, ligand_path):
            status = "✅"
        else:
            status = "❌"

        # Missing files are reported with a size of 0 bytes
        protein_size = os.path.getsize(protein_path) if os.path.exists(protein_path) else 0
        ligand_size = os.path.getsize(ligand_path) if os.path.exists(ligand_path) else 0

        print(f"{status} Protein: {protein_size:,}B, Ligand: {ligand_size:,}B")


In [None]:

# =============================================================================
# Cell 6: Check final output structure
# =============================================================================

print("📁 Output directory contents:")

# Destination of the curated parquet export
curated_exp_dir = '../data/curated/exp/'

if os.path.exists(OUTPUT_DIR):
    protein_subdir = os.path.join(OUTPUT_DIR, "protein")
    ligand_subdir = os.path.join(OUTPUT_DIR, "ligand")

    # A missing sub-directory is reported as zero files instead of crashing
    protein_files = (
        sum(1 for f in os.listdir(protein_subdir) if f.endswith('.pdb'))
        if os.path.isdir(protein_subdir) else 0
    )
    ligand_files = (
        sum(1 for f in os.listdir(ligand_subdir) if f.endswith('.sdf'))
        if os.path.isdir(ligand_subdir) else 0
    )
    csv_files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.csv')]

    print(f"  🧬 Protein PDB files: {protein_files:,}")
    print(f"  💊 Ligand SDF files: {ligand_files:,}")
    print(f"  📊 CSV datasets: {len(csv_files)}")

    for csv_file in tqdm(csv_files):
        # Only the complete dataset is re-exported as parquet
        if "complete_processed_data.csv" in csv_file:
            df_check = pd.read_csv(os.path.join(OUTPUT_DIR, csv_file))
            parquet_file = csv_file.replace(".csv", ".parquet")
            # Make sure the destination exists before writing into it
            os.makedirs(curated_exp_dir, exist_ok=True)
            df_check.to_parquet(os.path.join(curated_exp_dir, parquet_file))
            print(f"    - {csv_file}: {len(df_check):,} entries")
else:
    print("  Output directory not found")

In [None]:
# Display the table exported in the previous cell. `df_check` is only bound
# when OUTPUT_DIR exists and contains a CSV whose name includes
# "complete_processed_data.csv"; otherwise this raises NameError.
df_check

# 10. SIU

# B. Combine datasets

In [None]:
import os
import pandas as pd
import numpy as np

data_dir = "../data/curated/exp/"
df_list = []

# Read parquet files only (stray entries such as hidden files or folders would
# make read_parquet fail) and in sorted order so the row order is reproducible.
for fname in sorted(os.listdir(data_dir)):
    if not fname.endswith(".parquet"):
        continue
    full_path = os.path.join(data_dir, fname)
    print(full_path)
    df = pd.read_parquet(full_path)
    df["source_file"] = fname
    # Flag rows by data source, based on the file name
    df["is_experimental"] = any(x in fname for x in ["BioLip", "PDBbind", "HiQBind"])
    df_list.append(df)

df_combined = pd.concat(df_list, ignore_index=True)
# Strip everything up to the first underscore and the ".parquet" suffix
df_combined["source_file"] = df_combined["source_file"].str.replace(r"^[^_]*_", "", regex=True).str.replace(".parquet", "", regex=False)

# Non-numeric resolution entries become NaN
df_combined["resolution"] = pd.to_numeric(df_combined["resolution"], errors="coerce")
print(len(df_combined))
# Infinite values become NaN; no row is dropped, so the count is unchanged
df_combined.replace([np.inf, -np.inf], np.nan, inplace=True)
print(len(df_combined))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart: number of entries per source file (log-scaled counts)
source_order = df_combined["source_file"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df_combined, y="source_file", order=source_order, ax=ax)
ax.set_title("Entry count per source file")
ax.set_xlabel("Count")
ax.set_ylabel("Source File")
ax.set_xscale('log')
ax.grid(True)

# Annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container, label_type="edge", padding=3)

fig.tight_layout()
plt.show()

# Vertical bar chart: number of entries per is_experimental value (log scale)
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=df_combined, x="is_experimental", ax=ax)
ax.set_title("Experimental vs Non-experimental")
ax.set_xlabel("Is Experimental")
ax.set_ylabel("Count")
ax.set_yscale('log')
ax.grid(True)

# Annotate every bar with its count
for container in ax.containers:
    ax.bar_label(container, label_type="edge", padding=3)

fig.tight_layout()
plt.show()


In [None]:
# Show the column names of the combined table as a plain list
df_combined.columns.tolist()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

exp_cols = ['resolution', 'pKi', 'pEC50', 'pKd (Wang, FEP)', 'pKd', 'pIC50', "potency"]

for col in exp_cols:
    # Columns absent from the combined table are skipped
    if col not in df_combined.columns:
        continue

    # Keep finite values only (drops NaN and +/-inf)
    data = df_combined[col]
    data = data[np.isfinite(data)]
    if data.empty:
        continue

    stats = {'N': len(data)}
    stats.update({name: getattr(data, name)() for name in ('min', 'mean', 'median', 'max')})

    plt.figure(figsize=(8, 4))
    sns.histplot(data, kde=True, bins=50)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.grid(True)

    # One dashed vertical line per summary statistic, labelled for the legend
    for label in ('min', 'mean', 'median', 'max'):
        plt.axvline(stats[label], linestyle='--', label=f"{label}: {stats[label]:.2f}")

    plt.legend(title=f"N = {stats['N']}")
    plt.tight_layout()
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

exp_cols = ['pKi', 'pEC50',
 'pKd (Wang, FEP)',
 'pKd',
 'pIC50', "potency"]  # exclude 'resolution'

for col in exp_cols:
    # Columns absent from the combined table are skipped
    if col not in df_combined.columns:
        continue

    # Keep finite values only (drops NaN and +/-inf)
    df_plot = df_combined[[col, 'is_experimental']].copy()
    df_plot = df_plot[np.isfinite(df_plot[col])]

    if df_plot.empty:
        continue

    plt.figure(figsize=(8, 4))
    ax = sns.histplot(
        data=df_plot,
        x=col,
        hue='is_experimental',
        kde=True,
        bins=50,
        palette={True: "tab:blue", False: "tab:orange"},
        element="step",
        stat="density",  # <- normalizes histograms
        common_norm=False  # each hue group is normalized on its own
    )

    plt.title(f"{col} - Normalized Distribution (Experimental vs Non-Experimental)")
    plt.xlabel(col)
    plt.ylabel("Density")
    plt.grid(True)
    # seaborn builds the hue legend from its own handles; calling plt.legend()
    # here would replace it with an empty one, so only retitle the existing legend.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.set_title("Experimental")
    plt.tight_layout()
    plt.show()


In [None]:
# For each affinity column keep rows whose value is missing or greater than 3,
# printing the remaining row count after every step.
for affinity_col in ["pKi", "pEC50", "pKd (Wang, FEP)", "pKd", "pIC50"]:
    values = df_combined[affinity_col]
    df_combined = df_combined[values.isna() | (values > 3)]
    print(len(df_combined))

# Resolution must be missing or strictly between 0 and 3 (applied in two steps)
resolution = df_combined["resolution"]
df_combined = df_combined[resolution.isna() | (resolution > 0)]
print(len(df_combined))
resolution = df_combined["resolution"]
df_combined = df_combined[resolution.isna() | (resolution < 3)]
print(len(df_combined))

df_combined

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Entries per source file, most frequent first, counts on a log axis
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df_combined,
    y="source_file",
    order=df_combined["source_file"].value_counts().index,
    ax=ax,
)
ax.set(title="Entry count per source file", xlabel="Count", ylabel="Source File")
ax.set_xscale('log')
ax.grid(True)

# Write the count next to each bar
for container in ax.containers:
    ax.bar_label(container, label_type="edge", padding=3)

fig.tight_layout()
plt.show()

# Entries per is_experimental value, counts on a log axis
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=df_combined, x="is_experimental", ax=ax)
ax.set(title="Experimental vs Non-experimental", xlabel="Is Experimental", ylabel="Count")
ax.set_yscale('log')
ax.grid(True)

# Write the count above each bar
for container in ax.containers:
    ax.bar_label(container, label_type="edge", padding=3)

fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

exp_cols = ['resolution', 'pKi', 'pEC50', 'pKd (Wang, FEP)', 'pKd', 'pIC50', "potency"]

# Summary statistics drawn on every histogram, in legend order
stat_names = ['min', 'mean', 'median', 'max']

for col in exp_cols:
    if col not in df_combined.columns:
        continue

    # Finite values only (drops NaN and +/-inf)
    data = df_combined[col]
    data = data[np.isfinite(data)]

    if len(data) == 0:
        continue

    stats = {
        'N': len(data),
        'min': data.min(),
        'mean': data.mean(),
        'median': data.median(),
        'max': data.max(),
    }

    plt.figure(figsize=(8, 4))
    sns.histplot(data, kde=True, bins=50)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.grid(True)

    # Dashed marker for each statistic; the label feeds the legend
    for label in stat_names:
        value = stats[label]
        plt.axvline(value, linestyle='--', label=f"{label}: {value:.2f}")

    plt.legend(title=f"N = {stats['N']}")
    plt.tight_layout()
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# NOTE: the list previously started with 'pKi' 'pKi' (missing comma), which
# Python concatenates into the single name 'pKipKi', so pKi was never plotted.
exp_cols = ['pKi', 'pEC50',
 'pKd (Wang, FEP)',
 'pKd',
 'pIC50', "potency"]  # exclude 'resolution'

for col in exp_cols:
    # Columns absent from the combined table are skipped
    if col not in df_combined.columns:
        continue

    # Keep finite values only (drops NaN and +/-inf)
    df_plot = df_combined[[col, 'is_experimental']].copy()
    df_plot = df_plot[np.isfinite(df_plot[col])]

    if df_plot.empty:
        continue

    plt.figure(figsize=(8, 4))
    ax = sns.histplot(
        data=df_plot,
        x=col,
        hue='is_experimental',
        kde=True,
        bins=50,
        palette={True: "tab:blue", False: "tab:orange"},
        element="step",
        stat="density",  # <- normalizes histograms
        common_norm=False  # each hue group is normalized on its own
    )

    plt.title(f"{col} - Normalized Distribution (Experimental vs Non-Experimental)")
    plt.xlabel(col)
    plt.ylabel("Density")
    plt.grid(True)
    # seaborn builds the hue legend from its own handles; calling plt.legend()
    # here would replace it with an empty one, so only retitle the existing legend.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.set_title("Experimental")
    plt.tight_layout()
    plt.show()


In [None]:
# Save the combined table to parquet; index=False leaves the DataFrame index
# out of the file.
df_combined.to_parquet("../data/curated/combined/df_combined.parquet", index = False)

In [None]:
import pandas as pd
# Reload the combined table from the parquet file, replacing the in-memory copy
df_combined = pd.read_parquet("../data/curated/combined/df_combined.parquet")

In [None]:
# Display the combined table
df_combined

In [1]:
1

1