In [34]:
import json
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from bio_lib import run_prodigy_jax

def process_pdbs(dataset_path, pdb_folder, output_folder):
    """Run PRODIGY-JAX on every dataset entry and save one JSON file per entry.

    Args:
        dataset_path: Path to a JSON file whose top-level keys are PDB ids.
            Only the keys are used; the values are ignored.
        pdb_folder: Directory expected to contain ``<pdb_id>.pdb`` files.
        output_folder: Directory that receives ``<pdb_id>.json`` files. It is
            created (including parents) when it does not exist yet.

    Returns:
        None. Results are written to disk.

    Notes:
        Chains are fixed to target ``"A"`` and binder ``"B"`` for every entry.
        Any exception raised while handling a single entry is printed and the
        loop continues with the next entry, so one bad structure does not
        abort the whole batch.
    """
    with open(dataset_path) as dataset_file:
        dataset = json.load(dataset_file)

    # Loop-invariant: build the directory objects once.
    pdb_dir = Path(pdb_folder)
    output_dir = Path(output_folder)
    os.makedirs(output_dir, exist_ok=True)

    for pdb_id in dataset:
        try:
            results = run_prodigy_jax.run(
                pdb_path=pdb_dir / f"{pdb_id}.pdb",
                target_chain="A",
                binder_chain="B",
            )
            # json cannot serialise array scalars, so coerce every value to a
            # plain Python float (float() handles both numbers and objects
            # exposing ``.item()``; the former ternary had identical branches).
            serialisable = {
                key: float(value) for key, value in results.to_dict().items()
            }

            with open(output_dir / f"{pdb_id}.json", 'w') as result_file:
                json.dump(serialisable, result_file, indent=2)

        except Exception as e:
            # Deliberate best-effort: report and keep going.
            print(f"Error processing {pdb_id}: {str(e)}")

In [2]:
import sys
#sys.path.append('/home/alessio/bio_lib/src/bio_lib/common')
import bio_lib.common.protein as Protein 
from bio_lib.common.protein_jax import JaxProtein, JAXStructureData
from typing import List, Tuple


In [3]:
def load_pdb_to_jax(pdb_path: str, target_chain: str, binder_chain: str) -> Tuple[JAXStructureData, JAXStructureData]:
    """Process one PDB file twice, once per chain, with a single JaxProtein.

    Args:
        pdb_path: Path of the PDB file handed to ``JaxProtein.process_pdb``.
        target_chain: Chain id passed as the only entry of ``selected_chains``
            for the first call.
        binder_chain: Chain id passed as the only entry of ``selected_chains``
            for the second call.

    Returns:
        ``(target, binder)``: the two ``process_pdb`` results, in that order.
    """
    pdb_processor = JaxProtein()
    # The generator is consumed left to right, so the target chain is
    # processed before the binder chain.
    target_structure, binder_structure = (
        pdb_processor.process_pdb(pdb_path, selected_chains=[chain_id])
        for chain_id in (target_chain, binder_chain)
    )
    return target_structure, binder_structure

In [None]:
# Smoke test: load chains A and B of a single structure through load_pdb_to_jax.
# NOTE(review): the path is absolute and machine-specific; it will not resolve
# on another machine.
pdb_path = "/home/alessio/dr_sasa_python/data/PRODIGYdataset_fixed/1ACB.pdb"
load_pdb_to_jax(pdb_path, target_chain="A", binder_chain="B")

In [None]:
# Batch benchmark: process_pdbs writes one <pdb_id>.json per dataset entry
# into benchmark_results_folder. All three paths are relative to the current
# working directory.
org_dataset_path = "dataset.json"
PRODIGYdataset_path = "PRODIGYdataset"
benchmark_results_folder = "benchmark_results"
process_pdbs(org_dataset_path, PRODIGYdataset_path, benchmark_results_folder) # originally run on an A100 GPU in Google Colab

In [23]:
      
import json
import pandas as pd

def merge_datasets(original_path: str, computed_path: str, output_path: str):
    """Left-join computed values onto the original dataset and export a CSV.

    Args:
        original_path: JSON file whose top-level keys become the row index of
            the "original" frame.
        computed_path: JSON file with the same layout holding computed values.
        output_path: Destination of the merged table, written with ``to_csv``.

    Returns:
        The merged DataFrame. Every row of the original frame is kept; columns
        coming from the computed frame carry a ``_computed`` suffix and are
        NaN for index keys missing from the computed file.
    """
    # Read both JSON payloads first (original, then computed).
    payloads = []
    for json_path in (original_path, computed_path):
        with open(json_path) as handle:
            payloads.append(json.load(handle))
    original_records, computed_records = payloads

    # One row per top-level key.
    original_frame = pd.DataFrame.from_dict(original_records, orient='index')
    computed_frame = pd.DataFrame.from_dict(computed_records, orient='index')

    # Suffix the computed columns so they cannot clash with the originals.
    computed_frame = computed_frame.add_suffix('_computed')

    # Index-on-index join that keeps every original row.
    combined = original_frame.merge(
        computed_frame,
        how='left',
        left_index=True,
        right_index=True,
    )

    combined.to_csv(output_path)
    return combined


In [None]:
# Merge the two JSON datasets; the result is also written to
# 'final_dataset.csv' in the current working directory.
# NOTE(review): the first argument is merge_datasets' `original_path` and the
# second its `computed_path` -- confirm merged_dataset.json / dataset.json are
# passed in the intended order. Both paths are absolute and machine-specific.
df = merge_datasets( "/home/alessio/bio_lib/merged_dataset.json", "/home/alessio/dr_sasa_python/data/dataset.json", 'final_dataset.csv')
df

In [44]:
def create_analysis_plots(df):
    """Build a 2x2 figure comparing original and computed benchmark values.

    Panels:
        [0, 0] Scatter of ``ba_val`` against ``ba_val_computed`` with a dashed
               y = x reference line drawn from -15 to 0; the title reports
               the Spearman rank correlation of the two columns.
        [0, 1] Bland-Altman plot: pairwise mean against the difference
               (original - computed), with a dashed line at the mean
               difference.
        [1, 0] Grouped bars of the column means of the six contact-type
               counts, original next to computed.
        [1, 1] Heat map of the Spearman correlation matrix between original
               and computed ``nis_p`` / ``nis_a`` / ``nis_c`` columns, with
               each cell annotated with its value.

    Args:
        df: DataFrame holding the columns ``ba_val``, ``CC``, ``CP``, ``AC``,
            ``PP``, ``AP``, ``AA``, ``nis_p``, ``nis_a``, ``nis_c`` and a
            ``<name>_computed`` counterpart for each of them.

    Returns:
        The matplotlib Figure containing the four panels.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 15))

    # ba_val Correlation with Spearman
    ax = axes[0,0]
    # np.corrcoef yields Pearson's r, which does not match the "Spearman"
    # label below. DataFrame.corr(method='spearman') computes the rank
    # correlation, ignores rows with missing values, and is the same call
    # used for the NIS panel further down.
    spearman_corr = df[['ba_val', 'ba_val_computed']].corr(method='spearman').iloc[0, 1]
    ax.scatter(df['ba_val'], df['ba_val_computed'])
    ax.plot([-15, 0], [-15, 0], 'r--')  # identity line: perfect agreement
    ax.set_xlabel('Original ba_val')
    ax.set_ylabel('Computed ba_val')
    ax.set_title(f'ba_val Correlation\nSpearman r={spearman_corr:.3f}')

    # Bland-Altman
    ax = axes[0,1]
    mean_ba_val = (df['ba_val'] + df['ba_val_computed']) / 2
    diff_ba_val = df['ba_val'] - df['ba_val_computed']
    ax.scatter(mean_ba_val, diff_ba_val)
    ax.axhline(y=np.mean(diff_ba_val), color='r', linestyle='--')  # mean bias
    ax.set_xlabel('Mean ba_val')
    ax.set_ylabel('Difference (Original - Computed)')
    ax.set_title('Bland-Altman Plot')

    # Contact types
    ax = axes[1,0]
    contact_types = ['CC', 'CP', 'AC', 'PP', 'AP', 'AA']
    original = df[contact_types].mean()
    computed = df[[f'{ct}_computed' for ct in contact_types]].mean()

    # Two bars per contact type, centred on the tick position.
    x = np.arange(len(contact_types))
    width = 0.35
    ax.bar(x - width/2, original, width, label='Original')
    ax.bar(x + width/2, computed, width, label='Computed')
    ax.set_xticks(x)
    ax.set_xticklabels(contact_types)
    ax.set_title('Average Contact Types')
    ax.legend()

    # NIS correlation pairs
    ax = axes[1,1]
    nis_types = ['nis_p', 'nis_a', 'nis_c']
    nis_data = pd.DataFrame()

    # Interleave original/computed columns so each pair sits side by side.
    for nis in nis_types:
        nis_data[f'Original_{nis}'] = df[nis]
        nis_data[f'Computed_{nis}'] = df[f'{nis}_computed']

    # Calculate correlation matrix
    corr = nis_data.corr(method='spearman')

    # Create correlation heatmap
    im = ax.imshow(corr, cmap='RdYlBu_r', aspect='auto')
    plt.colorbar(im, ax=ax)

    # Add correlation values
    for i in range(len(corr.columns)):
        for j in range(len(corr.columns)):
            text = f'{corr.iloc[i, j]:.2f}'
            ax.text(j, i, text, ha='center', va='center')

    ax.set_xticks(np.arange(len(corr.columns)))
    ax.set_yticks(np.arange(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45, ha='right')
    ax.set_yticklabels(corr.columns)
    ax.set_title('NIS Spearman Correlations')

    plt.tight_layout()
    return fig


In [None]:
# Usage
# Load the merged benchmark table, draw the four-panel comparison figure and
# save it as a 300-dpi PNG in the current working directory.
# NOTE(review): this reads './benchmark/final_dataset.csv', whereas the
# merge_datasets call in this notebook writes 'final_dataset.csv' -- confirm
# the file was moved, or align the two paths.
df = pd.read_csv('./benchmark/final_dataset.csv')
fig = create_analysis_plots(df)
plt.savefig('analysis_plots.png', dpi=300)

In [11]:
# Reference prediction for 1ACB with save_results=True.
# NOTE(review): predict_binding_affinity is only imported in a cell that
# appears later in this notebook, so this cell fails on a fresh
# "Restart & Run All". The path is absolute and machine-specific.
predict_binding_affinity("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb", save_results=True)

[!] Structure contains gaps:
	A CYS1 < Fragment 0 > A LEU13
	A ILE16 < Fragment 1 > A TYR146
	A ALA149 < Fragment 2 > A ASN245
	B LYS8 < Fragment 3 > B GLY70



[+] Parsed structure file 1ACB (2 chains, 304 residues)
{'total': 13161.72782790869, 'per_chain': {'B': 3413.0944317456233, 'A': 9748.63339616307}}
[+] No. of intermolecular contacts: 72
[+] No. of charged-charged contacts: 3
[+] No. of charged-polar contacts: 2
[+] No. of charged-apolar contacts: 19
[+] No. of polar-polar contacts: 4
[+] No. of apolar-polar contacts: 19
[+] No. of apolar-apolar contacts: 25
[+] Percentage of apolar NIS residues: 38.50
[+] Percentage of charged NIS residues: 18.78
[++] Predicted binding affinity (kcal.mol-1):    -11.9
[++] Predicted dissociation constant (M) at 25.0˚C:  2.0e-09
1ACB_ba_results.json 1ACB_sasa_atom_results.csv


{'structure': '1ACB',
 'selection': 'A.B',
 'temp': 25.0,
 'ICs': 72,
 'nis_a': 38.497652582159624,
 'nis_c': 18.779342723004696,
 'nis_p': 42.72300469483568,
 'ba_val': -11.867636291079812,
 'kd_val': 1.9731060662205486e-09,
 'AA': 25,
 'PP': 4,
 'CC': 3,
 'AP': 19,
 'CP': 2,
 'AC': 19}

In [1]:
# Import the predictor and run it on 1ACB with save_results=True.
# NOTE(review): the path is absolute and machine-specific.
from bio_lib.custom_prodigy import predict_binding_affinity
import numpy as np
predict_binding_affinity("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb", save_results=True)

I0000 00:00:1737829310.996341 2991492 service.cc:145] XLA service 0x11f6c9c20 initialized for platform METAL (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737829310.996350 2991492 service.cc:153]   StreamExecutor device (0): Metal, <undefined>
I0000 00:00:1737829310.997522 2991492 mps_client.cc:406] Using Simple allocator.
I0000 00:00:1737829310.997530 2991492 mps_client.cc:384] XLA backend will use up to 51539214336 bytes on device 0 for SimpleAllocator.
[!] Structure contains gaps:
	A CYS1 < Fragment 0 > A LEU13
	A ILE16 < Fragment 1 > A TYR146
	A ALA149 < Fragment 2 > A ASN245
	B LYS8 < Fragment 3 > B GLY70



Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB

[+] Parsed structure file 1ACB (2 chains, 304 residues)
{'total': 13161.72782790869, 'per_chain': {'B': 3413.0944317456233, 'A': 9748.63339616307}}
[+] No. of intermolecular contacts: 72
[+] No. of charged-charged contacts: 3
[+] No. of charged-polar contacts: 2
[+] No. of charged-apolar contacts: 19
[+] No. of polar-polar contacts: 4
[+] No. of apolar-polar contacts: 19
[+] No. of apolar-apolar contacts: 25
[+] Percentage of apolar NIS residues: 38.50
[+] Percentage of charged NIS residues: 18.78
[++] Predicted binding affinity (kcal.mol-1):    -11.9
[++] Predicted dissociation constant (M) at 25.0˚C:  2.0e-09


{'structure': '1ACB',
 'selection': ['A', 'B'],
 'temp': 25.0,
 'ICs': 72,
 'nis_a': 38.497652582159624,
 'nis_c': 18.779342723004696,
 'nis_p': 42.72300469483568,
 'ba_val': -11.867636291079812,
 'kd_val': 1.9731060662205486e-09,
 'AA': 25,
 'PP': 4,
 'CC': 3,
 'AP': 19,
 'CP': 2,
 'AC': 19}

In [None]:
# Evaluates exp(x / (c * T)) with T = 25 + 273.15.
# NOTE(review): presumably a dissociation constant Kd = exp(dG / (R * T)) with
# dG = -11.554... kcal/mol and R = 0.0019858775 kcal/(mol*K); the origin of the
# -11.554... value is not shown in this notebook -- TODO confirm.
np.exp(-11.554325103759766 / (0.0019858775 * (25+ 273.15)))

In [2]:
# Repeat of the 1ACB prediction (same arguments as the earlier call in this
# notebook). NOTE(review): the path is absolute and machine-specific.
predict_binding_affinity("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb", save_results=True)

[!] Structure contains gaps:
	A CYS1 < Fragment 0 > A LEU13
	A ILE16 < Fragment 1 > A TYR146
	A ALA149 < Fragment 2 > A ASN245
	B LYS8 < Fragment 3 > B GLY70



[+] Parsed structure file 1ACB (2 chains, 304 residues)
{'total': 13161.72782790869, 'per_chain': {'B': 3413.0944317456233, 'A': 9748.63339616307}}
[+] No. of intermolecular contacts: 72
[+] No. of charged-charged contacts: 3
[+] No. of charged-polar contacts: 2
[+] No. of charged-apolar contacts: 19
[+] No. of polar-polar contacts: 4
[+] No. of apolar-polar contacts: 19
[+] No. of apolar-apolar contacts: 25
[+] Percentage of apolar NIS residues: 38.50
[+] Percentage of charged NIS residues: 18.78
[++] Predicted binding affinity (kcal.mol-1):    -11.9
[++] Predicted dissociation constant (M) at 25.0˚C:  2.0e-09


{'structure': '1ACB',
 'selection': ['A', 'B'],
 'temp': 25.0,
 'ICs': 72,
 'nis_a': 38.497652582159624,
 'nis_c': 18.779342723004696,
 'nis_p': 42.72300469483568,
 'ba_val': -11.867636291079812,
 'kd_val': 1.9731060662205486e-09,
 'AA': 25,
 'PP': 4,
 'CC': 3,
 'AP': 19,
 'CP': 2,
 'AC': 19}

In [None]:
# Same structure, this time passing acc_threshold=0.05 and not requesting
# saved results; no output is recorded for this cell.
# NOTE(review): the path is absolute and machine-specific.
predict_binding_affinity("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb", acc_threshold=0.05)