In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from scipy import stats
import os, json
import seaborn as sns
import shutil

def get_dataset_df(pdb_folder, atoms_per_residue=8):
    """Summarise every ``.pdb`` file in a folder as one DataFrame row.

    For each file, lines starting with ``ATOM`` are counted per chain using
    the chain identifier at ``line[21]``; only chains ``A`` and ``B`` are
    counted. Chain lengths are *estimates* computed as
    ``atom_count // atoms_per_residue``, not true residue counts.

    Parameters
    ----------
    pdb_folder : str or os.PathLike
        Directory to scan (not recursive).
    atoms_per_residue : int, default 8
        Divisor used to turn an atom count into an estimated chain length.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdb_path``, ``pdb_id``, ``chain_a_length``,
        ``chain_b_length`` and ``n_atoms``; one row per readable file,
        ordered by file name. The columns are present even when no file
        was found, so downstream column lookups do not fail.

    Raises
    ------
    ValueError
        If ``atoms_per_residue`` is not positive.
    """
    if atoms_per_residue <= 0:
        raise ValueError("atoms_per_residue must be a positive integer")

    columns = ['pdb_path', 'pdb_id', 'chain_a_length', 'chain_b_length', 'n_atoms']
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything derived from it) reproducible across machines.
    for pdb_file in sorted(os.listdir(pdb_folder)):
        if not pdb_file.endswith(".pdb"):
            continue

        pdb_id = os.path.splitext(pdb_file)[0]  # Extract PDB ID from filename
        pdb_path = os.path.join(pdb_folder, pdb_file)

        try:
            chain_a = chain_b = 0
            with open(pdb_path, 'r') as f:
                for line in f:
                    # Skip non-ATOM records and truncated ATOM lines that
                    # have no chain column, instead of discarding the file.
                    if not line.startswith('ATOM') or len(line) <= 21:
                        continue
                    chain_id = line[21]
                    if chain_id == 'A':
                        chain_a += 1
                    elif chain_id == 'B':
                        chain_b += 1
        except (OSError, UnicodeError) as e:
            # Best effort: report unreadable files and carry on with the rest.
            print(f"Error processing {pdb_id}: {e}")
            continue

        data.append({
            'pdb_path': pdb_path,
            'pdb_id': pdb_id,
            'chain_a_length': chain_a // atoms_per_residue,
            'chain_b_length': chain_b // atoms_per_residue,
            'n_atoms': chain_a + chain_b
        })

    return pd.DataFrame(data, columns=columns)

def select_representative_pdbs(df, num_pdbs=10):
    """Pick up to ``num_pdbs`` structures spanning the range of ``n_atoms``.

    The selection always starts with three anchors: the smallest structure,
    the one whose ``n_atoms`` is closest to the mean, and the largest. The
    remaining slots are filled with rows evenly spaced (by rank) through the
    rest of the size-sorted table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``n_atoms`` column. Index labels are assumed to be
        unique, as they are for frames built from a list of records.
    num_pdbs : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with original index labels and dtypes, ordered as
        min, mean, max, then the evenly spaced samples. No row is repeated;
        fewer than ``num_pdbs`` rows are returned when ``df`` is too small.
    """
    if num_pdbs <= 0 or df.empty:
        return df.iloc[0:0].copy()

    df_sorted = df.sort_values("n_atoms")

    min_label = df_sorted.index[0]
    max_label = df_sorted.index[-1]
    # idxmin() returns an index *label*, so it has to be used with .loc;
    # positional .iloc on the sorted frame would pick an unrelated row.
    mean_label = (df_sorted["n_atoms"] - df_sorted["n_atoms"].mean()).abs().idxmin()

    # On small inputs the anchors can coincide; keep each label only once
    # while preserving the min, mean, max order.
    anchors = list(dict.fromkeys([min_label, mean_label, max_label]))[:num_pdbs]

    remaining = df_sorted.drop(index=anchors)
    n_extra = min(num_pdbs - len(anchors), len(remaining))

    extra = []
    if n_extra > 0:
        # Evenly spaced ranks through the remaining rows; np.unique guards
        # against repeated positions after integer truncation.
        positions = np.unique(np.linspace(0, len(remaining) - 1, n_extra, dtype=int))
        extra = list(remaining.index[positions])

    # Selecting by label keeps the original column dtypes, unlike
    # Series.to_frame().T which degrades every column to object.
    return df_sorted.loc[anchors + extra]

In [2]:
# Source folder with the full PRODIGY dataset and destination for the subset
# used in the benchmark.
# NOTE(review): both are machine-specific absolute paths; adjust before
# running on another machine.
folder = Path("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/")
target_dir = Path("/Users/alessio/Documents/Repos/bio_lib/benchmark_af/v3/selected_prodigy_pdbs")

dataset_df = get_dataset_df(folder)
selected_df = select_representative_pdbs(dataset_df)

# shutil.copy treats a non-existent destination as a *file name*: without the
# directory in place every iteration would overwrite one file called
# "selected_prodigy_pdbs" instead of filling a folder.
target_dir.mkdir(parents=True, exist_ok=True)
for f in selected_df['pdb_path']:
    shutil.copy(Path(f), target_dir)

In [None]:
import os

# Configure the XLA allocator *before* importing the project module.
# NOTE(review): presumably bio_lib imports JAX; if that import initialises a
# backend, values set afterwards would be ignored. Setting them first is
# harmless either way.
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.8'
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

from bio_lib.run_prodigy_custom import run

# Benchmark the previously selected structures with use_jax=False and write
# one JSON result per structure into output_dir.
# NOTE(review): output_dir is relative to the current working directory, while
# the results-loading cell reads from "./benchmark_af/v3/..." - confirm the
# notebook is run from a directory where both resolve to the same place.
target_dir = Path("/Users/alessio/Documents/Repos/bio_lib/benchmark_af/v3/selected_prodigy_pdbs")
res = run(target_dir, output_dir=Path("results_cpu_benchmark_M1_selected_prodigy_pdbs"), sphere_points=100, quiet=True, use_jax=False, benchmark=True, output_json=True)


In [None]:
def process_prodigy_results(results_dir, name):
    """Load every ``*.json`` result in ``results_dir`` into two DataFrames.

    Parameters
    ----------
    results_dir : str or os.PathLike
        Directory containing one JSON file per structure. Each file is
        expected to provide the keys ``structure_id``, ``sasa_data``,
        ``ba_val``, ``kd``, ``execution_time["benchmark_times"]``,
        ``contacts`` and ``nis``.
    name : str
        Label stored in the ``name`` column of the metrics table (e.g. the
        platform the results were produced on).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``combined_sasa_df``: all ``sasa_data`` records with an added
        ``protein_id`` column, sorted within each structure by chain,
        resname, resindex and atomname.
        ``metrics_df``: one row per structure with affinity values, timing
        statistics and the flattened ``contacts`` / ``nis`` entries.
        Both frames are empty when the directory holds no JSON files.
    """
    all_sasa_values = []
    all_metrics = []

    # Sorted so the row order does not depend on filesystem listing order.
    for json_file in sorted(Path(results_dir).glob("*.json")):
        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)

        protein_id = data["structure_id"]

        # Process SASA data
        sasa_df = pd.DataFrame(data['sasa_data'])
        sasa_df['protein_id'] = protein_id
        sasa_df = sasa_df.sort_values(['chain', 'resname', 'resindex', 'atomname'])
        all_sasa_values.append(sasa_df)

        # Process other metrics; the timing list is read once and reused.
        benchmark_times = data['execution_time']["benchmark_times"]
        metrics = {
            'name': name,
            'protein_id': protein_id,
            'atoms': len(sasa_df.index),
            'ba_val': data['ba_val'],
            'kd': data['kd'],
            'execution_times': benchmark_times,
            'execution_times_min': min(benchmark_times),
            'execution_times_max': max(benchmark_times),
            'execution_times_mean': np.mean(benchmark_times),
            'execution_times_std': np.std(benchmark_times)
        }
        # Flatten nested dictionaries into top-level columns
        metrics.update(data['contacts'])
        metrics.update(data['nis'])

        all_metrics.append(metrics)

    if not all_sasa_values:
        # pd.concat([]) raises ValueError; report the empty directory and
        # return empty tables so callers can still combine other platforms.
        print(f"No JSON results found in {results_dir}")
        return pd.DataFrame(), pd.DataFrame()

    # Combine all results
    combined_sasa_df = pd.concat(all_sasa_values, ignore_index=True)
    metrics_df = pd.DataFrame(all_metrics)

    return combined_sasa_df, metrics_df

# Benchmark output directory -> label identifying the platform it came from.
results_dirs = {
    './benchmark_af/v3/results_cpu_benchmark_A100_selected_prodigy_pdbs': "cpu_A100",
    "./benchmark_af/v3/results_cpu_benchmark_M1_selected_prodigy_pdbs": "cpu_M1",
    "./benchmark_af/v3/results_gpu_benchmark_M1_selected_prodigy_pdbs": "gpu_M1",
    "./benchmark_af/v3/results_gpu_benchmark_A100_selected_prodigy_pdbs": "gpu_A100",
}

# Load each platform's results as a (sasa, metrics) pair, in dict order.
per_platform = [
    process_prodigy_results(directory, label)
    for directory, label in results_dirs.items()
]

# Stack the per-platform tables into one SASA table and one metrics table.
all_sasa_dfs = pd.concat([sasa for sasa, _ in per_platform], ignore_index=True)
all_metrics_dfs = pd.concat([metrics for _, metrics in per_platform], ignore_index=True)

# Persist both combined tables next to the notebook.
all_metrics_dfs.to_csv("all_metrics_dfs.csv")
all_sasa_dfs.to_csv("all_sasa_dfs.csv")


In [None]:
# Two side-by-side panels on one wide canvas: ax1 (left) and ax2 (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(28, 10))

# Draw a heavy black border around each panel.
for panel in (ax1, ax2):
    for border in panel.spines.values():
        border.set_color('black')
        border.set_linewidth(2)

# First subplot - Execution times vs atoms with polynomial fit
def poly_fit(x, y, degree=2):
    """Least-squares polynomial fit of ``y`` against ``x``.

    Returns a tuple ``(polynomial, coefficients)``: a callable
    ``numpy.poly1d`` and the coefficient array it was built from
    (highest power first, as produced by ``numpy.polyfit``).
    """
    coefficients = np.polyfit(x, y, degree)
    return np.poly1d(coefficients), coefficients

# Define colors for each implementation
colors = {
    'cpu_A100': 'blue',
    'cpu_M1': 'green',
    'gpu_A100': 'red',
    'gpu_M1': 'purple'
}

# The timing panel reads the combined metrics table built when the result
# JSON files were loaded. The notebook previously referred to an undefined
# `df`, which only worked with leftover kernel state.
timing_df = all_metrics_dfs


def _scatter_with_trend(ax, x, y, label, color, **scatter_kwargs):
    """Scatter ``(x, y)`` on ``ax`` and overlay a dashed quadratic fit."""
    ax.scatter(x, y, label=label, alpha=0.7, color=color, **scatter_kwargs)
    trend, _ = poly_fit(x, y, degree=2)
    x_fit = np.linspace(min(x), max(x), 100)
    ax.plot(x_fit, trend(x_fit), '--', alpha=0.5, color=color)


for name in timing_df['name'].unique():
    subset = timing_df[timing_df['name'] == name]
    x_i = subset['atoms'].values
    color = colors[name]

    # Use different execution time metrics for GPU vs CPU
    if 'gpu' in name.lower():
        # GPU: fastest run (labelled post-compilation) as hollow markers ...
        _scatter_with_trend(
            ax1, x_i, subset['execution_times_min'].values,
            label=f"{name} (min, n={len(x_i)}, post-compilation)",
            color=color, marker='o', facecolors='none',
        )
        # ... and slowest run (labelled with compilation) as filled markers.
        _scatter_with_trend(
            ax1, x_i, subset['execution_times_max'].values,
            label=f"{name} (max, n={len(x_i)}, with compilation)",
            color=color, marker='o',
        )
    else:
        # CPU: mean over the benchmark repetitions.
        _scatter_with_trend(
            ax1, x_i, subset['execution_times_mean'].values,
            label=f"{name} (n={len(x_i)})",
            color=color,
        )

ax1.set_xlabel('Number of Atoms', fontsize=12)
ax1.set_ylabel('Execution Time (s)', fontsize=12)
ax1.set_ylim(-0.2, None)
ax1.set_xlim(500, None)

ax1.legend(fontsize=12, loc='upper left')
ax1.grid(True, linestyle='--', alpha=0.6)
ax1.set_title('Across Platforms: \nExecution Time vs. System Size', fontsize=14)

# Second subplot - Calculate mean RMSE across implementations
metrics = ["ba_val", "AA", "CC", "PP", "AC", "AP", "CP", "IC",
          "chargedC", "polarC", "aliphaticC", "aliphatic", "charged", "polar"]

# Use the combined metrics table built when the result JSON files were
# loaded. The notebook previously referred to an undefined `df`, which only
# worked with leftover kernel state.
consistency_df = all_metrics_dfs

# One row per (protein, platform) holding each metric's mean value
grouped_df = consistency_df.groupby(["protein_id", "name"])[metrics].mean()

# For every metric: per protein, the root of the mean squared difference over
# all platform pairs; then the average of that value over proteins.
rmse_results = []
for metric in metrics:
    metric_rmses = []
    for protein_id in consistency_df['protein_id'].unique():
        values = grouped_df.loc[protein_id][metric].values
        # Index pairs (i, j) with i < j, i.e. each platform pair exactly once
        first, second = np.triu_indices(len(values), k=1)
        if len(first) > 0:
            squared_diffs = (values[first] - values[second]) ** 2
            metric_rmses.append(np.sqrt(np.mean(squared_diffs)))
    # A protein seen on a single platform contributes no pair; if that holds
    # for every protein, report NaN instead of averaging an empty list.
    rmse_results.append(np.mean(metric_rmses) if metric_rmses else np.nan)

# Create DataFrame for plotting
rmse_df = pd.DataFrame(rmse_results, index=metrics, columns=['RMSE'])

# Show the per-metric RMSE table as an annotated single-column heatmap on the
# right-hand panel.
heatmap_options = dict(
    annot=True,
    cmap='YlOrRd',
    ax=ax2,
    fmt='.2e',
    cbar_kws={'label': 'Mean RMSE'},
)
sns.heatmap(rmse_df, **heatmap_options)

ax2.set_xlabel('Mean RMSE', fontsize=12)
ax2.set_ylabel('Metric', fontsize=12)
ax2.set_title('Consistency:\nMean RMSE Across Platforms per Metric',
              fontsize=14,
              pad=20)

# Tidy the layout, display the figure, then write it to disk.
# NOTE(review): the file name spells "bechmark"; left unchanged in case other
# material refers to the file by this name.
plt.tight_layout()
plt.show()
fig.savefig("v3_bechmark_af.png")

In [None]:
from bio_lib.custom_prodigy import predict_binding_affinity

# Run the binding-affinity prediction for one example complex (chains A and B)
# and write any output into the current directory.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
example_pdb = "/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb"
res = predict_binding_affinity(
    example_pdb,
    selection="A,B",
    output_dir=".",
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Loss values entered by hand as a literal list (80 entries, 10 per row).
# NOTE(review): the file these were taken from is not referenced in the
# notebook, and the entries are presumably in step order - confirm.
loss_values = [
    0.24, 0.10, -1.06, -1.61, -1.58, -0.11, -0.64, -2.54, -3.12, -1.35,
    -3.29, -1.80, -4.27, -6.28, -2.02, -2.87, -3.39, -2.28, -4.46, -2.54,
    -2.81, -3.49, -10.48, -2.64, -3.51, -14.84, -3.01, -4.51, -5.10, -5.02,
    -5.24, -3.34, -6.71, -6.21, -3.37, -2.96, -4.37, -8.41, -4.06, -4.10,
    -4.84, -4.90, -4.01, -4.51, -3.99, -4.34, -7.99, -4.92, -4.70, -11.73,
    -4.50, -4.78, -5.65, -4.86, -11.32, -5.90, -7.05, -5.00, -5.22, -6.26,
    -8.13, -12.18, -11.37, -5.84, -10.29, -8.89, -3.35, -7.57, -5.15, -5.61,
    -6.37, -9.50, -6.32, -6.43, -5.89, -8.31, -6.62, -5.59, -10.87, -11.13
]

# One x position per recorded loss value, numbered from 1.
steps = np.arange(1, len(loss_values) + 1)

# Draw the loss trajectory on its own figure.
plt.figure(figsize=(10, 5))
plt.plot(steps, loss_values, label="Loss", linestyle='-', marker='o')

# Titles, axis labels, grid and legend.
plt.title("Loss over Time")
plt.xlabel("Step")
plt.ylabel("Loss Value")
plt.grid(True)
plt.legend()

# Render the figure.
plt.show()
# Additional per-step series entered by hand: i_ptm and ba_val.
# NOTE(review): i_ptm_values holds 78 entries while ba_val_values (and the
# loss list) hold 80. The later trimming step drops the last two loss/ba_val
# entries; if the two missing i_ptm values are not the final ones, the series
# are misaligned from that point on - confirm against the original log.
i_ptm_values = [
    0.15, 0.19, 0.19, 0.19, 0.19, 0.16, 0.15, 0.21, 0.20, 0.16,
    0.20, 0.16, 0.20, 0.20, 0.17, 0.20, 0.20, 0.15, 0.20, 0.17,
    0.20, 0.22, 0.21, 0.17, 0.18, 0.21, 0.17, 0.17, 0.21, 0.22,
    0.20, 0.17, 0.19, 0.23, 0.18, 0.18, 0.19, 0.26, 0.22, 0.18,
    0.22, 0.24, 0.20, 0.18, 0.18, 0.20, 0.18, 0.25, 0.18, 0.18,
    0.21, 0.19, 0.26, 0.21, 0.23, 0.19, 0.18, 0.24, 0.24, 0.26,
    0.24, 0.18, 0.23, 0.21, 0.19, 0.25, 0.18, 0.17, 0.19, 0.23,
    0.22, 0.18, 0.22, 0.18, 0.19, 0.21, 0.18, 0.24
]

# ba_val series: 80 entries, 10 per row.
ba_val_values = [
    -6.09, -6.09, -6.29, -6.43, -6.35, -6.54, -6.58, -6.52, -6.71, -6.66,
    -6.79, -6.62, -7.28, -8.21, -6.53, -6.54, -6.68, -6.76, -7.35, -6.71,
    -6.42, -6.79, -10.14, -6.85, -6.97, -12.40, -6.75, -7.39, -7.54, -7.42,
    -7.70, -6.80, -8.38, -8.02, -7.01, -6.68, -7.24, -9.10, -6.97, -7.25,
    -7.39, -7.34, -7.33, -7.40, -7.25, -7.47, -8.82, -7.61, -7.56, -10.73,
    -7.46, -7.42, -7.84, -7.73, -10.43, -7.93, -8.42, -7.57, -7.70, -8.00,
    -8.86, -10.95, -10.53, -7.97, -10.04, -9.45, -6.73, -8.53, -7.63, -7.95,
    -8.17, -9.66, -8.15, -8.42, -7.92, -9.12, -8.56, -8.04, -10.32, -10.42
]

# The three series may differ in length; find the shortest one.
min_length = min(map(len, (loss_values, i_ptm_values, ba_val_values)))

# Cut every series down to that common length (entries are dropped from the
# end only).
loss_values, i_ptm_values, ba_val_values = (
    series[:min_length]
    for series in (loss_values, i_ptm_values, ba_val_values)
)

# Shared x axis: min_length evenly spaced values from 1 to min_length.
steps = np.linspace(1, min_length, min_length)

# Overlay the three trimmed series on one figure, sharing the same y axis.
plt.figure(figsize=(12, 6))
for series, marker, linestyle, label in (
    (loss_values, 'o', '-', "Loss"),
    (i_ptm_values, 'd', '-.', "i_PTM"),
    (ba_val_values, 'v', '-', "BA Value"),
):
    plt.plot(steps, series, marker=marker, linestyle=linestyle, label=label)

# Titles, axis labels, grid and legend.
plt.title("Loss and Other Metrics Over Time")
plt.xlabel("Step")
plt.ylabel("Values")
plt.grid(True)
plt.legend()

# Render the figure.
plt.show()