In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from scipy import stats
import os, json
import seaborn as sns

def load_and_process_data(benchmark_path: str, results_path: str):
    """Load the benchmark and results JSON files into two DataFrames.

    Args:
        benchmark_path: Path to a JSON object keyed by PDB id; each value
            becomes one row of the benchmark frame.
        results_path: Path to a JSON object keyed by PDB id whose values
            carry ``ba_val``, ``contacts``, ``nis`` and ``execution_time``.

    Returns:
        ``(df_benchmark, df_results)``, both indexed by PDB id. Result
        entries that lack any required key are reported on stdout and
        left out of ``df_results``.
    """
    def _read_json(path):
        with open(path) as handle:
            return json.load(handle)

    benchmark = _read_json(benchmark_path)
    results = _read_json(results_path)

    df_benchmark = pd.DataFrame.from_dict(benchmark, orient='index')

    # Output column -> key inside the nested "contacts" / "nis" dicts.
    contact_keys = ('CC', 'CP', 'AC', 'PP', 'AP', 'AA')
    nis_keys = (('nis_p', 'polar'), ('nis_a', 'aliphatic'), ('nis_c', 'charged'))

    processed_results = {}
    for pdb_id, entry in results.items():
        try:
            row = {'ba_val': entry['ba_val']}
            contacts = entry['contacts']
            for key in contact_keys:
                row[key] = contacts[key]
            nis = entry['nis']
            for column, key in nis_keys:
                row[column] = nis[key]
            row['execution_time'] = entry['execution_time']["seconds"]
        except KeyError as e:
            print(f"Warning: Missing data for {pdb_id}: {e}")
        else:
            # Only complete rows make it into the results table.
            processed_results[pdb_id] = row

    df_results = pd.DataFrame.from_dict(processed_results, orient='index')

    return df_benchmark, df_results

def calculate_correlations(df_benchmark: pd.DataFrame, df_results: pd.DataFrame):
    """Compare benchmark and result values metric by metric.

    Only PDB ids present in both frames are used, and only metrics that
    exist as a column in both frames are reported.

    Args:
        df_benchmark: Reference values, indexed by PDB id.
        df_results: Computed values, indexed by PDB id.

    Returns:
        A DataFrame with one row per compared metric and the columns
        ``Metric``, ``Pearson r``, ``p-value`` and ``RMSE``.
    """
    shared_ids = sorted(set(df_benchmark.index).intersection(df_results.index))
    print(f"Common PDB entries: {len(shared_ids)}")

    labels = {
        'ba_val': 'Binding Affinity',
        'CC': 'Charged-Charged contacts',
        'CP': 'Charged-Polar contacts',
        'AC': 'Aliphatic-Charged contacts',
        'PP': 'Polar-Polar contacts',
        'AP': 'Aliphatic-Polar contacts',
        'AA': 'Aliphatic-Aliphatic contacts',
        'nis_p': 'NIS Polar',
        'nis_a': 'NIS Aliphatic',
        'nis_c': 'NIS Charged'
    }

    rows = []
    for column, label in labels.items():
        if column not in df_benchmark.columns or column not in df_results.columns:
            continue

        expected = df_benchmark.loc[shared_ids, column]
        observed = df_results.loc[shared_ids, column]

        r_value, p_value = stats.pearsonr(expected, observed)
        squared_error = (expected - observed) ** 2
        rows.append({
            'Metric': label,
            'Pearson r': r_value,
            'p-value': p_value,
            'RMSE': np.sqrt(np.mean(squared_error)),
        })

    return pd.DataFrame(rows)

def plot_correlations(df_benchmark: pd.DataFrame, df_results: pd.DataFrame, output_dir: str):
    """Draw one benchmark-vs-result scatter panel per metric and save the grid.

    Only PDB ids present in both frames are plotted. A metric that is not a
    column in both frames is skipped and its panel hidden, matching how
    ``calculate_correlations`` treats missing metrics (previously this
    raised ``KeyError``).

    Args:
        df_benchmark: Reference values, indexed by PDB id (x axis).
        df_results: Computed values, indexed by PDB id (y axis).
        output_dir: Directory that receives ``correlations.png``; created
            if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    common_pdbs = sorted(set(df_benchmark.index) & set(df_results.index))

    metrics = {
        'ba_val': 'Binding Affinity',
        'CC': 'Charged-Charged contacts',
        'CP': 'Charged-Polar contacts',
        'AC': 'Aliphatic-Charged contacts',
        'PP': 'Polar-Polar contacts',
        'AP': 'Aliphatic-Polar contacts',
        'AA': 'Aliphatic-Aliphatic contacts',
        'nis_p': 'NIS Polar',
        'nis_a': 'NIS Aliphatic',
        'nis_c': 'NIS Charged'
    }

    fig, axes = plt.subplots(2, 5, figsize=(15, 10))
    axes = axes.ravel()

    # zip stops at the shorter sequence, so a metric is never drawn without
    # an axis to draw it on.
    for ax, (metric, title) in zip(axes, metrics.items()):
        if metric not in df_benchmark.columns or metric not in df_results.columns:
            # Nothing to compare: hide the empty panel instead of crashing.
            ax.set_visible(False)
            continue

        bench_vals = df_benchmark.loc[common_pdbs, metric]
        result_vals = df_results.loc[common_pdbs, metric]

        pearson = stats.pearsonr(bench_vals, result_vals)[0]

        ax.scatter(bench_vals, result_vals, alpha=0.6)
        # Identity line (y = x) over the benchmark range as a visual reference.
        lo, hi = min(bench_vals), max(bench_vals)
        ax.plot([lo, hi], [lo, hi], 'r--')

        ax.set_xlabel('Prodigy ORG')
        ax.set_ylabel('Prodigy JAX')
        ax.set_title(f'{title}\nr = {pearson:.3f}')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'correlations.png'), dpi=300, bbox_inches='tight')
    plt.show()

def add_sequence_lengths(df: pd.DataFrame, pdb_folder: str):
    """Append per-chain residue counts read from ``<pdb_folder>/<pdb_id>.pdb``.

    The length of chains ``A`` and ``B`` is the number of distinct residue
    identifiers (residue number plus insertion code, PDB columns 23-27)
    seen on ``ATOM`` records of that chain. This replaces the former
    "atom count // 8" estimate, which depended on residue composition and
    on whether hydrogens were present.

    Args:
        df: Frame indexed by PDB id.
        pdb_folder: Directory holding one ``<pdb_id>.pdb`` file per entry.

    Returns:
        A new frame equal to ``df`` plus the columns ``chain_a_length`` and
        ``chain_b_length``. Both columns are always present; rows whose PDB
        file is missing or unreadable hold NaN.
    """
    length_columns = ['chain_a_length', 'chain_b_length']
    lengths = {}

    for pdb_id in df.index:
        pdb_file = os.path.join(pdb_folder, f"{pdb_id}.pdb")
        if not os.path.exists(pdb_file):
            continue

        residues = {'A': set(), 'B': set()}
        try:
            with open(pdb_file, 'r') as f:
                for line in f:
                    # Truncated ATOM lines carry no chain/residue fields;
                    # skip them instead of discarding the whole file.
                    if not line.startswith('ATOM') or len(line) <= 22:
                        continue
                    chain = line[21]
                    residue_id = line[22:27].strip()
                    if chain in residues and residue_id:
                        residues[chain].add(residue_id)
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error processing {pdb_id}: {e}")
            continue

        lengths[pdb_id] = {
            'chain_a_length': len(residues['A']),
            'chain_b_length': len(residues['B']),
        }

    # reindex guarantees both columns exist even when no file was readable,
    # so callers can always access them.
    length_df = pd.DataFrame.from_dict(lengths, orient='index').reindex(columns=length_columns)
    return pd.concat([df, length_df], axis=1)

def _load_sorted_sasa(csv_path):
    """Read one SASA CSV and return it in a canonical atom order."""
    data = pd.read_csv(csv_path)
    data['resid'] = data['resid'].astype(int)
    return data.sort_values(['chain', "resname", 'resid', 'atom'])


def compare_sasa_results(gpu_dir: str, cpu_dir: str, rmse_threshold: float = 2.0,
                         output_path: str = 'sasa_comparison.png'):
    """Compare per-atom SASA values between a GPU run and a CPU run.

    For every sub-directory of ``gpu_dir`` that has a same-named directory
    under ``cpu_dir``, the first CSV (in sorted path order) found below each
    is loaded, both tables are sorted by chain/resname/resid/atom, and the
    ``sasa`` columns are compared row by row. Structures whose two tables
    differ in length are reported on stdout and skipped.

    A two-panel figure is saved to ``output_path`` and shown: RMSE against
    structure size, and CPU vs GPU SASA per atom with structures above
    ``rmse_threshold`` drawn in a separate colour.

    Args:
        gpu_dir: Directory holding one sub-directory per structure (GPU run).
        cpu_dir: Directory with the same layout for the CPU run.
        rmse_threshold: RMSE above which a structure is labelled in the left
            panel and highlighted in the right panel.
        output_path: File the figure is written to.

    Returns:
        ``(summary_df, all_sasa_df)``: one row per structure with
        rmse/correlation/diff statistics, and the concatenated per-atom
        comparison with an added boolean ``high_rmse`` column.

    Raises:
        ValueError: If no structure could be compared.
    """
    gpu_path = Path(gpu_dir)
    cpu_path = Path(cpu_dir)
    all_comparisons = []
    all_sasa_values = []

    # Sorted traversal keeps row order and CSV choice reproducible across
    # file systems.
    for protein_dir in sorted(gpu_path.glob("*")):
        if not protein_dir.is_dir():
            continue

        protein_name = protein_dir.name
        gpu_csv = sorted(protein_dir.rglob("*.csv"))
        cpu_csv = sorted((cpu_path / protein_name).rglob("*.csv"))

        if not gpu_csv or not cpu_csv:
            continue

        gpu_data = _load_sorted_sasa(gpu_csv[0])
        cpu_data = _load_sorted_sasa(cpu_csv[0])

        if len(gpu_data) != len(cpu_data):
            print(f"Length mismatch in {protein_name}: GPU={len(gpu_data)}, CPU={len(cpu_data)}")
            continue

        # Rows are paired by position after sorting; the identifying columns
        # of both sides are kept so mismatched pairings can be inspected.
        comparison = pd.DataFrame({
            'sasa_cpu': cpu_data['sasa'].values,
            'sasa_gpu': gpu_data['sasa'].values,
            'diff': abs(cpu_data['sasa'].values - gpu_data['sasa'].values),
            'protein': protein_name,
            'chain_gpu': gpu_data['chain'].values,
            'resname_gpu': gpu_data['resname'].values,
            'resid_gpu': gpu_data['resid'].values,
            'atom_gpu': gpu_data['atom'].values,
            'chain_cpu': cpu_data['chain'].values,
            'resname_cpu': cpu_data['resname'].values,
            'resid_cpu': cpu_data['resid'].values,
            'atom_cpu': cpu_data['atom'].values
        })

        all_sasa_values.append(comparison)

        rmse = np.sqrt(np.mean(comparison['diff']**2))
        correlation = stats.pearsonr(comparison['sasa_cpu'], comparison['sasa_gpu'])[0]

        all_comparisons.append({
            'protein': protein_name,
            'rmse': rmse,
            'correlation': correlation,
            'mean_diff': comparison['diff'].mean(),
            'max_diff': comparison['diff'].max(),
            'num_atoms': len(comparison),
            'num_nonzero': int((comparison['sasa_gpu'] > 0).sum())
        })

    if not all_sasa_values:
        # pd.concat([]) would raise an opaque ValueError; say what is wrong.
        raise ValueError(
            f"No comparable SASA CSV pairs found under {gpu_dir!r} and {cpu_dir!r}"
        )

    summary_df = pd.DataFrame(all_comparisons)
    all_sasa_df = pd.concat(all_sasa_values)

    # Flag every atom that belongs to a structure above the RMSE threshold.
    flagged = summary_df[summary_df['rmse'] > rmse_threshold]
    high_rmse_proteins = set(flagged['protein'])
    all_sasa_df['high_rmse'] = all_sasa_df['protein'].isin(high_rmse_proteins)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # RMSE scatter plot, labelling only the flagged structures
    ax1.scatter(summary_df['num_atoms'], summary_df['rmse'], alpha=0.6)
    for row in flagged.itertuples(index=False):
        ax1.annotate(row.protein, (row.num_atoms, row.rmse))
    ax1.set_xlabel('Number of Atoms')
    ax1.set_ylabel('RMSE (Å²)')
    ax1.set_title('RMSE GPU and CPU vs Structure Size')

    # SASA values comparison with color coding
    normal_points = all_sasa_df[~all_sasa_df['high_rmse']]
    high_rmse_points = all_sasa_df[all_sasa_df['high_rmse']]

    ax2.scatter(normal_points['sasa_cpu'], normal_points['sasa_gpu'],
                alpha=0.1, color='blue', label=f'RMSE ≤ {rmse_threshold:g}')
    if not high_rmse_points.empty:
        ax2.scatter(high_rmse_points['sasa_cpu'], high_rmse_points['sasa_gpu'],
                    alpha=0.1, color='red', label=f'RMSE > {rmse_threshold:g}')

    max_val = max(all_sasa_df['sasa_cpu'].max(), all_sasa_df['sasa_gpu'].max())
    ax2.plot([0, max_val], [0, max_val], 'k--', label='y = x')
    ax2.set_xlabel('CPU SASA (Å²)')
    ax2.set_ylabel('GPU SASA (Å²)')
    ax2.set_title('CPU vs GPU SASA Values')
    ax2.legend()

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

    return summary_df, all_sasa_df

In [None]:
# --- Benchmark comparison driver -------------------------------------------
# Input/output locations for this analysis run.
benchmark = "./benchmark_af/dataset.json"
results = "./benchmark_af/20250126_140901_gpu/combined_results.json"
output = "output_comp"
dataset = "./benchmark_af/PRODIGYdataset/" # make sure you have the dataset
os.makedirs(output, exist_ok=True)

# Correlate computed values with the benchmark values and persist the table.
df_benchmark, df_results = load_and_process_data(benchmark, results)
correlations = calculate_correlations(df_benchmark, df_results)
print("\nCorrelation Analysis:")
print(correlations.to_string(index=False))
correlations.to_csv(f'{output}/correlations.csv', index=False)
plot_correlations(df_benchmark, df_results, output)

# Save processed DataFrames
df_benchmark.to_csv(f'{output}/benchmark_processed.csv')
df_results.to_csv(f'{output}/results_processed.csv')

# Execution time against structure size.
df = add_sequence_lengths(df_results, dataset)
df['total_length'] = df['chain_a_length'] + df['chain_b_length']

plt.figure(figsize=(10, 6))
plt.scatter(df['total_length'], df['execution_time'])

# Add trend line
# add_sequence_lengths leaves NaN lengths for entries whose PDB file is
# missing, and np.polyfit cannot fit through NaN, so fit on complete rows only.
fit_df = df.dropna(subset=['total_length', 'execution_time'])
if len(fit_df) >= 2:
    z = np.polyfit(fit_df['total_length'], fit_df['execution_time'], 1)
    p = np.poly1d(z)
    plt.plot(fit_df['total_length'], p(fit_df['total_length']), "r--", alpha=0.8)
else:
    print("Skipping trend line: fewer than two entries with a known length")

# Calculate correlation
corr = df['total_length'].corr(df['execution_time'])

plt.xlabel('Total Sequence Length (residues)')
plt.ylabel('Execution Time (s)')
plt.title(f'Execution Time vs Sequence Length\nCorrelation: {corr:.3f}')
plt.grid(True, alpha=0.3)

# Per-atom SASA agreement between the GPU run and the CPU run.
summary_df, all_sasa_df = compare_sasa_results("./benchmark_af/20250126_140901_gpu", "./benchmark_af/20250127_160612_cpu")


In [None]:
# Run the JAX predictor on the 1ACB structure; the return value is only displayed.
# NOTE(review): `predict_binding_affinity_jax` is neither imported nor defined in
# the visible cells, and the absolute path is machine-specific — confirm both.
predict_binding_affinity_jax("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb")

In [None]:
# Display the prediction result.
# NOTE(review): `res` is only assigned in a later cell (the
# `predict_binding_affinity(...)` call); that cell has to run first.
res

In [38]:
# Pull the "atom_sasa" entry out of the result's SASA data
# (presumably per-atom SASA values — TODO confirm type and ordering).
r = res.sasa_data["atom_sasa"]

In [30]:
import pandas as pd
# Load the `sasa` column of a previously exported CSV for 1ACB as the values
# to compare against. NOTE(review): absolute, machine-specific path.
o = pd.read_csv("/Users/alessio/Documents/Repos/bio_lib/1ACB_sasa_data.csv").sasa

In [None]:
# Signed sum of the element-wise differences between the CSV values and `r`.
# NOTE(review): positive and negative differences cancel, so a value near zero
# does not prove per-atom agreement; this also assumes `o` and `r` list atoms
# in the same order — confirm.
(o -r).sum()

In [None]:
from bio_lib.custom_prodigy import predict_binding_affinity

# Run the predictor on 1ACB with selection "A,B" and output directory "."
# (the current working directory), keeping the result in `res` for the
# inspection cells.
res = predict_binding_affinity("/Users/alessio/Documents/Repos/dr_sasa_python/data/PRODIGYdataset/1ACB.pdb", selection="A,B", output_dir=".")

In [None]:
# Display the prediction result assigned by the `predict_binding_affinity(...)` cell.
res

In [None]:
# Display the prediction result again (repeats the previous cell).
res

In [None]:
import re
import pandas as pd
from pathlib import Path
import re
import pandas as pd
from pathlib import Path

# Header of one "Processing <file>.pdb" section, followed by its atom count.
_PDB_HEADER_RE = re.compile(r'(.+\.pdb)\nNumber of atoms: (\d+)')

# One block-size measurement: the raw run-time list and its summary statistics.
_BLOCK_RE = re.compile(
    r'Block size (\d+):\n  Run times: \[(.*?)\]\n  Mean: ([\d.]+)s, Median: ([\d.]+)s, Std: ([\d.]+)s'
)

_PERFORMANCE_COLUMNS = [
    'PDB_File', 'Number_of_Atoms', 'Block_Size',
    'Run1_Time', 'Run2_Time', 'Run3_Time',
    'Mean_Time', 'Median_Time', 'Std_Time',
]


def parse_performance_log(log_text):
    """Parse a block-size benchmark log into a tidy DataFrame.

    The log is split on the word ``Processing``; each section must start
    with ``<name>.pdb`` followed by a ``Number of atoms: N`` line and may
    contain any number of ``Block size`` measurements. Sections without
    that header are ignored.

    Args:
        log_text: Full text of the log. ``\\r\\n`` line endings are accepted.

    Returns:
        A DataFrame with one row per (PDB file, block size) and the columns
        ``PDB_File`` (file stem), ``Number_of_Atoms``, ``Block_Size``,
        ``Run1_Time``..``Run3_Time``, ``Mean_Time``, ``Median_Time`` and
        ``Std_Time``. Missing runs are ``None``/NaN; runs beyond the third
        are ignored. The columns are present even when nothing was parsed.
    """
    # The patterns match on "\n" only, so normalise Windows line endings first.
    log_text = log_text.replace('\r\n', '\n')

    data = []
    for section in log_text.split('Processing')[1:]:
        header = _PDB_HEADER_RE.search(section)
        if not header:
            continue

        pdb_name = Path(header.group(1).strip()).stem
        num_atoms = int(header.group(2))

        for block in _BLOCK_RE.finditer(section):
            # Entries look like '0.123s'; drop the quotes and the unit.
            # Empty tokens (e.g. from "Run times: []") are ignored instead
            # of being passed to float().
            run_times = [
                float(token.strip().strip("'s"))
                for token in block.group(2).split(',')
                if token.strip()
            ]
            # Pad so the three Run*_Time columns always have a value.
            run_times += [None] * (3 - len(run_times))

            data.append({
                'PDB_File': pdb_name,
                'Number_of_Atoms': num_atoms,
                'Block_Size': int(block.group(1)),
                'Run1_Time': run_times[0],
                'Run2_Time': run_times[1],
                'Run3_Time': run_times[2],
                'Mean_Time': float(block.group(3)),
                'Median_Time': float(block.group(4)),
                'Std_Time': float(block.group(5)),
            })

    # A fixed column list keeps downstream groupby/column access working
    # even for a log with no matches.
    return pd.DataFrame(data, columns=_PERFORMANCE_COLUMNS)

# Read the log file content
# ('paste.txt' is resolved relative to the current working directory)
with open('paste.txt', 'r') as file:
    log_text = file.read()

# Parse the log
# `df` is reused by the curve-fitting cell that follows.
df = parse_performance_log(log_text)

# Display the DataFrame
print(df)
print("\nDataFrame Info:")
df.info()

# Optional: Save to CSV
df.to_csv('pdb_performance_data.csv', index=False)

# Additional analysis
# Per PDB file: atom count, mean/min/max of the per-block-size mean times,
# and the smallest and largest block size that was measured.
print("\nSummary Statistics:")
print(df.groupby('PDB_File').agg({
    'Number_of_Atoms': 'first',
    'Mean_Time': ['mean', 'min', 'max'],
    'Block_Size': ['min', 'max']
}))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit


# Group by PDB file and take the first row for each
# NOTE(review): this keeps a single block-size measurement per PDB file (the
# first one in `df`), not the fastest one — confirm that is the intended
# sample for the fits below.
grouped_data = df.groupby('PDB_File').first().reset_index()

# Growth model fitted to Mean Time: an exponential in x plus a constant offset.
def exp_func_time(x, a, b, c):
    """Return ``a * exp(b * x) + c`` (element-wise for array input)."""
    growth = np.exp(b * x)
    return a * growth + c

# Decay model fitted to Block Size: a decaying exponential in x plus a constant offset.
def exp_decay_func(x, a, b, c):
    """Return ``a * exp(-b * x) + c`` (element-wise for array input)."""
    exponent = -b * x
    return a * np.exp(exponent) + c

# Fit exponential models of Mean Time and Block Size against the atom count,
# plot both on a shared x axis with twin y axes, and print the fit details.

# Prepare data for fitting
x = grouped_data['Number_of_Atoms']
y_time = grouped_data['Mean_Time']
y_blocksize = grouped_data['Block_Size']

# Perform curve fitting for Mean Time
# (p0 holds the initial guesses for a, b, c)
popt_time, _ = curve_fit(exp_func_time, x, y_time, p0=[0.001, 0.0001, 1])

# Perform curve fitting for Block Size (using decay function)
popt_blocksize, _ = curve_fit(exp_decay_func, x, y_blocksize, p0=[100, 0.0001, 1])

# Generate points for the fitted curves
x_fit = np.linspace(x.min(), x.max(), 100)
y_fit_time = exp_func_time(x_fit, *popt_time)
y_fit_blocksize = exp_decay_func(x_fit, *popt_blocksize)

# Create the plot with two y-axes
fig, ax1 = plt.subplots(figsize=(12, 7))

# First y-axis - Mean Time
color1 = 'blue'
ax1.set_xlabel('Number of Atoms')
ax1.set_ylabel('Mean Time (s)', color=color1)
ax1.errorbar(grouped_data['Number_of_Atoms'], grouped_data['Mean_Time'], 
             yerr=grouped_data['Std_Time'], 
             fmt='o', 
             capsize=5, 
             color=color1, 
             alpha=0.7, 
             label='Mean Time with Std Dev')
ax1.plot(x_fit, y_fit_time, color=color1, linestyle='--', 
         label=f'Time Fit: {popt_time[0]:.4e} * exp({popt_time[1]:.4e} * x) + {popt_time[2]:.4f}')
ax1.tick_params(axis='y', labelcolor=color1)

# Calculate R-squared for Mean Time
# R² = 1 - SS_res / SS_tot, evaluated on the fitted data points.
residuals_time = y_time - exp_func_time(x, *popt_time)
ss_res_time = np.sum(residuals_time**2)
ss_tot_time = np.sum((y_time - np.mean(y_time))**2)
r_squared_time = 1 - (ss_res_time / ss_tot_time)

# Second y-axis - Block Size
ax2 = ax1.twinx()
color2 = 'red'
ax2.set_ylabel('Block Size', color=color2)
ax2.scatter(grouped_data['Number_of_Atoms'], grouped_data['Block_Size'], 
            color=color2, alpha=0.7, label='Block Size')
ax2.plot(x_fit, y_fit_blocksize, color=color2, linestyle='--', 
         label=f'Block Size Fit: {popt_blocksize[0]:.4e} * exp(-{popt_blocksize[1]:.4e} * x) + {popt_blocksize[2]:.4f}')
ax2.tick_params(axis='y', labelcolor=color2)

# Calculate R-squared for Block Size
# NOTE(review): if every row has the same Block_Size, ss_tot_blocksize is 0
# and this division is undefined — confirm the data varies.
residuals_blocksize = y_blocksize - exp_decay_func(x, *popt_blocksize)
ss_res_blocksize = np.sum(residuals_blocksize**2)
ss_tot_blocksize = np.sum((y_blocksize - np.mean(y_blocksize))**2)
r_squared_blocksize = 1 - (ss_res_blocksize / ss_tot_blocksize)

# Title and layout
plt.title('Number of Atoms vs Mean Time and Block Size')
fig.tight_layout()

# Combine legends
# (each axis tracks its own handles, so merge them into one legend on ax1)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='best')

# Add R-squared annotations
ax1.annotate(f'R² (Time) = {r_squared_time:.4f}', 
             xy=(0.05, 0.95), 
             xycoords='axes fraction', 
             fontsize=10, 
             color=color1,
             verticalalignment='top')
ax1.annotate(f'R² (Block Size) = {r_squared_blocksize:.4f}', 
             xy=(0.05, 0.90), 
             xycoords='axes fraction', 
             fontsize=10, 
             color=color2,
             verticalalignment='top')

# Save the plot
plt.savefig('atoms_vs_time_and_blocksize.png')

# Print fitting details
print("Mean Time Exponential Fitting Details:")
print(f"Equation: y = {popt_time[0]:.4e} * exp({popt_time[1]:.4e} * x) + {popt_time[2]:.4f}")
print(f"R-squared: {r_squared_time:.4f}")

print("\nBlock Size Exponential Decay Fitting Details:")
print(f"Equation: y = {popt_blocksize[0]:.4e} * exp(-{popt_blocksize[1]:.4e} * x) + {popt_blocksize[2]:.4f}")
print(f"R-squared: {r_squared_blocksize:.4f}")

# Print data with predictions
# Residual = observed value minus the value predicted by the fitted model.
grouped_data['Predicted_Time'] = exp_func_time(grouped_data['Number_of_Atoms'], *popt_time)
grouped_data['Time_Residual'] = grouped_data['Mean_Time'] - grouped_data['Predicted_Time']
grouped_data['Predicted_BlockSize'] = exp_decay_func(grouped_data['Number_of_Atoms'], *popt_blocksize)
grouped_data['BlockSize_Residual'] = grouped_data['Block_Size'] - grouped_data['Predicted_BlockSize']

print("\nData with Predictions:")
print(grouped_data[['PDB_File', 'Number_of_Atoms', 'Mean_Time', 'Predicted_Time', 'Time_Residual', 
                    'Block_Size', 'Predicted_BlockSize', 'BlockSize_Residual']].to_string(index=False))