## Initialization

In [None]:
from plapt import Plapt
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import numpy as np

# Single shared Plapt instance: each benchmark section below calls its
# predict_affinity method on that section's sequence/SMILES inputs.
# NOTE(review): construction presumably loads the model — confirm the cost
# before re-running this cell.
plapt = Plapt()

# Test_2016_290

In [43]:
# Read the Test2016_290 benchmark table from disk.
test_2016_290_data = pd.read_csv("data/Test2016_290.csv")

# Pull the three columns used downstream out as plain Python lists, in order:
# sequences, SMILES strings, and the experimental affinity values.
(
    test_2016_290_prot_seqs,
    test_2016_290_mol_smiles,
    test_2016_290_experimental_pKd,
) = (
    test_2016_290_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [44]:
# Score the Test2016_290 sequence/SMILES lists with the shared model. Each
# returned item is later indexed with 'neg_log10_affinity_M'.
test_2016_290_predictions = plapt.predict_affinity(
    test_2016_290_prot_seqs,
    test_2016_290_mol_smiles,
)

In [45]:
# Read the predicted affinity out of every prediction record.
test_2016_290_predicted_pKd = [
    prediction['neg_log10_affinity_M'] for prediction in test_2016_290_predictions
]

# Predictions and measurements are compared element-wise below, so stop here
# if the two lists differ in length.
if len(test_2016_290_experimental_pKd) != len(test_2016_290_predicted_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the error terms can be computed vectorized.
test_2016_290_predicted_pKd = np.array(test_2016_290_predicted_pKd)
test_2016_290_experimental_pKd = np.array(test_2016_290_experimental_pKd)

# Error metrics, all derived from the per-sample residuals.
test_2016_290_residuals = test_2016_290_predicted_pKd - test_2016_290_experimental_pKd
mse = np.mean(test_2016_290_residuals ** 2)
mae = np.mean(np.abs(test_2016_290_residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation between predicted and
# experimental values; the second element of each result is discarded.
spearman_corr, _ = spearmanr(test_2016_290_predicted_pKd, test_2016_290_experimental_pKd)
pearson_corr, _ = pearsonr(test_2016_290_predicted_pKd, test_2016_290_experimental_pKd)

# Report: dataset name first, then one line per metric.
print(
    "test_2016_290",
    f"MSE: {mse}",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Spearman's Correlation: {spearman_corr}",
    f"Pearsons's Correlation (R): {pearson_corr}",
    sep="\n",
)

test_2016_290
MSE: 1.4308253988124906
MAE: 0.9060012964709052
RMSE: 1.196171141105022
Spearman's Correlation: 0.8314992191087003
Pearsons's Correlation (R): 0.8448984316773254


# CSAR HiQ 36

In [46]:
# Read the CSAR-HiQ_36 benchmark table from disk.
csar_hiq_36_data = pd.read_csv("data/CSAR-HiQ_36.csv")

# Pull the three columns used downstream out as plain Python lists, in order:
# sequences, SMILES strings, and the experimental affinity values.
(
    csar_hiq_36_prot_seqs,
    csar_hiq_36_mol_smiles,
    csar_hiq_36_experimental_pKd,
) = (
    csar_hiq_36_data[column].tolist()
    for column in ('seq', 'smiles_can', 'neg_log10_affinity_M')
)

In [47]:
# Score the CSAR-HiQ_36 sequence/SMILES lists with the shared model. Each
# returned item is later indexed with 'neg_log10_affinity_M'.
csar_hiq_36_predictions = plapt.predict_affinity(
    csar_hiq_36_prot_seqs,
    csar_hiq_36_mol_smiles,
)

In [48]:
# Read the predicted affinity out of every prediction record.
csar_hiq_36_predicted_pKd = [
    prediction['neg_log10_affinity_M'] for prediction in csar_hiq_36_predictions
]

# Predictions and measurements are compared element-wise below, so stop here
# if the two lists differ in length.
if len(csar_hiq_36_experimental_pKd) != len(csar_hiq_36_predicted_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the error terms can be computed vectorized.
csar_hiq_36_predicted_pKd = np.array(csar_hiq_36_predicted_pKd)
csar_hiq_36_experimental_pKd = np.array(csar_hiq_36_experimental_pKd)

# Error metrics, all derived from the per-sample residuals.
csar_hiq_36_residuals = csar_hiq_36_predicted_pKd - csar_hiq_36_experimental_pKd
mse = np.mean(csar_hiq_36_residuals ** 2)
mae = np.mean(np.abs(csar_hiq_36_residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation between predicted and
# experimental values; the second element of each result is discarded.
spearman_corr, _ = spearmanr(csar_hiq_36_predicted_pKd, csar_hiq_36_experimental_pKd)
pearson_corr, _ = pearsonr(csar_hiq_36_predicted_pKd, csar_hiq_36_experimental_pKd)

# Report: dataset name first, then one line per metric.
print(
    "csar_hiq_36",
    f"MSE: {mse}",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Spearman's Correlation: {spearman_corr}",
    f"Pearsons's Correlation (R): {pearson_corr}",
    sep="\n",
)


csar_hiq_36
MSE: 1.8194388728307287
MAE: 1.157283093664381
RMSE: 1.3488657727256366
Spearman's Correlation: 0.7423423423423423
Pearsons's Correlation (R): 0.7314860774522897


# Benchmark1k2101

In [49]:
# Load the custom benchmark CSV (the "Recreate Custom Benchmark" section of
# this notebook writes a file at this same path).
benchmark_data = pd.read_csv("data/benchmark1k2101.csv")

# Hand plain Python lists downstream, exactly as the Test2016_290 and
# CSAR-HiQ_36 cells do, rather than pandas Series. This keeps the inputs to
# predict_affinity the same type across all three benchmarks and avoids any
# dependence on the Series index when the values are iterated or sliced.
prot_seqs = benchmark_data['seq'].tolist()
mol_smiles = benchmark_data['smiles_can'].tolist()
experimental_pKd = benchmark_data['neg_log10_affinity_M'].tolist()

In [50]:
# Score the custom benchmark's sequences and SMILES with the shared model.
# Each returned item is later indexed with 'neg_log10_affinity_M'.
predictions = plapt.predict_affinity(
    prot_seqs,
    mol_smiles,
)

In [51]:
# Read the predicted affinity out of every prediction record.
predicted_pKd = [prediction['neg_log10_affinity_M'] for prediction in predictions]

# Predictions and measurements are compared element-wise below, so stop here
# if the two collections differ in length.
if len(experimental_pKd) != len(predicted_pKd):
    raise ValueError("The lengths of the predicted and experimental lists do not match.")

# Switch to numpy arrays so the error terms can be computed vectorized.
predicted_pKd = np.array(predicted_pKd)
experimental_pKd = np.array(experimental_pKd)

# Error metrics, all derived from the per-sample residuals.
residuals = predicted_pKd - experimental_pKd
mse = np.mean(residuals ** 2)
mae = np.mean(np.abs(residuals))
rmse = np.sqrt(mse)

# Rank (Spearman) and linear (Pearson) correlation between predicted and
# experimental values; the second element of each result is discarded.
spearman_corr, _ = spearmanr(predicted_pKd, experimental_pKd)
pearson_corr, _ = pearsonr(predicted_pKd, experimental_pKd)

# Report: dataset name first, then one line per metric.
print(
    "benchmark1k2101",
    f"MSE: {mse}",
    f"MAE: {mae}",
    f"RMSE: {rmse}",
    f"Spearman's Correlation: {spearman_corr}",
    f"Pearsons's Correlation (R): {pearson_corr}",
    sep="\n",
)

benchmark1k2101
MSE: 0.8505429481953753
MAE: 0.6883426528036773
RMSE: 0.9222488537240777
Spearman's Correlation: 0.8821411225023195
Pearsons's Correlation (R): 0.8816018987554616


## Recreate Custom Benchmark

In [None]:
from datasets import load_dataset, Dataset
import random
# Seed the global RNG so the index sample below is repeatable.
random.seed(2101)

# NOTE: this rebinds `benchmark_data`, the same name the Benchmark section
# earlier in the notebook uses for the DataFrame it reads from CSV.
train_split = load_dataset("jglaser/binding_affinity")['train']
# 1000 distinct row indices drawn from 10001..20000 (inclusive).
sampled_indices = random.sample(range(10001, 20001), 1000)
benchmark_data = train_split.select(sampled_indices)

In [None]:
# Persist the sampled subset at the path the Benchmark section reads back
# with pd.read_csv.
benchmark_csv_path = "data/benchmark1k2101.csv"
benchmark_data.to_csv(benchmark_csv_path)