In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from IPython.display import display, HTML
# Widen the notebook container so wide tables use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Seed NumPy's global random state so that the bootstrap error estimates below
# (which draw resamples with np.random.choice) are reproducible under
# "Restart Kernel -> Run All".
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [2]:
def display_df(df):
    """Render ``df`` in the cell output as the HTML table from ``df.to_html()``.

    Always returns ``None``; the function is called only for its display
    side effect.
    """
    html_table = df.to_html()
    display(HTML(html_table))

In [3]:
def pearson_r_and_err(x, y, bootstrap_repeats=1000, random_state=None):
    """Pearson correlation of two paired 1D samples with a bootstrap error.

    Parameters
    ----------
    x, y : array-like
        Paired 1D observations of equal length (NumPy array, pandas Series,
        list, ...).
    bootstrap_repeats : int, default 1000
        Number of resamples (drawn with replacement, same size as the input).
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global random state, so
        ``np.random.seed`` controls the result. Any other value is passed to
        ``np.random.default_rng`` for a self-contained, reproducible stream.

    Returns
    -------
    pearson_r : float
        Correlation coefficient computed on all samples.
    err : float
        Standard deviation (ddof=0) of the finite bootstrap correlations;
        ``nan`` when no bootstrap replicate produced a finite value.
    """
    # Work on plain arrays. Indexing a pandas Series that has an integer index
    # with an integer array is label-based, so positional bootstrap indices
    # would pick the wrong rows (or raise KeyError) whenever the index is not
    # exactly 0..n-1, e.g. after dropna() or filtering.
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, "x and y must be 1D array"
    assert x.shape[0] == y.shape[0], "x and y does not have the same len"

    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    pearson_r = np.corrcoef(x, y)[0, -1]
    nsamples = x.shape[0]

    rs = []
    for _ in range(bootstrap_repeats):
        rand_idx = sampler.choice(nsamples, size=nsamples, replace=True)
        r = np.corrcoef(x[rand_idx], y[rand_idx])[0, -1]
        # A resample with zero variance yields nan; skip such replicates.
        if np.isfinite(r):
            rs.append(r)

    rs = np.array(rs)
    err = rs.std() if rs.size else np.nan
    return pearson_r, err


def rmse_and_err(x, y, bootstrap_repeats=1000, random_state=None):
    """Root-mean-square error between two paired 1D samples with a bootstrap error.

    Parameters
    ----------
    x, y : array-like
        Paired 1D numeric observations of equal length (NumPy array, pandas
        Series, list, ...).
    bootstrap_repeats : int, default 1000
        Number of resamples (drawn with replacement, same size as the input).
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global random state, so
        ``np.random.seed`` controls the result. Any other value is passed to
        ``np.random.default_rng`` for a self-contained, reproducible stream.

    Returns
    -------
    all_sample_rmse : float
        RMSE computed on all samples.
    err : float
        Standard deviation (ddof=0) of the finite bootstrap RMSE values;
        ``nan`` when there are none (e.g. ``bootstrap_repeats == 0``).

    Raises
    ------
    ValueError
        If the input is empty or contains NaN/infinite values.
    """
    # Work on plain arrays. Indexing a pandas Series that has an integer index
    # with an integer array is label-based, so positional bootstrap indices
    # would pick the wrong rows (or raise KeyError) whenever the index is not
    # exactly 0..n-1, e.g. after dropna() or filtering.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    assert x.ndim == y.ndim == 1, "x and y must be 1D array"
    assert x.shape[0] == y.shape[0], "x and y does not have the same len"

    # Validate once up front instead of on every bootstrap iteration.
    if x.shape[0] == 0:
        raise ValueError("x and y must contain at least one sample")
    if not (np.isfinite(x).all() and np.isfinite(y).all()):
        raise ValueError("x and y must not contain NaN or infinity")

    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    # The squared errors do not depend on the resample, so compute them once
    # and only re-average inside the loop.
    squared_err = (x - y) ** 2
    all_sample_rmse = np.sqrt(squared_err.mean())
    nsamples = x.shape[0]

    rmses_bootstrap = np.empty(bootstrap_repeats)
    for i in range(bootstrap_repeats):
        rand_idx = sampler.choice(nsamples, size=nsamples, replace=True)
        rmses_bootstrap[i] = np.sqrt(squared_err[rand_idx].mean())

    # Guard against overflow producing inf in a replicate.
    rmses_bootstrap = rmses_bootstrap[np.isfinite(rmses_bootstrap)]
    err = rmses_bootstrap.std() if rmses_bootstrap.size else np.nan
    return all_sample_rmse, err


def hit_rate_cal(x, threshold, bootstrap_repeats=1000, random_state=None):
    """Fraction of values that are ``<= threshold`` with a bootstrap error.

    Parameters
    ----------
    x : array-like
        1D values (NumPy array, pandas Series, list, ...). NaN entries compare
        as ``False`` and therefore count as misses.
    threshold : float
        A value counts as a hit when it is less than or equal to this.
    bootstrap_repeats : int, default 1000
        Number of resamples (drawn with replacement, same size as the input).
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global random state, so
        ``np.random.seed`` controls the result. Any other value is passed to
        ``np.random.default_rng`` for a self-contained, reproducible stream.

    Returns
    -------
    all_sample_rate : float
        Hit rate computed on all samples.
    err : float
        Standard deviation (ddof=0) of the finite bootstrap hit rates;
        ``nan`` when there are none.
    """
    # Work on a plain array. Indexing a pandas Series that has an integer
    # index with an integer array is label-based, so positional bootstrap
    # indices would pick the wrong rows (or raise KeyError) whenever the index
    # is not exactly 0..n-1.
    x = np.asarray(x)
    assert x.ndim == 1, "x must be 1D array"

    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    # The hit/miss flag of each sample does not depend on the resample, so
    # evaluate the comparison once and only re-average inside the loop.
    hits = x <= threshold
    all_sample_rate = hits.mean()
    nsamples = x.shape[0]

    rates_bootstrap = []
    for _ in range(bootstrap_repeats):
        rand_idx = sampler.choice(nsamples, size=nsamples, replace=True)
        rate = hits[rand_idx].mean()
        if np.isfinite(rate):
            rates_bootstrap.append(rate)

    rates_bootstrap = np.array(rates_bootstrap)
    err = rates_bootstrap.std() if rates_bootstrap.size else np.nan
    return all_sample_rate, err

# Pearson's R and RMSE

In [None]:
# Workbook that holds both the free-energy sheet and the RMSD sheet read below.
EXCEL_PATH = "Docking_RMSD_results_27052021.xlsx"

# Read the free-energy sheet, then report its shape and preview the first rows.
df_fe = pd.read_excel(
    io=EXCEL_PATH,
    sheet_name="free_energy_Hai",
)
print(df_fe.shape)
display_df(df_fe.head())

In [None]:
# Column holding the experimental reference values.
exp_fe_col = "Experimental"

# Metadata columns plus the experimental reference; every other column of
# df_fe is treated as a docking free-energy column.
# NOTE(review): the next cell repeats these three definitions verbatim.
excl_cols = ["Target Name", "order_num", "PDB ID", "Ki", "Experimental"]
fe_cols = [name for name in df_fe.columns if name not in excl_cols]

In [None]:
# Column holding the experimental reference values.
exp_fe_col = "Experimental"

# Metadata columns plus the experimental reference; every other column of
# df_fe is treated as a docking free-energy column.
excl_cols = ["Target Name", "order_num", "PDB ID", "Ki", "Experimental"]
fe_cols = [name for name in df_fe.columns if name not in excl_cols]

# Per-column results, keyed by docking column name.
r, r_err, rmse, rmse_err = {}, {}, {}, {}

for col in fe_cols:
    print(col)
    # Correlation first, RMSE second: both helpers draw their bootstrap
    # resamples from NumPy's global random state.
    r[col], r_err[col] = pearson_r_and_err(df_fe[exp_fe_col], df_fe[col])
    rmse[col], rmse_err[col] = rmse_and_err(df_fe[exp_fe_col], df_fe[col])

# One row per docking column; the dict keys become the "docking" column.
df_r_rmse = (
    pd.DataFrame({"R": r, "R_error": r_err, "RMSE": rmse, "RMSE_error": rmse_err})
    .reset_index()
    .rename(columns={"index": "docking"})
)
df_r_rmse.to_csv("r_rmse.csv", index=False)

In [None]:
# Show the R / RMSE summary table assembled in the previous cell.
display_df(df_r_rmse)

# Docking success rate

In [None]:
# Read the RMSD sheet of the same workbook, then report its shape and preview
# the first rows.
df_rmsd = pd.read_excel(
    io=EXCEL_PATH,
    sheet_name="rmsd_Hai",
)
print(df_rmsd.shape)
display_df(df_rmsd.head())

In [None]:
# Metadata columns; every other column of df_rmsd is treated as a docking
# RMSD column.
excl_cols = ["Target Name", "order_num", "PDB ID"]
rmsd_cols = [name for name in df_rmsd.columns if name not in excl_cols]

# RMSD thresholds at or below which a pose counts as a hit.
cut_offs = [0.2, 0.15, 0.1, 0.05]

# Collected column-wise as a dict first; converted to a DataFrame at the end.
df_hit_rate = {"docking": rmsd_cols}

for cut_off in cut_offs:
    suffix = "_%0.2f" % cut_off
    print(suffix)

    hit_rate, rate_err = [], []
    for col in rmsd_cols:
        rate, err = hit_rate_cal(df_rmsd[col], cut_off)
        hit_rate.append(rate)
        rate_err.append(err)

    # Two result columns per cut-off: the rate and its bootstrap error.
    df_hit_rate.update({"rate" + suffix: hit_rate, "error" + suffix: rate_err})

df_hit_rate = pd.DataFrame(df_hit_rate)
df_hit_rate.to_csv("hit_rate.csv", index=False)

In [None]:
# Show the hit-rate table (one rate/error column pair per cut-off) built in
# the previous cell.
display_df(df_hit_rate)

# Supporting info

## Free energy 800

In [5]:
# Read the "Free_Energy_800" sheet and preview its first rows.
df_fe_800 = pd.read_excel(
    io="Supporting_Information.xlsx",
    sheet_name="Free_Energy_800",
)
df_fe_800.head()

Unnamed: 0,Target Name,order_num,PDB ID,Experiment,Default,gauss1_+50,gauss1_+40,gauss1_+30,gauss1_+20,gauss1_+10,...,rotation_-30,rotation_-40,rotation_-50,rotation_-60,rotation_-70,rotation_-80,rotation_-90,set1,set2,set3
0,3-Dehydroquinate Dehydratase,1,1GU1,-6.207972,-5.3,-6.3,-6.1,-5.9,-5.7,-5.5,...,-5.5,-5.6,-5.6,-5.7,-5.8,-5.9,-5.9,-9.4,-8.8,-13.5
1,3-Dehydroquinate Dehydratase,2,1V1J,-6.621157,-6.3,-7.3,-7.1,-6.9,-6.7,-6.5,...,-6.5,-6.6,-6.7,-6.8,-6.9,-7.0,-7.1,-10.5,-9.8,-14.7
2,3-Dehydroquinate Dehydratase,3,2C4V,-3.571512,-4.8,-5.8,-5.6,-5.4,-5.2,-5.0,...,-5.2,-5.3,-5.5,-5.6,-5.8,-6.0,-6.1,-9.4,-9.2,-14.0
3,3-Dehydroquinate Dehydratase,4,2C4W,-6.44967,-8.9,-10.2,-10.0,-9.7,-9.4,-9.2,...,-9.3,-9.5,-9.6,-9.8,-10.0,-10.1,-10.3,-15.2,-14.5,-21.6
4,3-Dehydroquinate Dehydratase,5,2XD9,-8.861225,-8.0,-9.5,-9.2,-8.9,-8.6,-8.3,...,-8.4,-8.6,-8.7,-8.9,-9.1,-9.2,-9.4,-13.9,-13.1,-19.6


In [6]:
# Column holding the experimental reference values.
exper_col = "Experiment"

# Metadata columns plus the experimental reference; every other column of
# df_fe_800 is treated as a docking free-energy column.
non_docking_cols = ["Target Name", "order_num", "PDB ID", "Experiment"]
docking_cols = [name for name in df_fe_800.columns if name not in non_docking_cols]

# Per-column results, keyed by docking column name.
r, r_err, rmse, rmse_err = {}, {}, {}, {}

for col in docking_cols:
    print(col)
    # Correlation first, RMSE second: both helpers draw their bootstrap
    # resamples from NumPy's global random state.
    r[col], r_err[col] = pearson_r_and_err(df_fe_800[exper_col], df_fe_800[col])
    rmse[col], rmse_err[col] = rmse_and_err(df_fe_800[exper_col], df_fe_800[col])

# One row per docking column; the dict keys become the "docking" column.
df_res_fe_800 = (
    pd.DataFrame({"R": r, "R_error": r_err, "RMSE": rmse, "RMSE_error": rmse_err})
    .reset_index()
    .rename(columns={"index": "docking"})
)
df_res_fe_800.to_csv("free_energy_800.csv", index=False)

Default
gauss1_+50
gauss1_+40
gauss1_+30
gauss1_+20
gauss1_+10
gauss1_-10
gauss1_-20
gauss1_-30
gauss1_-40
gauss1_-50
gauss2_+150
gauss2_+140
gauss2_+130
gauss2_+120
gauss2_+110
gauss2_+100
gauss2_+90
gauss2_+80
gauss2_+70
gauss2_+60
gauss2_+50
gauss2_+40
gauss2_+30
gauss2_+20
gauss2_+10
gauss2_-10
gauss2_-20
gauss2_-30
gauss2_-40
gauss2_-50
repulsion_+50
repulsion_+40
repulsion_+30
repulsion_+20
repulsion_+10
repulsion_-10
repulsion_-20
repulsion_-30
repulsion_-40
repulsion_-50
hydrophobic_+50
hydrophobic_+40
hydrophobic_+30
hydrophobic_+20
hydrophobic_+10
hydrophobic_-10
hydrophobic_-20
hydrophobic_-30
hydrophobic_-40
hydrophobic_-50
hydrogenbond_+50
hydrogenbond_+40
hydrogenbond_+30
hydrogenbond_+20
hydrogenbond_+10
hydrogenbond_-10
hydrogenbond_-20
hydrogenbond_-30
hydrogenbond_-40
hydrogenbond_-50
rotation_+50
rotation_+40
rotation_+30
rotation_+20
rotation_+10
rotation_-10
rotation_-20
rotation_-30
rotation_-40
rotation_-50
rotation_-60
rotation_-70
rotation_-80
rotation_-90
set1

In [9]:
# Display the per-column R / RMSE summary computed from df_fe_800.
df_res_fe_800

Unnamed: 0,docking,R,R_error,RMSE,RMSE_error
0,Default,0.485354,0.025977,2.806549,0.066951
1,gauss1_+50,0.488181,0.025468,2.474444,0.055946
2,gauss1_+40,0.484847,0.026840,2.489744,0.058233
3,gauss1_+30,0.489706,0.025907,2.522166,0.059514
4,gauss1_+20,0.487369,0.026189,2.596595,0.062548
...,...,...,...,...,...
73,rotation_-80,0.529755,0.026067,2.630459,0.064319
74,rotation_-90,0.522574,0.025018,2.812188,0.074029
75,set1,0.566961,0.025289,5.678618,0.101126
76,set2,0.570922,0.023568,5.558170,0.101029


## Free energy 1315

In [7]:
# Read the "Free_Energy_1315" sheet and preview its first rows.
df_fe_1315 = pd.read_excel(
    io="Supporting_Information.xlsx",
    sheet_name="Free_Energy_1315",
)
df_fe_1315.head()

Unnamed: 0,PDB_ID,Experiment,Default,set1,set2,set3
0,10gs,-8.759529,-8.4,-17.3,-16.6,-26.7
1,1a28,-11.353307,-11.5,-17.3,-17.1,-24.4
2,1a30,-5.888614,-7.3,-14.0,-13.7,-22.1
3,1a4w,-8.106294,-9.4,-17.8,-17.5,-29.3
4,1a69,-7.257731,-8.4,-14.4,-12.8,-19.7


In [8]:
# Column holding the experimental reference values.
exper_col = "Experiment"

# Identifier column plus the experimental reference; every other column of
# df_fe_1315 is treated as a docking free-energy column.
non_docking_cols = ["PDB_ID", "Experiment"]
docking_cols = [name for name in df_fe_1315.columns if name not in non_docking_cols]

# Per-column results, keyed by docking column name.
r, r_err, rmse, rmse_err = {}, {}, {}, {}

for col in docking_cols:
    print(col)
    # Correlation first, RMSE second: both helpers draw their bootstrap
    # resamples from NumPy's global random state.
    r[col], r_err[col] = pearson_r_and_err(df_fe_1315[exper_col], df_fe_1315[col])
    rmse[col], rmse_err[col] = rmse_and_err(df_fe_1315[exper_col], df_fe_1315[col])

# One row per docking column; the dict keys become the "docking" column.
df_res_fe_1315 = (
    pd.DataFrame({"R": r, "R_error": r_err, "RMSE": rmse, "RMSE_error": rmse_err})
    .reset_index()
    .rename(columns={"index": "docking"})
)
df_res_fe_1315.to_csv("free_energy_1315.csv", index=False)

Default
set1
set2
set3


In [10]:
# Display the per-column R / RMSE summary computed from df_fe_1315.
df_res_fe_1315

Unnamed: 0,docking,R,R_error,RMSE,RMSE_error
0,Default,0.55174,0.017998,2.397581,0.045097
1,set1,0.606453,0.01598,7.779162,0.086786
2,set2,0.615012,0.014983,6.941543,0.084427
3,set3,0.59475,0.016109,16.383578,0.164652


## RMSD_800

In [11]:
# Read the "RMSD_800" sheet and preview its first rows.
df_rmsd_800 = pd.read_excel(
    io="Supporting_Information.xlsx",
    sheet_name="RMSD_800",
)
df_rmsd_800.head()

Unnamed: 0,Target Name,order_num,PDB ID,Default,gauss1_+50,gauss1_+40,gauss1_+30,gauss1_+20,gauss1_+10,gauss1_-10,...,rotation_-30,rotation_-40,rotation_-50,rotation_-60,rotation_-70,rotation_-80,rotation_-90,set1,set2,set3
0,3-Dehydroquinate Dehydratase,1,1GU1,0.182952,0.174177,0.16421,0.178053,0.178581,0.182607,0.170328,...,0.167437,0.16404,0.16933,0.168015,0.168456,0.179251,0.167454,0.175795,0.182423,0.191385
1,3-Dehydroquinate Dehydratase,2,1V1J,0.202078,0.213343,0.1904,0.215644,0.203765,0.200243,0.204052,...,0.20103,0.207443,0.188692,0.189532,0.200681,0.200563,0.200646,0.202095,0.216135,0.191572
2,3-Dehydroquinate Dehydratase,3,2C4V,0.196072,0.180677,0.195974,0.195795,0.19579,0.195458,0.195447,...,0.195806,0.196183,0.195774,0.195539,0.196001,0.195379,0.195381,0.200506,0.198159,0.197525
3,3-Dehydroquinate Dehydratase,4,2C4W,0.438471,0.438787,0.438804,0.438686,0.438656,0.438603,0.438356,...,0.438556,0.438509,0.438497,0.438508,0.438493,0.438613,0.438587,0.440517,0.447499,0.44008
4,3-Dehydroquinate Dehydratase,5,2XD9,0.341358,0.344948,0.343198,0.338572,0.343073,0.339445,0.345164,...,0.336448,0.342064,0.343802,0.339657,0.351503,0.341211,0.348791,0.346476,0.343711,0.350228


In [12]:
# Metadata columns; every other column of df_rmsd_800 is treated as a docking
# RMSD column.
non_docking_cols = ["Target Name", "order_num", "PDB ID"]
docking_cols = [name for name in df_rmsd_800.columns if name not in non_docking_cols]

# RMSD thresholds at or below which a pose counts as a hit.
cut_offs = [0.2, 0.15, 0.1, 0.05]

# Collected column-wise as a dict first; converted to a DataFrame at the end.
df_res_rmsd_800 = {"docking": docking_cols}

for cut_off in cut_offs:
    suffix = "_%0.2f" % cut_off
    print(suffix)

    hit_rate, rate_err = [], []
    for col in docking_cols:
        rate, err = hit_rate_cal(df_rmsd_800[col], cut_off)
        hit_rate.append(rate)
        rate_err.append(err)

    # Two result columns per cut-off: the rate and its bootstrap error.
    df_res_rmsd_800.update({"rate" + suffix: hit_rate, "error" + suffix: rate_err})

df_res_rmsd_800 = pd.DataFrame(df_res_rmsd_800)
df_res_rmsd_800.to_csv("rmsd_800.csv", index=False)

_0.20
_0.15
_0.10
_0.05


In [13]:
# Preview the first rows of the hit-rate table computed from df_rmsd_800.
df_res_rmsd_800.head()

Unnamed: 0,docking,rate_0.20,error_0.20,rate_0.15,error_0.15,rate_0.10,error_0.10,rate_0.05,error_0.05
0,Default,0.79375,0.014569,0.66,0.016968,0.425,0.017687,0.16875,0.012726
1,gauss1_+50,0.805,0.014276,0.6725,0.017666,0.43875,0.017363,0.1875,0.013763
2,gauss1_+40,0.79875,0.014032,0.67,0.016764,0.425,0.017113,0.185,0.013625
3,gauss1_+30,0.8,0.014293,0.6625,0.016545,0.42875,0.016962,0.18375,0.013897
4,gauss1_+20,0.79625,0.014233,0.6575,0.016437,0.435,0.017824,0.18,0.013593


## RMSD_1315

In [14]:
# Read the "RMSD_1315" sheet and preview its first rows.
df_rmsd_1315 = pd.read_excel(
    io="Supporting_Information.xlsx",
    sheet_name="RMSD_1315",
)
df_rmsd_1315.head()

Unnamed: 0,PDB_ID,Experiment,Default,set1,set2,set3
0,10gs,-8.759529,0.278647,0.282589,0.279649,0.279558
1,1a28,-11.353307,0.001043,0.000534,0.000164,0.006585
2,1a30,-5.888614,0.200334,0.223036,0.206451,0.23111
3,1a4w,-8.106294,0.130168,0.197603,0.112745,0.336142
4,1a69,-7.257731,0.043026,0.044169,0.014555,0.047293


In [15]:
# Identifier and experimental columns; every other column of df_rmsd_1315 is
# treated as a docking RMSD column.
non_docking_cols = ["PDB_ID", "Experiment"]
docking_cols = [name for name in df_rmsd_1315.columns if name not in non_docking_cols]

# RMSD thresholds at or below which a pose counts as a hit.
cut_offs = [0.2, 0.15, 0.1, 0.05]

# Collected column-wise as a dict first; converted to a DataFrame at the end.
df_res_rmsd_1315 = {"docking": docking_cols}

for cut_off in cut_offs:
    suffix = "_%0.2f" % cut_off
    print(suffix)

    hit_rate, rate_err = [], []
    for col in docking_cols:
        rate, err = hit_rate_cal(df_rmsd_1315[col], cut_off)
        hit_rate.append(rate)
        rate_err.append(err)

    # Two result columns per cut-off: the rate and its bootstrap error.
    df_res_rmsd_1315.update({"rate" + suffix: hit_rate, "error" + suffix: rate_err})

df_res_rmsd_1315 = pd.DataFrame(df_res_rmsd_1315)
df_res_rmsd_1315.to_csv("rmsd_1315.csv", index=False)

_0.20
_0.15
_0.10
_0.05


In [16]:
# Display the hit-rate table computed from df_rmsd_1315.
df_res_rmsd_1315

Unnamed: 0,docking,rate_0.20,error_0.20,rate_0.15,error_0.15,rate_0.10,error_0.10,rate_0.05,error_0.05
0,Default,0.843346,0.009484,0.742205,0.011945,0.542966,0.01416,0.278327,0.012621
1,set1,0.793916,0.011133,0.673004,0.012853,0.496578,0.013882,0.244106,0.011411
2,set2,0.763498,0.011712,0.659316,0.013302,0.472243,0.013913,0.225856,0.011632
3,set3,0.676046,0.013063,0.558935,0.012901,0.380989,0.012775,0.193916,0.010629


# Supporting info 2 v2

In [11]:
# Read the "Free_Energy_800" sheet of the second workbook and drop every row
# that has a missing value.
# reset_index(drop=True) renumbers the surviving rows 0..n-1. Without it,
# dropna() can leave gaps in the integer index, and the bootstrap helpers
# (which index their inputs with positions 0..n-1) would then select by label
# and hit missing keys when given columns of this frame.
df_fe_800 = (
    pd.read_excel("Supporting_Information_2_v2.xlsx", sheet_name="Free_Energy_800")
    .dropna()
    .reset_index(drop=True)
)
df_fe_800.head()

Unnamed: 0,Target Name,order_num,PDB ID,Experiment,Default,gauss1_+50,gauss1_+40,gauss1_+30,gauss1_+20,gauss1_+10,...,set3,set4,set5,set6,set7,set8,set9,set10,set11,set12
0,3-Dehydroquinate Dehydratase,1.0,1GU1,-7.7,-5.3,-8.8,-8.6,-8.4,-8.1,-7.9,...,-11.8,-13.1,-14.5,-13.9,-13.2,-13.9,-14.6,-11.9,-10.1,-14.8
1,3-Dehydroquinate Dehydratase,2.0,1V1J,-8.2,-6.3,-9.3,-9.0,-8.8,-8.6,-8.4,...,-12.3,-13.9,-15.2,-14.5,-14.0,-14.5,-15.3,-12.5,-10.8,-15.7
2,3-Dehydroquinate Dehydratase,3.0,2C4V,-7.5,-4.8,-8.7,-8.4,-8.2,-8.0,-7.8,...,-13.0,-13.9,-15.7,-14.3,-13.9,-14.3,-15.8,-12.8,-10.5,-16.1
3,3-Dehydroquinate Dehydratase,4.0,2C4W,-9.3,-8.9,-10.8,-10.5,-10.2,-9.9,-9.6,...,-15.2,-17.7,-18.2,-17.2,-18.0,-17.5,-18.5,-15.5,-14.7,-21.8
4,3-Dehydroquinate Dehydratase,5.0,2XD9,-10.2,-8.0,-11.7,-11.4,-11.1,-10.8,-10.5,...,-16.0,-17.8,-19.1,-17.9,-18.2,-18.3,-19.5,-16.2,-14.4,-20.9


In [14]:
# Metadata columns plus the experimental reference; every other column of
# df_fe_800 is treated as a docking free-energy column.
non_docking_cols = ["Target Name", "order_num", "PDB ID", "Experiment", ]
exper_col = "Experiment"
docking_cols = [col for col in df_fe_800.columns if col not in non_docking_cols]

# Per-column results, keyed by docking column name.
r, r_err = {}, {}
rmse, rmse_err = {}, {}

# df_fe_800 went through dropna(), so its integer index may have gaps. Hand
# the helpers plain NumPy arrays so that their bootstrap resampling indexes by
# position; indexing a Series with the same integer array would be label-based.
exper_values = df_fe_800[exper_col].to_numpy()

for col in docking_cols:
    print(col)
    docking_values = df_fe_800[col].to_numpy()
    r[col], r_err[col] = pearson_r_and_err(exper_values, docking_values)
    rmse[col], rmse_err[col] = rmse_and_err(exper_values, docking_values)

# One row per docking column; the dict keys become the "docking" column.
df_res_fe_800 = pd.DataFrame({"R": r, "R_error": r_err, "RMSE": rmse, "RMSE_error": rmse_err})
df_res_fe_800 = df_res_fe_800.reset_index().rename(columns={"index": "docking"})
# NOTE(review): assumes the si_2_v2/ directory already exists; to_csv does not
# create missing directories.
df_res_fe_800.to_csv("si_2_v2/free_energy_800.csv", index=False)

Default
gauss1_+50
gauss1_+40
gauss1_+30
gauss1_+20
gauss1_+10
gauss1_-10
gauss1_-20
gauss1_-30
gauss1_-40
gauss1_-50
gauss2_+150
gauss2_+140
gauss2_+130
gauss2_+120
gauss2_+110
gauss2_+100
gauss2_+90
gauss2_+80
gauss2_+70
gauss2_+60
gauss2_+50
gauss2_+40
gauss2_+30
gauss2_+20
gauss2_+10
gauss2_-10
gauss2_-20
gauss2_-30
gauss2_-40
gauss2_-50
repulsion_+50
repulsion_+40
repulsion_+30
repulsion_+20
repulsion_+10
repulsion_-10
repulsion_-20
repulsion_-30
repulsion_-40
repulsion_-50
hydrophobic_+50
hydrophobic_+40
hydrophobic_+30
hydrophobic_+20
hydrophobic_+10
hydrophobic_-10
hydrophobic_-20
hydrophobic_-30
hydrophobic_-40
hydrophobic_-50
hydrogenbond_+50
hydrogenbond_+40
hydrogenbond_+30
hydrogenbond_+20
hydrogenbond_+10
hydrogenbond_-10
hydrogenbond_-20
hydrogenbond_-30
hydrogenbond_-40
hydrogenbond_-50
rotation_+50
rotation_+40
rotation_+30
rotation_+20
rotation_+10
rotation_-10
rotation_-20
rotation_-30
rotation_-40
rotation_-50
rotation_-60
rotation_-70
rotation_-80
rotation_-90
set1

## Free energy 1315

In [18]:
# Read the "Free_Energy_1315" sheet of the second workbook and preview its
# first rows.
df_fe_1315 = pd.read_excel(
    io="Supporting_Information_2_v2.xlsx",
    sheet_name="Free_Energy_1315",
)
df_fe_1315.head()

Unnamed: 0,PDB_ID,Experiment,Default,set1,set2,set3,set4,set5,set6,set7,set8,set9,set10,set11,set12
0,10gs,-8.759529,-8.4,-14.4,-21.8,-18.8,-19.5,-19.9,-16.5,-20.0,-17.0,-20.5,-17.3,-16.6,-26.7
1,1a28,-11.353307,-11.5,-15.3,-20.1,-16.5,-20.1,-20.2,-19.8,-20.7,-20.4,-20.8,-17.3,-17.1,-24.4
2,1a30,-5.888614,-7.3,-12.4,-18.3,-14.9,-15.7,-16.5,-13.8,-16.5,-14.0,-16.8,-14.0,-13.7,-22.1
3,1a4w,-8.106294,-9.4,-15.2,-23.5,-19.2,-20.4,-20.9,-17.5,-20.6,-17.7,-21.4,-17.8,-17.5,-29.3
4,1a69,-7.257731,-8.4,-11.8,-15.9,-14.4,-16.4,-17.6,-16.5,-16.4,-16.5,-17.6,-14.4,-12.8,-19.7


In [21]:
# Column holding the experimental reference values.
exper_col = "Experiment"

# Identifier column plus the experimental reference; every other column of
# df_fe_1315 is treated as a docking free-energy column.
non_docking_cols = ["PDB_ID", "Experiment"]
docking_cols = [name for name in df_fe_1315.columns if name not in non_docking_cols]

# Per-column results, keyed by docking column name.
r, r_err, rmse, rmse_err = {}, {}, {}, {}

for col in docking_cols:
    print(col)
    # Correlation first, RMSE second: both helpers draw their bootstrap
    # resamples from NumPy's global random state.
    r[col], r_err[col] = pearson_r_and_err(df_fe_1315[exper_col], df_fe_1315[col])
    rmse[col], rmse_err[col] = rmse_and_err(df_fe_1315[exper_col], df_fe_1315[col])

# One row per docking column; the dict keys become the "docking" column.
df_res_fe_1315 = (
    pd.DataFrame({"R": r, "R_error": r_err, "RMSE": rmse, "RMSE_error": rmse_err})
    .reset_index()
    .rename(columns={"index": "docking"})
)
# NOTE(review): assumes the si_2_v2/ directory already exists; to_csv does not
# create missing directories.
df_res_fe_1315.to_csv("si_2_v2/free_energy_1315.csv", index=False)

Default
set1
set2
set3
set4
set5
set6
set7
set8
set9
set10
set11
set12


## RMSD_800

In [22]:
# Read the "RMSD_800" sheet of the second workbook and preview its first rows.
df_rmsd_800 = pd.read_excel(
    io="Supporting_Information_2_v2.xlsx",
    sheet_name="RMSD_800",
)
df_rmsd_800.head()

Unnamed: 0,Target Name,order_num,PDB ID,Default,gauss1_+50,gauss1_+40,gauss1_+30,gauss1_+20,gauss1_+10,gauss1_-10,...,set3,set4,set5,set6,set7,set8,set9,set10,set11,set12
0,3-Dehydroquinate Dehydratase,1,1GU1,0.096764,0.096885,0.097147,0.096633,0.096741,0.097319,0.096373,...,0.097501,0.096798,0.019945,0.021482,0.096771,0.022198,0.0144,0.096426,0.096859,0.098046
1,3-Dehydroquinate Dehydratase,2,1V1J,0.093403,0.092391,0.09382,0.092618,0.092162,0.092642,0.092041,...,0.094158,0.092206,0.094182,0.093016,0.092589,0.094108,0.093311,0.093502,0.091743,0.092596
2,3-Dehydroquinate Dehydratase,3,2C4V,0.102908,0.052546,0.102962,0.052956,0.05466,0.053925,0.108501,...,0.053955,0.052678,0.054808,0.056616,0.053704,0.055358,0.053945,0.055977,0.123815,0.100009
3,3-Dehydroquinate Dehydratase,4,2C4W,0.021737,0.021088,0.021246,0.021485,0.021713,0.021697,0.022065,...,0.021213,0.02117,0.080352,0.080342,0.021098,0.08037,0.080408,0.080586,0.019675,0.020059
4,3-Dehydroquinate Dehydratase,5,2XD9,0.019759,0.017101,0.020782,0.018519,0.01921,0.021971,0.019779,...,0.021261,0.020242,0.022263,0.02225,0.020353,0.023801,0.01901,0.023054,0.022783,0.022683


In [23]:
# Metadata columns; every other column of df_rmsd_800 is treated as a docking
# RMSD column.
non_docking_cols = ["Target Name", "order_num", "PDB ID"]
docking_cols = [name for name in df_rmsd_800.columns if name not in non_docking_cols]

# RMSD thresholds at or below which a pose counts as a hit.
cut_offs = [0.2, 0.15, 0.1, 0.05]

# Collected column-wise as a dict first; converted to a DataFrame at the end.
df_res_rmsd_800 = {"docking": docking_cols}

for cut_off in cut_offs:
    suffix = "_%0.2f" % cut_off
    print(suffix)

    hit_rate, rate_err = [], []
    for col in docking_cols:
        rate, err = hit_rate_cal(df_rmsd_800[col], cut_off)
        hit_rate.append(rate)
        rate_err.append(err)

    # Two result columns per cut-off: the rate and its bootstrap error.
    df_res_rmsd_800.update({"rate" + suffix: hit_rate, "error" + suffix: rate_err})

df_res_rmsd_800 = pd.DataFrame(df_res_rmsd_800)
# NOTE(review): assumes the si_2_v2/ directory already exists; to_csv does not
# create missing directories.
df_res_rmsd_800.to_csv("si_2_v2/rmsd_800.csv", index=False)

_0.20
_0.15
_0.10
_0.05


## RMSD_1315

In [24]:
# Read the "RMSD_1315" sheet of the second workbook and preview its first rows.
df_rmsd_1315 = pd.read_excel(
    io="Supporting_Information_2_v2.xlsx",
    sheet_name="RMSD_1315",
)
df_rmsd_1315.head()

Unnamed: 0,PDB_ID,Experiment,Default,set1,set2,set3,set4,set5,set6,set7,set8,set9,set10,set11,set12
0,10gs,-8.759529,0.278647,0.294452,0.28382,0.28811,0.284091,0.274208,0.285081,0.283405,0.282067,0.282067,0.282589,0.279649,0.279558
1,1a28,-11.353307,0.001043,0.000985,0.000872,0.000959,0.001035,0.001115,0.001028,0.000772,0.000588,0.000588,0.000534,0.000164,0.006585
2,1a30,-5.888614,0.200334,0.217193,0.191071,0.22522,0.145097,0.228261,0.216722,0.194742,0.226895,0.226895,0.223036,0.206451,0.23111
3,1a4w,-8.106294,0.130168,0.174614,0.106217,0.190854,0.175746,0.19556,0.196704,0.19087,0.192203,0.192203,0.197603,0.112745,0.336142
4,1a69,-7.257731,0.043026,0.014757,0.014763,0.044672,0.045425,0.042779,0.042774,0.045249,0.04293,0.04293,0.044169,0.014555,0.047293


In [27]:
# Identifier and experimental columns; every other column of df_rmsd_1315 is
# treated as a docking RMSD column.
non_docking_cols = ["PDB_ID", "Experiment"]
docking_cols = [name for name in df_rmsd_1315.columns if name not in non_docking_cols]

# RMSD thresholds at or below which a pose counts as a hit.
cut_offs = [0.2, 0.15, 0.1, 0.05]

# Collected column-wise as a dict first; converted to a DataFrame at the end.
df_res_rmsd_1315 = {"docking": docking_cols}

for cut_off in cut_offs:
    suffix = "_%0.2f" % cut_off
    print(suffix)

    hit_rate, rate_err = [], []
    for col in docking_cols:
        rate, err = hit_rate_cal(df_rmsd_1315[col], cut_off)
        hit_rate.append(rate)
        rate_err.append(err)

    # Two result columns per cut-off: the rate and its bootstrap error.
    df_res_rmsd_1315.update({"rate" + suffix: hit_rate, "error" + suffix: rate_err})

df_res_rmsd_1315 = pd.DataFrame(df_res_rmsd_1315)
# NOTE(review): assumes the si_2_v2/ directory already exists; to_csv does not
# create missing directories.
df_res_rmsd_1315.to_csv("si_2_v2/rmsd_1315.csv", index=False)

_0.20
_0.15
_0.10
_0.05
