# dGhydr offsets
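Create a DataFrame containing the absolute offsets between the experimental and calculated hydration free energies in FreeSolv, together with their combined uncertainties. Entries whose experimental reference is SAMPL4_Guthrie are excluded.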

In [1]:
import pandas as pd
import numpy as np

# Path to the FreeSolv database text file, relative to this notebook's directory.
data = '../../jupyter_notebook/database.txt'

# Load the FreeSolv database. Fields are separated by '; ' (semicolon + space);
# a multi-character separator needs the Python parsing engine.
df1 = pd.read_csv(data, sep='; ', engine='python')

# Experimental-reference label used for the SAMPL4_Guthrie entries in FreeSolv.
ref = 'SAMPL4_Guthrie'
# Name of the column holding each compound's experimental reference.
exp_ref_col = 'experimental reference (original or paper this value was taken from)'

# Training set: every compound whose experimental reference is not exactly
# SAMPL4_Guthrie. A boolean mask filters in one vectorised step, keeps the
# original row labels and column dtypes, and still returns a frame with all
# columns when no row matches (rebuilding from a list of rows would not).
df2 = df1.loc[df1[exp_ref_col] != ref]
df2

Unnamed: 0,compound id (and file prefix),SMILES,iupac name (or alternative if IUPAC is unavailable or not parseable by OEChem),experimental value (kcal/mol),experimental uncertainty (kcal/mol),Mobley group calculated value (GAFF) (kcal/mol),calculated uncertainty (kcal/mol),experimental reference (original or paper this value was taken from),calculated reference,text notes.
0,mobley_1017962,CCCCCC(=O)OC,methyl hexanoate,-2.49,0.6,-3.30,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
1,mobley_1019269,CCCCO,butan-1-ol,-4.72,0.6,-3.23,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
2,mobley_1034539,c1cc(c(cc1c2cc(c(c(c2Cl)Cl)Cl)Cl)Cl)Cl,"1,2,3,4-tetrachloro-5-(3,4-dichlorophenyl)benzene",-3.04,0.1,-1.08,0.04,10.1007/s10822-012-9568-8,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
3,mobley_1036761,C1CCC(CC1)N,cyclohexanamine,-4.59,0.6,-3.95,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
4,mobley_1046331,c1ccc(cc1)OC=O,phenyl formate,-3.82,0.6,-5.44,0.03,"J. Peter Guthrie, unpublished data, as provide...",10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
...,...,...,...,...,...,...,...,...,...,...
637,mobley_9913368,C(=C/Cl)\Cl,"(E)-1,2-dichloroethylene",-0.78,0.6,1.02,0.02,10.1021/ct050097l,10.1021/acs.jced.7b00104,"Renamed mobley_9913368 from (E)-1,2-dichloroet..."
638,mobley_9942801,CCc1ccc(cc1)C,1-ethyl-4-methyl-benzene,-0.95,0.6,-0.57,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
639,mobley_994483,CCBr,bromoethane,-0.74,0.6,0.49,0.02,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...
640,mobley_9974966,CC(C)SC(C)C,2-isopropylsulfanylpropane,-1.21,0.6,0.14,0.03,10.1021/ct050097l,10.1021/acs.jced.7b00104,Experimental uncertainty not presently availab...


In [2]:
# Pull the columns needed for the offset calculation out of the training set.
ID = df2.loc[:, 'compound id (and file prefix)']
exp_val = df2.loc[:, 'experimental value (kcal/mol)']
exp_err = df2.loc[:, 'experimental uncertainty (kcal/mol)']
calc_val = df2.loc[:, 'Mobley group calculated value (GAFF) (kcal/mol)']
calc_err = df2.loc[:, 'calculated uncertainty (kcal/mol)']

# Absolute offset |experimental - calculated| for every compound, computed
# column-wise instead of in a per-row Python loop.
abs_offset = (exp_val - calc_val).abs()

# Combine the experimental and calculated uncertainties in quadrature,
# rounded to two decimal places.
combined_err = ((exp_err**2 + calc_err**2) ** 0.5).round(2)

# Assemble the result table. .to_numpy() drops the row labels inherited from
# df2, so df3 gets a fresh 0..n-1 index.
df3 = pd.DataFrame({
    'ID': ID.to_numpy(),
    'dGhydr (kcal/mol)': abs_offset.to_numpy(),
    'uncertainty (kcal/mol)': combined_err.to_numpy(),
})
df3

Unnamed: 0,ID,dGhydr (kcal/mol),uncertainty (kcal/mol)
0,mobley_1017962,0.81,0.60
1,mobley_1019269,1.49,0.60
2,mobley_1034539,1.96,0.11
3,mobley_1036761,0.64,0.60
4,mobley_1046331,1.62,0.60
...,...,...,...
596,mobley_9913368,1.80,0.60
597,mobley_9942801,0.38,0.60
598,mobley_994483,1.23,0.60
599,mobley_9974966,1.35,0.60


In [3]:
# Destination of the offsets table, relative to this notebook's directory.
save_loc = '../datasets/dGhydr_offsets.csv'

# Write df3 to disk; index=False keeps the row index out of the CSV.
df3.to_csv(save_loc, index=False)
print('Completed writing all dGhydr offsets to CSV.')

Completed writing all dGhydr offsets to CSV.
