In [98]:
#Import libraries
import pandas as pd
import numpy as np
from rdkit import Chem
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [99]:
# Read the hold-out CSV into a DataFrame (path is relative to the notebook's working directory)
HOLD_OUT_CSV = '../data/ddr1_offdna.csv'
ddr1__hold_out_data = pd.read_csv(HOLD_OUT_CSV)

In [100]:
#Use preprocessing steps to ensure featurization is the same in the hold-out dataset 
#Generate Molecular Descriptors
from rdkit.Chem import Descriptors

def calculate_descriptors(smiles):
    """Compute five RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of one molecule.

    Returns
    -------
    dict or None
        Mapping with the keys 'MolWt', 'LogP', 'TPSA', 'NumHDonors' and
        'NumHAcceptors'. None is returned when `smiles` is not a string
        (e.g. a missing CSV cell that pandas read as NaN) or when RDKit
        cannot parse it into a molecule.
    """
    # Missing cells reach this function as float NaN / None. MolFromSmiles
    # expects a string, so treat them like any other unparsable input
    # instead of passing them through.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        return None

    return {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
    }



# Featurize every hold-out molecule; an entry is None where featurization failed.
descriptors = ddr1__hold_out_data['smiles'].apply(calculate_descriptors)

# Fail early and name the offending SMILES. A None entry would otherwise make
# the DataFrame constructor raise an unrelated-looking error (or build a
# malformed frame), and silently dropping rows would misalign the features
# with the 'kd' values that are attached by position later on.
failed_mask = descriptors.isna()
if failed_mask.any():
    bad_smiles = ddr1__hold_out_data.loc[failed_mask, 'smiles'].tolist()
    raise ValueError(
        f"Could not compute descriptors for {len(bad_smiles)} SMILES: {bad_smiles}"
    )

# One row per molecule (same order as the CSV), one column per descriptor.
descriptors_df = pd.DataFrame(descriptors.tolist())

In [101]:
# Inspect the columns of the hold-out frame ('smiles' and 'kd' are the ones this notebook reads).
ddr1__hold_out_data.columns

Index(['Unnamed: 0', 'smiles', 'molecule_hash', 'kd', 'smiles_a', 'smiles_b',
       'smiles_c'],
      dtype='object')

In [102]:
# Inspect the columns of the descriptor frame built from the SMILES strings.
descriptors_df.columns

Index(['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors'], dtype='object')

In [103]:
# Descriptor names, listed in the column order shown for descriptors_df
X_features = ['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors']
print(descriptors_df.columns)

# Plain NumPy feature matrix: one row per molecule, one column per descriptor
X = descriptors_df.to_numpy()

# Standardize every descriptor column.
# NOTE(review): this scaler is fitted on the hold-out rows themselves, so the
# scaling statistics come from this dataset rather than from the data the
# loaded model was trained on. Presumably the training-time scaler should be
# reused here instead; confirm against the training notebook.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)


Index(['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors'], dtype='object')


In [104]:
# Wrap the scaled features in a frame labelled by SMILES (rows) and descriptor name (columns)
ddr1__hold_out_data_preds = pd.DataFrame(
    data=X_scaled,
    columns=descriptors_df.columns,
    index=ddr1__hold_out_data["smiles"],
)
ddr1__hold_out_data_preds

Unnamed: 0_level_0,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors
smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CC1=CC=C(C(NC2=CC=C(CN3CCN(C)CC3)C(C(F)(F)F)=C2)=O)C=C1C#CC4=CN=C5N4N=CC(CCC(O)=O)=C5,0.942011,0.583636,-0.292605,-1.487755,1.784458
CC(C1=CN(CC(NC2=CC=C(NC3=NC=NC4=CC(OCC(O)=O)=CC(F)=C34)C=C2)=O)N=N1)C,-0.659574,-0.227512,1.598589,-0.338126,3.395207
CCCC1CCC(C(=O)NC2CCN(CC(=O)N3CCC[C@@H](C(=O)NC)C3)CC2)CC1,-1.233444,-1.058291,-1.27411,-1.487755,-0.631666
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cn[nH]c2C)cc1,-1.170295,-0.565441,0.301731,0.811503,-0.631666
CNC(=O)[C@H](CCc1ccccc1)NC(=O)c1ccc(CNC(=O)c2n[nH]c3ncccc23)cc1,-0.773959,-0.911483,0.895146,0.811503,0.173708
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2cnc(N)s2)CC1,-1.221045,-1.050702,0.368024,-0.338126,0.979083
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.97816,-0.885738,0.490482,-0.338126,0.173708
CNC(=O)[C@@H](Cc1cccc(Cl)c1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.614787,-0.938433,0.490482,-0.338126,0.173708
CNC(=O)[C@@H]1CCC[C@H](NC(=O)c2ccc(CNC(=O)c3cnc(N)s3)cc2)C1,-1.477929,-1.390347,0.772688,0.811503,0.979083
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cnn(Cc3ccccc3)c2)cc1,-0.19653,0.295041,-0.198229,-0.338126,0.173708


In [105]:
# Restore the previously saved DDR1 Keras model from its .h5 file
MODEL_PATH = '../models/ddr1_model_1.h5'
model_ddr1 = load_model(MODEL_PATH)

In [106]:
# Measured Kd values from the hold-out set. They are converted to a bare NumPy
# array because the target frame is indexed by SMILES: assigning the Series
# directly would align on index labels instead of on row position.
kd_values = ddr1__hold_out_data["kd"]
kd_values2 = kd_values.to_numpy()

# Model output (predicted enrichment) for the scaled hold-out features
enrichment_predictions = model_ddr1.predict(X_scaled)

# Attach both columns next to the features: predictions first, then Kd
ddr1__hold_out_data_preds["enrichment_predictions"] = enrichment_predictions
ddr1__hold_out_data_preds["kd"] = kd_values2

ddr1__hold_out_data_preds



Unnamed: 0_level_0,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors,enrichment_predictions,kd
smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CC1=CC=C(C(NC2=CC=C(CN3CCN(C)CC3)C(C(F)(F)F)=C2)=O)C=C1C#CC4=CN=C5N4N=CC(CCC(O)=O)=C5,0.942011,0.583636,-0.292605,-1.487755,1.784458,0.503574,0.0
CC(C1=CN(CC(NC2=CC=C(NC3=NC=NC4=CC(OCC(O)=O)=CC(F)=C34)C=C2)=O)N=N1)C,-0.659574,-0.227512,1.598589,-0.338126,3.395207,0.582965,64100.0
CCCC1CCC(C(=O)NC2CCN(CC(=O)N3CCC[C@@H](C(=O)NC)C3)CC2)CC1,-1.233444,-1.058291,-1.27411,-1.487755,-0.631666,0.500876,21800.0
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cn[nH]c2C)cc1,-1.170295,-0.565441,0.301731,0.811503,-0.631666,0.753996,8.4
CNC(=O)[C@H](CCc1ccccc1)NC(=O)c1ccc(CNC(=O)c2n[nH]c3ncccc23)cc1,-0.773959,-0.911483,0.895146,0.811503,0.173708,0.683056,16.2
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2cnc(N)s2)CC1,-1.221045,-1.050702,0.368024,-0.338126,0.979083,0.501565,8190.0
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.97816,-0.885738,0.490482,-0.338126,0.173708,0.559271,24100.0
CNC(=O)[C@@H](Cc1cccc(Cl)c1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.614787,-0.938433,0.490482,-0.338126,0.173708,0.564651,20400.0
CNC(=O)[C@@H]1CCC[C@H](NC(=O)c2ccc(CNC(=O)c3cnc(N)s3)cc2)C1,-1.477929,-1.390347,0.772688,0.811503,0.979083,0.786385,242.0
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cnn(Cc3ccccc3)c2)cc1,-0.19653,0.295041,-0.198229,-0.338126,0.173708,0.649595,8790.0


In [107]:
# To evaluate model, compare how well the model enrichment predictions correlate with Kd values for the molecules in the hold_out_test set
import scipy
from scipy import stats

# x: enrichment scores predicted by the model
# y: binding affinity (Kd) from in vitro testing
x, y = (
    np.array(ddr1__hold_out_data_preds[column])
    for column in ("enrichment_predictions", "kd")
)

In [108]:
# Sanity check: show the container type of both inputs, one per line
print(type(x), type(y), sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [109]:
# Spearman rank correlation between predicted enrichment (x) and measured Kd (y);
# only the correlation coefficient is displayed.
res = stats.spearmanr(a=x, b=y)
res.statistic

-0.11389655652187708