In [40]:
#Import libraries
import pandas as pd
import numpy as np
from rdkit import Chem
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [41]:
# Read the hold-out CSV into a pandas DataFrame
mapk14__hold_out_data = pd.read_csv(filepath_or_buffer='../data/mapk14_offdna.csv')

In [42]:
#Use preprocessing steps to ensure featurization is the same in the hold-out dataset 
#Generate Molecular Descriptors
from rdkit.Chem import Descriptors

def calculate_descriptors(smiles):
    """Compute five RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of one molecule.

    Returns
    -------
    dict or None
        Mapping with the keys 'MolWt', 'LogP', 'TPSA', 'NumHDonors' and
        'NumHAcceptors'. None is returned when `smiles` is not a string
        (for example a NaN coming from an empty CSV cell) or when RDKit
        cannot build a molecule from it.
    """
    # A missing cell reaches this function as float NaN or None. Passing that
    # to RDKit raises instead of following the "return None" path used for
    # unparsable SMILES, so reject non-string input up front.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    return {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol)
    }



# Apply descriptor calculation to every SMILES in the hold-out table
descriptors = mapk14__hold_out_data['smiles'].apply(calculate_descriptors)

# calculate_descriptors returns None when it cannot featurize a molecule.
# A None entry would either break the DataFrame construction below or leave
# rows that no longer line up with the kd values, so stop with an explicit
# message that names the offending SMILES.
failed_smiles = mapk14__hold_out_data.loc[descriptors.isna(), 'smiles'].tolist()
if failed_smiles:
    raise ValueError(f"Could not compute descriptors for SMILES: {failed_smiles}")

# Convert descriptors into a DataFrame (one row per molecule, one column per descriptor)
descriptors_df = pd.DataFrame(descriptors.tolist())

In [43]:
# Show which columns the hold-out table provides
mapk14__hold_out_data.columns

Index(['Unnamed: 0', 'smiles', 'molecule_hash', 'kd', 'smiles_a', 'smiles_b',
       'smiles_c'],
      dtype='object')

In [44]:
# Show which descriptor columns were generated
descriptors_df.columns

Index(['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors'], dtype='object')

In [45]:
# Feature matrix: one row per hold-out molecule, one column per descriptor
X = descriptors_df
print(X.columns)

# Descriptor names, in column order
X_features = ['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors']

# Work on the underlying numpy array from here on
X = X.to_numpy()

# Standardize every feature column.
# NOTE(review): this scaler is fitted on the hold-out molecules themselves.
# If the saved model was trained on features scaled with training-set
# statistics, that fitted scaler should be reused here instead — TODO confirm.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit(X).transform(X)


Index(['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors'], dtype='object')


In [46]:
# Scaled descriptors as a table indexed by SMILES, so every row can be traced
# back to its molecule
mapk14_hold_out_data_preds = pd.DataFrame(
    data=X_scaled,
    index=mapk14__hold_out_data["smiles"],
    columns=descriptors_df.columns,
)
mapk14_hold_out_data_preds

Unnamed: 0_level_0,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors
smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CC1=CC=C(C(NC2=CC=C(CN3CCN(C)CC3)C(C(F)(F)F)=C2)=O)C=C1C#CC4=CN=C5N4N=CC(CCC(O)=O)=C5,2.048324,1.20426,-0.437264,-2.07606,1.744203
CCCC1CCC(C(=O)NC2CCN(CC(=O)N3CCC[C@@H](C(=O)NC)C3)CC2)CC1,-1.268749,-0.868397,-1.672941,-2.07606,-0.761554
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cn[nH]c2C)cc1,-1.172461,-0.246256,0.310982,1.207011,-0.761554
CNC(=O)[C@H](CCc1ccccc1)NC(=O)c1ccc(CNC(=O)c2n[nH]c3ncccc23)cc1,-0.568138,-0.683076,1.058068,1.207011,0.073699
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2cnc(N)s2)CC1,-1.249842,-0.858818,0.394442,-0.434524,0.908951
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.879499,-0.650578,0.548612,-0.434524,0.073699
CNC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)c1ccc(CNC(=O)c2cnc(N)s2)cc1,-0.5401,-0.655592,0.903898,1.207011,0.908951
CNC(=O)[C@@H](Cc1cccc(Cl)c1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.325437,-0.717097,0.548612,-0.434524,0.073699
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cnn(Cc3ccccc3)c2)cc1,0.312308,0.839957,-0.318448,-0.434524,0.073699
CNC(=O)[C@H]1C[C@@H](NC(=O)[C@H](CCC2CCCCC2)NC(=O)c2ccc3ccccc3c2)C1,-0.976665,0.726437,-1.351271,-0.434524,-1.596806


In [47]:
# Load the previously saved Keras model from disk
model_mapk14 = load_model('../models/mapk14_model_1.h5')

In [48]:
# Model output: predicted enrichment for each hold-out molecule
enrichment_predictions = model_mapk14.predict(X_scaled)

# kd from in vitro testing in the hold-out set. It is converted to a plain
# array because an array is assigned to a column by position, whereas a Series
# would be aligned on the SMILES index of the predictions table.
kd_values = mapk14__hold_out_data["kd"]
kd_values2 = kd_values.values

# Attach both columns to the predictions table (predictions first, then kd)
mapk14_hold_out_data_preds["enrichment_predictions"] = enrichment_predictions
mapk14_hold_out_data_preds["kd"] = kd_values2

mapk14_hold_out_data_preds



Unnamed: 0_level_0,MolWt,LogP,TPSA,NumHDonors,NumHAcceptors,enrichment_predictions,kd
smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CC1=CC=C(C(NC2=CC=C(CN3CCN(C)CC3)C(C(F)(F)F)=C2)=O)C=C1C#CC4=CN=C5N4N=CC(CCC(O)=O)=C5,2.048324,1.20426,-0.437264,-2.07606,1.744203,0.476416,10.3
CCCC1CCC(C(=O)NC2CCN(CC(=O)N3CCC[C@@H](C(=O)NC)C3)CC2)CC1,-1.268749,-0.868397,-1.672941,-2.07606,-0.761554,0.462574,64900.0
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cn[nH]c2C)cc1,-1.172461,-0.246256,0.310982,1.207011,-0.761554,0.528493,277.0
CNC(=O)[C@H](CCc1ccccc1)NC(=O)c1ccc(CNC(=O)c2n[nH]c3ncccc23)cc1,-0.568138,-0.683076,1.058068,1.207011,0.073699,0.515568,20900.0
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2cnc(N)s2)CC1,-1.249842,-0.858818,0.394442,-0.434524,0.908951,0.498588,26800.0
CNC(=O)[C@@H](CC1CCCCC1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.879499,-0.650578,0.548612,-0.434524,0.073699,0.512955,27600.0
CNC(=O)[C@H](Cc1cccc(Cl)c1)NC(=O)c1ccc(CNC(=O)c2cnc(N)s2)cc1,-0.5401,-0.655592,0.903898,1.207011,0.908951,0.522531,2740.0
CNC(=O)[C@@H](Cc1cccc(Cl)c1)NC(=O)CC1CCN(C(=O)c2n[nH]c3ncccc23)CC1,-0.325437,-0.717097,0.548612,-0.434524,0.073699,0.522093,89000.0
CNC(=O)[C@H](CCC1CCCCC1)NC(=O)c1ccc(CNC(=O)c2cnn(Cc3ccccc3)c2)cc1,0.312308,0.839957,-0.318448,-0.434524,0.073699,0.523165,185.0
CNC(=O)[C@H]1C[C@@H](NC(=O)[C@H](CCC2CCCCC2)NC(=O)c2ccc3ccccc3c2)C1,-0.976665,0.726437,-1.351271,-0.434524,-1.596806,0.527858,2100.0


In [49]:
# Confirm the prediction and kd columns were added
mapk14_hold_out_data_preds.columns

Index(['MolWt', 'LogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
       'enrichment_predictions', 'kd'],
      dtype='object')

In [50]:
# To evaluate model, compare how well the model enrichment predictions correlate with Kd values for the molecules in the hold_out_test set
import scipy
from scipy import stats

# x: target enrichment scores predicted by the model
# y: binding affinity from in vitro testing (Kd)
x, y = (
    np.array(mapk14_hold_out_data_preds[column])
    for column in ("enrichment_predictions", "kd")
)

In [51]:
# Sanity check: both inputs to the correlation should be numpy arrays
print(type(x), type(y), sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [52]:
# Spearman rank correlation between predicted enrichment (x) and measured Kd (y)
res = stats.spearmanr(a=x, b=y)
res.statistic

-0.05393430099312452