In [None]:
# The 10 selected MoAs, one per line
moas_to_use = [
    'Aurora kinase inhibitor',
    'tubulin polymerization inhibitor',
    'JAK inhibitor',
    'protein synthesis inhibitor',
    'HDAC inhibitor',
    'topoisomerase inhibitor',
    'PARP inhibitor',
    'ATPase inhibitor',
    'retinoid receptor agonist',
    'HSP inhibitor',
]

In [None]:
# Read the data table from CSV into `all_data`; the cells below filter, label and split it.
import pandas as pd
import numpy as np
all_data = pd.read_csv('../data_for_image_based_model/fl_data.csv')

In [None]:
# Drop the lines with DMSO
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
all_data = all_data[all_data.moa != 'dmso'].copy()
# Sanity check: no cell anywhere in the table may still hold the string 'dmso'.
assert 'dmso' not in all_data.values

In [None]:
# Switch on TensorFlow's NumPy-behaviour mode via np_config.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [None]:
# MoA name -> integer class id (0-9)
dictionary = {
    'ATPase inhibitor': 7,
    'Aurora kinase inhibitor': 0,
    'HDAC inhibitor': 4,
    'HSP inhibitor': 9,
    'JAK inhibitor': 2,
    'PARP inhibitor': 6,
    'protein synthesis inhibitor': 3,
    'retinoid receptor agonist': 8,
    'topoisomerase inhibitor': 5,
    'tubulin polymerization inhibitor': 1,
}

In [None]:
# change moa to classes
# Map every MoA name to its integer class id in one vectorised pass, addressing
# the column by name rather than by hard-coded position.
# NOTE(review): the previous loop read column position 10 and wrote position 11;
# this assumes position 10 is the 'moa' column -- confirm against the CSV header.
unknown_moas = set(all_data['moa']) - set(dictionary)
if unknown_moas:
    # Same failure mode as a plain dictionary lookup on an unmapped MoA.
    raise KeyError(f"MoA values without a class id: {sorted(map(str, unknown_moas))}")
all_data['classes'] = all_data['moa'].map(dictionary)

In [None]:
# add a 'digit' column holding each row's current position: 0, 1, ..., n_rows - 1
all_data['digit'] = list(range(all_data.shape[0]))

In [None]:
# shuffle the data: sample every row (frac = 1) in random order with a fixed seed
# (random_state = 1); the existing index labels are kept, no reset_index here.
all_data = all_data.sample(frac = 1, random_state = 1)

In [None]:
# compounds whose rows make up the test split
compound_test = [
    'CBK303888', 'CBK200949', 'CBK309016', 'CBK290799',
    'CBK308876', 'CBK289882', 'CBK308108', 'CBK290852',
    'CBK307956', 'CBK290529', 'CBK309488', 'CBK290717',
    'CBK277957', 'CBK290547', 'CBK291084', 'CBK289740',
    'CBK278064', 'CBK308819', 'CBK308608', 'CBK288281',
    'CBK309437', 'CBK289987', 'CBK278556G', 'CBK277961',
]
# guard against an accidentally dropped or added identifier
assert len(compound_test) == 24

In [None]:
# compounds whose rows make up the validation split
compound_valid = [
    'CBK309483', 'CBK200593', 'CBK278120', 'CBK309545',
    'CBK277955', 'CBK278047', 'CBK278016', 'CBK290481',
    'CBK290217', 'CBK041182', 'CBK271643', 'CBK041804',
    'CBK042154', 'CBK308980', 'CBK290803', 'CBK309258',
    'CBK290154', 'CBK308260', 'CBK017131', 'CBK290770',
    'CBK291055', 'CBK307747', 'CBK228145',
]
# guard against an accidentally dropped or added identifier
assert len(compound_valid) == 23

In [None]:
# get the compounds in training data
# Every compound that is in neither the test nor the validation list, kept in
# order of first appearance in all_data. A set gives constant-time membership
# tests, and dict.fromkeys de-duplicates while preserving order, so no list is
# concatenated and rescanned for each row.
held_out_compounds = set(compound_test) | set(compound_valid)
compound_train = [compound for compound in dict.fromkeys(all_data.compound.tolist())
                  if compound not in held_out_compounds]

# The three lists together must account for every distinct compound exactly once.
assert len(compound_train) + len(compound_valid) + len(compound_test) == len(set(all_data.compound.to_list()))

In [None]:
# get the train, valid and test set
# Each split keeps the rows whose compound is in the matching list and is
# re-indexed from 0 so that positional .iloc lookups line up with the index.
train, valid, test = (
    all_data[all_data['compound'].isin(split_compounds)].reset_index(drop = True)
    for split_compounds in (compound_train, compound_valid, compound_test)
)

In [None]:
# get the dictionary for compound_id-SMILES pair
import pickle
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The with-block closes the file handle as soon as loading finishes.
with open("../data_for_image_based_model/dictionary2.pickle", "rb") as pickle_file:
    compound_smiles_dictionary = pickle.load(pickle_file)

In [None]:
# On-the-fly data augmentation.
# Training pipeline, in order: Flip, ShiftScaleRotate (scale_limit = 0.2) and RandomRotate90;
# then five OneOf groups over those same three transforms with p = 0.2, 0.4, 0.5, 0.6 and 0.8;
# then Flip, ShiftScaleRotate and RandomRotate90 once more.
import albumentations as A
train_transforms = A.Compose([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),
    A.OneOf([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),],p = 0.2),
    A.OneOf([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),],p = 0.4),
    A.OneOf([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),],p = 0.5),
    A.OneOf([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),],p = 0.6),
    A.OneOf([A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),],p = 0.8),
    A.Flip(),A.ShiftScaleRotate(scale_limit=0.2),A.RandomRotate90(),])
# Validation pipeline: an empty Compose, i.e. no transforms are listed.
# It is also the pipeline applied to test images in the evaluation cells.
valid_transforms = A.Compose([])

In [None]:
# get all images
import numpy as np
# An empty argument list to open() is a SyntaxError, which stops the whole
# notebook from parsing. Keep the placeholder in a variable instead.
all_images_path = ''  # The path has to be set
# np.load opens and closes the file itself when given a path.
all_images = np.load(all_images_path)

In [None]:
# data generator for training 
def get_train_image(end):
  """Yield ``(image, target)`` pairs for the first ``end`` rows of ``train``.

  The image is taken from ``all_images`` at the position of the row's ``digit``
  within ``all_data``, passed through ``train_transforms``, and paired with the
  row's integer class label.

  Args:
    end: number of rows of ``train`` to iterate over, starting at row 0.

  Yields:
    (image, target): the augmented image and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``valid`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant lookups: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  # digit -> first position in all_data, i.e. the value list.index() would return.
  digit_position = {}
  for position, digit in enumerate(all_data.digit.tolist()):
    digit_position.setdefault(digit, position)

  start = 0
  while start < end:
    row = train.iloc[start]

    assert row['digit'] in train_digits
    assert row['compound'] in train_compounds

    assert row['digit'] not in valid_digits
    assert row['compound'] not in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    image = all_images[digit_position[row['digit']]]
    image = train_transforms(image = image)['image']
    target = int(row['classes'])

    yield image, target
    start += 1

In [None]:
# data generator for validation   
def get_valid_image(end):
  """Yield ``(image, target)`` pairs for the first ``end`` rows of ``valid``.

  The image is taken from ``all_images`` at the position of the row's ``digit``
  within ``all_data``, passed through ``valid_transforms``, and paired with the
  row's integer class label.

  Args:
    end: number of rows of ``valid`` to iterate over, starting at row 0.

  Yields:
    (image, target): the image and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``train`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant lookups: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  # digit -> first position in all_data, i.e. the value list.index() would return.
  digit_position = {}
  for position, digit in enumerate(all_data.digit.tolist()):
    digit_position.setdefault(digit, position)

  start = 0
  while start < end:
    row = valid.iloc[start]

    assert row['digit'] not in train_digits
    assert row['compound'] not in train_compounds

    assert row['digit'] in valid_digits
    assert row['compound'] in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    image = all_images[digit_position[row['digit']]]
    image = valid_transforms(image = image)['image']
    target = int(row['classes'])

    yield image, target
    start += 1

In [None]:
# Shared settings for the tf.data pipelines and the model input below.
batch_size = 32   # samples per batch in every Dataset.batch() call
channels = 5      # last dimension of the image tensor shape
image_size = 256  # height and width of the image tensor shape

In [None]:
# turn to tensorflow datasets
# Each generator is wrapped with the element types (float32 image, int32 label) and
# shapes (image_size x image_size x channels image, scalar label) given positionally.
# `args` passes the split's row count to the generator as its `end` parameter.
# NOTE(review): recent TensorFlow releases prefer `output_signature` over the
# positional types/shapes arguments -- confirm against the installed version.
import tensorflow as tf
train_data = tf.data.Dataset.from_generator(get_train_image,
            (tf.float32, tf.int32),
            ((tf.TensorShape([image_size, image_size, channels])), tf.TensorShape([])),
            args = [train.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

valid_data = tf.data.Dataset.from_generator(get_valid_image,
            (tf.float32, tf.int32),
            ((tf.TensorShape([image_size, image_size, channels])), tf.TensorShape([])),
            args = [valid.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

In [None]:
# we choose efficientnet b1 as the base model
# include_top = False leaves out the built-in classification head (a custom one is
# attached in a later cell); weights = None means no pretrained weights are loaded.
base_model = tf.keras.applications.EfficientNetB1(input_shape = (image_size, image_size, channels),
                                  include_top = False, weights = None)

In [None]:
import tensorflow_addons as tfa
from tensorflow.keras import layers, models, optimizers, regularizers
from keras.layers import Dropout

In [None]:
# complete the architecture of efficientnet b1
# Head on top of the base model: Dropout -> GlobalAveragePooling2D -> Dropout -> Dense(softmax).
drop = 0.30  # dropout rate used by both Dropout layers below
num_classes = len(set(train['classes'].tolist()))  # number of distinct class labels in the training split
x = base_model.output
x = Dropout(drop)(x)
x = layers.GlobalAveragePooling2D()(x)
x = Dropout(drop)(x)
# The output layer applies softmax, so the model emits class probabilities, not logits.
preds = layers.Dense(num_classes, activation = 'softmax',                     
    kernel_regularizer = regularizers.L1L2(l1 = 1e-4, l2 = 1e-3),
    bias_regularizer = regularizers.L2(1e-3),
    activity_regularizer = regularizers.L2(1e-4))(x)
cnn_model = models.Model(inputs = base_model.input, outputs = preds)

In [None]:
# set the optimizer of efficientnet b1: AdamW from tensorflow_addons with
# weight decay 1e-6 and an initial learning rate of 1e-3
cnn_optimizer = tfa.optimizers.AdamW(weight_decay = 1e-6, learning_rate = 0.001, beta_1 = 0.9,
    beta_2 = 0.999, epsilon = 1e-07,)

In [None]:
# compile the model
# cnn_model's final Dense layer already applies softmax, so the loss receives
# probabilities. from_logits must therefore be False; True would tell Keras to
# apply softmax a second time.
cnn_model.compile(optimizer = cnn_optimizer,
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
         metrics = ['accuracy'])

In [None]:
# set the class weights
from sklearn.utils import class_weight
# 'balanced' weights computed from the label distribution of the training split
train_class_labels = np.unique(train.classes)
class_weights = class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = train_class_labels,
    y = train.classes,
)
# class label -> weight, in the form expected by the class_weight argument of fit()
weight_dictionary = {label: weight for label, weight in zip(train_class_labels, class_weights)}

In [None]:
# set the check point
# The file name embeds the base model's name. With save_best_only = True and
# mode = 'max', the file is only overwritten when 'val_accuracy' improves.
from keras.callbacks import ModelCheckpoint
filepath_cnn = './content/shuffle_5_' + str(base_model.name) + '_weights.hdf5'
checkpoint_cnn = ModelCheckpoint(filepath_cnn, monitor = 'val_accuracy', verbose = 0, 
                  save_best_only = True, mode = 'max')

In [None]:
# train the efficientnet b1
# The learning rate is halved after 9 epochs without 'val_loss' improvement (floor 1e-7).
# EarlyStopping is imported here but not passed to this fit call, so training runs
# for the full 100 epochs.
from tensorflow.keras.callbacks import EarlyStopping  
reduce_lr_loss_cnn = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss',
              factor = 0.5, patience = 9, verbose = 2, min_lr = 1e-7, mode = 'min')
history_cnn = cnn_model.fit(train_data, validation_data = valid_data, class_weight = weight_dictionary,
               verbose = 2, epochs = 100, callbacks=[reduce_lr_loss_cnn, checkpoint_cnn,])

In [None]:
# get the best model
from keras.models import load_model
from sklearn.metrics import classification_report
# load_model needs a path; calling it without one raises TypeError. Default to
# the file written by the checkpoint callback during training.
# Point this elsewhere to evaluate a different saved model.
best_cnn_model = load_model(filepath_cnn)

In [None]:
# evaluate the model
# Lookups that do not change between rows are built once, outside the loop,
# instead of re-materialising six lists and scanning them for every test row.
train_digits, train_compounds = set(train.digit.tolist()), set(train.compound.tolist())
valid_digits, valid_compounds = set(valid.digit.tolist()), set(valid.compound.tolist())
test_digits, test_compounds = set(test.digit.tolist()), set(test.compound.tolist())

# digit -> first position in all_data, i.e. the value list.index() would return.
digit_position = {}
for position, digit in enumerate(all_data.digit.tolist()):
  digit_position.setdefault(digit, position)

predicted_test = []
for i in range(test.shape[0]):
  row = test.iloc[i]

  # Leakage guards: a test row must not share a digit or compound with train/valid.
  assert row['digit'] not in train_digits
  assert row['compound'] not in train_compounds

  assert row['digit'] not in valid_digits
  assert row['compound'] not in valid_compounds

  assert row['digit'] in test_digits
  assert row['compound'] in test_compounds

  im = all_images[digit_position[row['digit']]]
  im = valid_transforms(image = im)['image']
  im = np.expand_dims(im, 0)  # add the batch axis expected by predict()
  value = best_cnn_model.predict(im).argmax()

  predicted_test.append(value)

print(classification_report(test.classes.tolist(), predicted_test))

MLP

In [None]:
# A function changing SMILES to Morgan fingerprints 
from rdkit import Chem
from rdkit.Chem import DataStructs, AllChem
def smiles_to_array(smiles):
  """Convert a SMILES string into a Morgan fingerprint as a 1-D integer array.

  Args:
    smiles: SMILES string of one molecule.

  Returns:
    1-D NumPy integer array holding the fingerprint bits (Morgan radius 2,
    bit-vector length left at the RDKit default).

  Raises:
    ValueError: if RDKit cannot parse ``smiles`` into a molecule.
  """
  molecule = Chem.MolFromSmiles(smiles)
  if molecule is None:
    # MolFromSmiles returns None on a parse failure; fail with a clear message
    # instead of an opaque error inside the fingerprint call.
    raise ValueError(f"Could not parse SMILES string: {smiles!r}")
  fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, 2)
  # Starts empty; the code relies on ConvertToNumpyArray resizing and filling it.
  bits = np.zeros(0,)
  DataStructs.ConvertToNumpyArray(fingerprint, bits)
  return bits.astype(int)

In [None]:
# generator for training data 
def get_train_smiles(end):
  """Yield ``(fingerprint, target)`` pairs for the first ``end`` rows of ``train``.

  The row's compound id is looked up in ``compound_smiles_dictionary``, the
  resulting SMILES string is converted with ``smiles_to_array``, and the result
  is paired with the row's integer class label.

  Args:
    end: number of rows of ``train`` to iterate over, starting at row 0.

  Yields:
    (smiles_array, target_mlp): the fingerprint array and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``valid`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant membership sets: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  start = 0
  while start < end:
    row = train.iloc[start]

    assert row['digit'] in train_digits
    assert row['compound'] in train_compounds

    assert row['digit'] not in valid_digits
    assert row['compound'] not in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    smiles = compound_smiles_dictionary[row['compound']]
    smiles_array = smiles_to_array(smiles)
    target_mlp = int(row['classes'])

    yield smiles_array, target_mlp
    start += 1

In [None]:
# generator for validation data 
def get_valid_smiles(end):
  """Yield ``(fingerprint, target)`` pairs for the first ``end`` rows of ``valid``.

  The row's compound id is looked up in ``compound_smiles_dictionary``, the
  resulting SMILES string is converted with ``smiles_to_array``, and the result
  is paired with the row's integer class label.

  Args:
    end: number of rows of ``valid`` to iterate over, starting at row 0.

  Yields:
    (smiles_array, target_mlp): the fingerprint array and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``train`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant membership sets: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  start = 0
  while start < end:
    row = valid.iloc[start]

    assert row['digit'] not in train_digits
    assert row['compound'] not in train_compounds

    assert row['digit'] in valid_digits
    assert row['compound'] in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    smiles = compound_smiles_dictionary[row['compound']]
    smiles_array = smiles_to_array(smiles)
    target_mlp = int(row['classes'])

    yield smiles_array, target_mlp
    start += 1

In [None]:
# complete the architecture of MLP and compile MLP
from keras.models import Sequential
from keras.layers import Dense

units = 64   # width of the single hidden layer
drop = 0.89  # dropout rate applied after the hidden layer

# 2048 fingerprint inputs -> Dense(relu) -> Dropout -> Dense(softmax) over 10 classes
model_mlp = Sequential()
model_mlp.add(Dense(units, input_dim = 2048, activation = 'relu'))
model_mlp.add(Dropout(drop))
model_mlp.add(Dense(10, activation = 'softmax'))
# The output layer already applies softmax, so the loss receives probabilities.
# from_logits must therefore be False; True would tell Keras to apply softmax
# a second time.
model_mlp.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-4),
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
         metrics = ['accuracy'])

In [None]:
# turn to TensorFlow dataset
# Elements are (float32 vector of length 2048, scalar int32 label);
# `args` passes the training row count to the generator as its `end` parameter.
train_smiles_data = tf.data.Dataset.from_generator(get_train_smiles,
            (tf.float32, tf.int32),
            (tf.TensorShape(2048), tf.TensorShape([])),
            args = [train.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

In [None]:
# turn to TensorFlow dataset
# Elements are (float32 vector of length 2048, scalar int32 label);
# `args` passes the validation row count to the generator as its `end` parameter.
valid_smiles_data = tf.data.Dataset.from_generator(get_valid_smiles,
            (tf.float32, tf.int32),
            (tf.TensorShape(2048), tf.TensorShape([])),
            args = [valid.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

In [None]:
# set the checkpoint
# With save_best_only = True and mode = 'max', the file is only overwritten
# when 'val_accuracy' improves.
filepath_mlp = './content/shuffle_5_mlp_weights.hdf5'
checkpoint_mlp = ModelCheckpoint(filepath_mlp, monitor = 'val_accuracy', verbose = 0, 
                  save_best_only = True, mode = 'max')

In [None]:
# train the model
# Training stops after 20 epochs without 'val_loss' improvement; the learning rate
# is halved after 9 such epochs (floor 1e-7). The class weights are the ones
# computed from the training split in an earlier cell.
from tensorflow.keras.callbacks import EarlyStopping  
earlyStopping = EarlyStopping(monitor = 'val_loss', patience = 20, verbose = 2, mode = 'min')
reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss',
                factor = 0.5, patience = 9, verbose = 2, min_lr = 1e-7, mode = 'min')
history_mlp = model_mlp.fit(train_smiles_data, validation_data = valid_smiles_data,
               class_weight = weight_dictionary, verbose = 2, epochs = 1800,      
               callbacks = [earlyStopping, reduce_lr_loss, checkpoint_mlp,])

In [None]:
# get the best MLP and evaluate it
#best_model_mlp = load_model('/content/drive/MyDrive/github/shuffle_5_mlp_weights.hdf5')
# An empty path cannot be loaded. Default to the file written by the checkpoint
# callback; point this elsewhere to evaluate a different saved model.
best_model_mlp = load_model(filepath_mlp)

# Membership sets do not change between rows, so build them once outside the
# loop instead of re-materialising six lists for every test row.
train_digits, train_compounds = set(train.digit.tolist()), set(train.compound.tolist())
valid_digits, valid_compounds = set(valid.digit.tolist()), set(valid.compound.tolist())
test_digits, test_compounds = set(test.digit.tolist()), set(test.compound.tolist())

predicted_test = []
for i in range(test.shape[0]):
  row = test.iloc[i]

  # Leakage guards: a test row must not share a digit or compound with train/valid.
  assert row['digit'] not in train_digits
  assert row['compound'] not in train_compounds

  assert row['digit'] not in valid_digits
  assert row['compound'] not in valid_compounds

  assert row['digit'] in test_digits
  assert row['compound'] in test_compounds

  smiles = compound_smiles_dictionary[row['compound']]
  smiles_array = smiles_to_array(smiles)
  smiles_array = np.expand_dims(smiles_array, 0)  # add the batch axis expected by predict()
  value = int(best_model_mlp.predict(smiles_array)[0].argmax())

  predicted_test.append(value)

print(classification_report(test.classes.tolist(), predicted_test))

Integrate MLP and CNN.



In [None]:
# training data generator for the global model
def get_train_image_smiles(end):
  """Yield ``((image, fingerprint), target)`` for the first ``end`` rows of ``train``.

  The image is taken from ``all_images`` at the position of the row's ``digit``
  within ``all_data`` and passed through ``train_transforms``; the fingerprint
  comes from the compound's SMILES string via ``smiles_to_array``.

  Args:
    end: number of rows of ``train`` to iterate over, starting at row 0.

  Yields:
    ((image, smiles_array), target): both model inputs and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``valid`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant lookups: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  # digit -> first position in all_data, i.e. the value list.index() would return.
  digit_position = {}
  for position, digit in enumerate(all_data.digit.tolist()):
    digit_position.setdefault(digit, position)

  start = 0
  while start < end:
    row = train.iloc[start]

    assert row['digit'] in train_digits
    assert row['compound'] in train_compounds

    assert row['digit'] not in valid_digits
    assert row['compound'] not in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    image = all_images[digit_position[row['digit']]]
    image = train_transforms(image = image)['image']

    smiles = compound_smiles_dictionary[row['compound']]
    smiles_array = smiles_to_array(smiles)
    target = int(row['classes'])

    yield (image, smiles_array), target
    start += 1

In [None]:
# validation data generator for the global model 
def get_valid_image_smiles(end):
  """Yield ``((image, fingerprint), target)`` for the first ``end`` rows of ``valid``.

  The image is taken from ``all_images`` at the position of the row's ``digit``
  within ``all_data`` and passed through ``valid_transforms``; the fingerprint
  comes from the compound's SMILES string via ``smiles_to_array``.

  Args:
    end: number of rows of ``valid`` to iterate over, starting at row 0.

  Yields:
    ((image, smiles_array), target): both model inputs and ``int(row['classes'])``.

  Raises:
    AssertionError: if a row's digit or compound also appears in ``train`` or
      ``test`` (guard against leakage between splits).
  """
  # Loop-invariant lookups: built once per generator run instead of
  # re-materialising six lists and scanning them for every row.
  train_digits = set(train.digit.tolist())
  train_compounds = set(train.compound.tolist())
  valid_digits = set(valid.digit.tolist())
  valid_compounds = set(valid.compound.tolist())
  test_digits = set(test.digit.tolist())
  test_compounds = set(test.compound.tolist())

  # digit -> first position in all_data, i.e. the value list.index() would return.
  digit_position = {}
  for position, digit in enumerate(all_data.digit.tolist()):
    digit_position.setdefault(digit, position)

  start = 0
  while start < end:
    row = valid.iloc[start]

    assert row['digit'] not in train_digits
    assert row['compound'] not in train_compounds

    assert row['digit'] in valid_digits
    assert row['compound'] in valid_compounds

    assert row['digit'] not in test_digits
    assert row['compound'] not in test_compounds

    image = all_images[digit_position[row['digit']]]
    image = valid_transforms(image = image)['image']
    smiles = compound_smiles_dictionary[row['compound']]
    smiles_array = smiles_to_array(smiles)
    target = int(row['classes'])

    yield (image, smiles_array), target
    start += 1

In [None]:
# turn to TensorFlow dataset
# Elements are ((float32 image, float32 vector of length 2048), scalar int32 label);
# `args` passes the training row count to the generator as its `end` parameter.
train_image_smiles = tf.data.Dataset.from_generator(get_train_image_smiles,
           ((tf.float32, tf.float32), tf.int32),
           ((tf.TensorShape([image_size, image_size, channels]), 2048), tf.TensorShape([])),
           args = [train.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

In [None]:
# turn to TensorFlow dataset
# Elements are ((float32 image, float32 vector of length 2048), scalar int32 label);
# `args` passes the validation row count to the generator as its `end` parameter.
valid_image_smiles = tf.data.Dataset.from_generator(get_valid_image_smiles,
           ((tf.float32, tf.float32), tf.int32),
           ((tf.TensorShape([image_size, image_size, channels]), 2048), tf.TensorShape([])),
           args = [valid.shape[0]]).batch(batch_size, num_parallel_calls = 64).prefetch(1024)

In [None]:
# combine MLP and efficientNet to get the global model
# Each trained model is cut at an intermediate layer, selected by layer name, so
# that its features rather than its class probabilities feed the global model.
# NOTE(review): 'global_average_pooling2d' and 'dropout_2' look like auto-generated
# Keras layer names; they presumably depend on how many layers of each type were
# created earlier in the session -- confirm with model.summary() before running.
from keras import Model
model1 = Model(inputs = best_cnn_model.input, 
        outputs = best_cnn_model.get_layer('global_average_pooling2d').output)

model2 = Model(inputs = best_model_mlp.input, outputs = best_model_mlp.get_layer('dropout_2').output)

In [None]:
# Fusion head: the image branch is projected to 64 units, concatenated with the
# SMILES branch along the last axis, then passed through Dense(128, relu) and a
# 10-way softmax output layer.
x1 = model1.output    
x1 = tf.keras.layers.Dense(64, activation = 'relu')(x1)    
x2 = model2.output 
x3 = tf.concat([x1, x2], axis = -1)
x3 = tf.keras.layers.Dense(128, activation = 'relu')(x3)
# The output layer applies softmax, so the model emits class probabilities, not logits.
output1 = tf.keras.layers.Dense(10, activation = 'softmax')(x3)
# Input order is [image, fingerprint]; the data generators yield their tuples in the same order.
global_model = tf.keras.models.Model(inputs = [model1.input, model2.input], outputs = output1)

In [None]:
# freeze MLP and efficientnet so that the first training stage only updates the
# newly added Dense layers of the global model
model1.trainable = False
model2.trainable = False

In [None]:
# compile the global model
# Use a fresh optimizer for this stage. The instance from the CNN run carries
# that run's step count and moment estimates, and possibly a learning rate
# already lowered by ReduceLROnPlateau. Hyperparameters match the CNN stage.
frozen_stage_optimizer = tfa.optimizers.AdamW(weight_decay = 1e-6, learning_rate = 0.001,
                         beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-07,)
# global_model ends in a softmax layer, so the loss receives probabilities:
# from_logits must be False.
global_model.compile(optimizer = frozen_stage_optimizer,
           loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
           metrics = ['accuracy'])

In [None]:
# train the global model with MLP and efficientnet frozen (10 warm-up epochs)
# No callbacks are passed here, so nothing is checkpointed during this stage.
history_global = global_model.fit(train_image_smiles, validation_data = valid_image_smiles,     
                  class_weight = weight_dictionary, verbose = 2, epochs = 10)

In [None]:
# Unfreeze MLP and efficientNet for the second training stage.
# The global model is compiled again in a later cell after this change.
model1.trainable = True
model2.trainable = True

In [None]:
# set the optimizer for the second stage: AdamW with the learning rate set to 1e-3/2
global_model_optimizer = tfa.optimizers.AdamW(weight_decay = 1e-6, learning_rate = 1e-3/2,
                         beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-07,)

In [None]:
# compile the global model
# global_model ends in a softmax layer, so the loss receives probabilities.
# from_logits must therefore be False; True would tell Keras to apply softmax
# a second time.
global_model.compile(optimizer = global_model_optimizer,
           loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
           metrics = ['accuracy'])

In [None]:
# set the checkpoint
# With save_best_only = True and mode = 'max', the file is only overwritten
# when 'val_accuracy' improves.
from keras.callbacks import ModelCheckpoint
filepath = './content/shuffle_5_global_weights.hdf5'
global_model_checkpoint = ModelCheckpoint(filepath, monitor = 'val_accuracy', verbose = 0, 
                      save_best_only = True, mode = 'max')

In [None]:
# train the global model
# Runs 100 - 10 = 90 epochs. The learning rate is halved after 9 epochs without
# 'val_loss' improvement (floor 1e-7).
# Note: this assignment replaces the history_global produced by the frozen stage.
reduce_lr_loss_global = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss',
             factor = 0.5, patience = 9, verbose = 2, min_lr = 1e-7, mode = 'min')
history_global = global_model.fit(train_image_smiles, validation_data = valid_image_smiles,     
             class_weight = weight_dictionary, verbose = 2, epochs = 100-10,
             callbacks = [reduce_lr_loss_global, global_model_checkpoint,])

In [None]:
# load the best global model and evaluate it
#best_global_model = load_model('/content/drive/MyDrive/github/shuffle_5_global_weights.hdf5')
# An empty path cannot be loaded. Default to the file written by the checkpoint
# callback; point this elsewhere to evaluate a different saved model.
best_global_model = load_model(filepath)

# Lookups that do not change between rows are built once, outside the loop,
# instead of re-materialising six lists and scanning them for every test row.
train_digits, train_compounds = set(train.digit.tolist()), set(train.compound.tolist())
valid_digits, valid_compounds = set(valid.digit.tolist()), set(valid.compound.tolist())
test_digits, test_compounds = set(test.digit.tolist()), set(test.compound.tolist())

# digit -> first position in all_data, i.e. the value list.index() would return.
digit_position = {}
for position, digit in enumerate(all_data.digit.tolist()):
  digit_position.setdefault(digit, position)

predicted_test = []
for i in range(test.shape[0]):
  row = test.iloc[i]

  # Leakage guards: a test row must not share a digit or compound with train/valid.
  assert row['digit'] not in train_digits
  assert row['compound'] not in train_compounds

  assert row['digit'] not in valid_digits
  assert row['compound'] not in valid_compounds

  assert row['digit'] in test_digits
  assert row['compound'] in test_compounds

  image = all_images[digit_position[row['digit']]]
  image = valid_transforms(image = image)['image']
  smiles = compound_smiles_dictionary[row['compound']]
  smiles_array = smiles_to_array(smiles)

  # add the batch axis expected by predict() to both inputs
  image = np.expand_dims(image, 0)
  smiles_array = np.expand_dims(smiles_array, 0)

  value = best_global_model.predict((image, smiles_array)).argmax()
  predicted_test.append(value)

print(classification_report(test.classes.tolist(), predicted_test))

In [None]:
# references  
# https://www.tensorflow.org/guide/data
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset
# https://www.tensorflow.org/