In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from rdkit import Chem 
from rdkit.Chem import Descriptors

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Load the five raw CSV tables from the working directory: features and labels
# for the pretraining and training sets, and features only for the test set.
(
    df_pretrain_features,
    df_pretrain_labels,
    df_train_features,
    df_train_labels,
    df_test_features,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'pretrain_features.csv',
        'pretrain_labels.csv',
        'train_features.csv',
        'train_labels.csv',
        'test_features.csv',
    )
)

In [3]:
# Keep only the model inputs: every column from the third one onward.
# The first two columns are skipped (presumably an identifier and the SMILES
# string -- TODO confirm against the CSV header).
pretrain_features, train_features, test_features = (
    raw_frame.iloc[:, 2:]
    for raw_frame in (df_pretrain_features, df_train_features, df_test_features)
)

# (1) Feature creation 

## (1.1) Feature compression of the given (RDKit) features with an autoencoder

In [4]:
# Hold out 20% of the pretraining features to monitor reconstruction quality.
# NOTE: X_train / X_test are reassigned by the mol2vec cell further down, so
# they only hold this split until that cell runs.
X_train, X_test = train_test_split(pretrain_features, test_size=0.2, random_state=42)

# Seed TensorFlow too, so that weight initialisation is repeatable and not only
# the split above (GPU kernels may still cause small run-to-run differences).
tf.random.set_seed(42)

# Number of input columns; the decoder has to reconstruct exactly this many.
n_features = pretrain_features.shape[1]

# Symmetric dense autoencoder: 1000 -> 500 -> 250 -> 50 (bottleneck) -> 250 -> 500
# -> reconstruction. Declaring the input up front builds the model immediately,
# so `autoencoder.input` below does not depend on `fit` having been called.
autoencoder = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(1000, activation='tanh'),
    tf.keras.layers.Dense(500, activation='tanh'),
    tf.keras.layers.Dense(250, activation='tanh'),
    # The layer name keeps its original spelling; it is looked up by name below.
    tf.keras.layers.Dense(50, activation='tanh', name = 'bottelneck'),
    tf.keras.layers.Dense(250, activation='tanh'),
    tf.keras.layers.Dense(500, activation='tanh'),
    # Output width follows the data instead of being hard-coded to 1000.
    tf.keras.layers.Dense(n_features, activation='tanh'),
])

# Reconstruction is trained with mean squared error. BinaryAccuracy is only
# informative if the features are 0/1-valued -- TODO confirm.
autoencoder.compile(optimizer='adam', loss='mse',
                    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

# Train the network to reproduce its own input; the held-out split is used for
# validation only.
history = autoencoder.fit(X_train, X_train, epochs=5, validation_data = (X_test, X_test))

# Sub-model that stops at the bottleneck layer and therefore outputs the
# 50-dimensional compressed representation.
layer_name = 'bottelneck'
compress_model = Model(inputs=autoencoder.input,
                                 outputs=autoencoder.get_layer(layer_name).output)

# Compress all three feature sets with the encoder trained on the pretrain data.
compressed_pretrain_features = pd.DataFrame(compress_model.predict(pretrain_features))
compressed_train_features = pd.DataFrame(compress_model.predict(train_features))
compressed_test_features = pd.DataFrame(compress_model.predict(test_features))

Metal device set to: Apple M1 Pro
Epoch 1/5


2022-05-31 17:16:11.831370: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-31 17:16:11.831482: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-05-31 17:16:12.005759: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-05-31 17:16:12.005977: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


   7/1250 [..............................] - ETA: 10s - loss: 0.0533 - accuracy: 0.9526 

2022-05-31 17:16:12.173583: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-05-31 17:16:20.276340: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


2022-05-31 17:16:55.699597: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


## (1.2) additional feature extraction using mol2vec
Following the tutorial at https://www.kaggle.com/code/vladislavkisin/tutorial-ml-in-chemistry-research-rdkit-mol2vec/notebook

In [5]:
def _embed_smiles(source_df, w2v_model, radius=1):
    """Turn the 'smiles' column of a feature table into mol2vec embeddings.

    Parameters
    ----------
    source_df : DataFrame that contains a 'smiles' column.
    w2v_model : loaded gensim Word2Vec model handed to ``sentences2vec``.
    radius : radius passed to ``mol2alt_sentence``.

    Returns
    -------
    (frame, vectors) : ``frame`` holds the columns 'smiles', 'mol', 'sentence'
        and 'mol2vec'; ``vectors`` is a numpy array with one embedding per row.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more SMILES strings.
    """
    frame = pd.DataFrame(source_df['smiles'])

    # SMILES -> RDKit molecule. MolFromSmiles returns None when parsing fails.
    frame['mol'] = frame['smiles'].apply(Chem.MolFromSmiles)
    unparsed = frame.index[frame['mol'].isna()].tolist()
    if unparsed:
        # Fail with a readable message here instead of an obscure error inside
        # mol2alt_sentence. Rows are not dropped, because the result must stay
        # aligned row for row with the other feature tables.
        raise ValueError(
            f"RDKit could not parse {len(unparsed)} SMILES string(s); "
            f"first offending row labels: {unparsed[:10]}"
        )

    # Molecule -> "sentence" of substructure identifiers.
    frame['sentence'] = frame['mol'].apply(
        lambda mol: MolSentence(mol2alt_sentence(mol, radius))
    )

    # unseen='UNK' makes sentences2vec fall back to the model's 'UNK' vector
    # for substructures that are missing from the model vocabulary.
    frame['mol2vec'] = [
        DfVec(vec)
        for vec in sentences2vec(frame['sentence'], w2v_model, unseen='UNK')
    ]
    vectors = np.array([entry.vec for entry in frame['mol2vec']])
    return frame, vectors


# Load the pre-trained mol2vec model (300-dimensional, judging by the file name).
# NOTE(review): gensim's load() unpickles the file, so only use model files
# obtained from a trusted source.
model = word2vec.Word2Vec.load('model_300dim.pkl')

# Embed the SMILES strings of all three data sets.
# NOTE: X_train / X_test are overwritten here; from now on they hold mol2vec
# vectors, not the autoencoder train/validation split.
pretrain, X_pretrain = _embed_smiles(df_pretrain_features, model)
train, X_train = _embed_smiles(df_train_features, model)
test, X_test = _embed_smiles(df_test_features, model)

In [7]:
def _join_feature_blocks(compressed, embedded):
    """Place two feature tables side by side, row for row.

    ``pd.concat(axis=1)`` aligns on the index and silently pads with NaN when
    the indexes differ, so the indexes are required to match exactly.

    Raises
    ------
    ValueError
        If the two tables do not share the same index.
    """
    if not compressed.index.equals(embedded.index):
        raise ValueError(
            "feature blocks are not row-aligned: "
            f"{len(compressed)} vs {len(embedded)} rows"
        )
    return pd.concat([compressed, embedded], axis=1)


# Wrap the mol2vec arrays in DataFrames.
vec_pretrain_features = pd.DataFrame(X_pretrain)
vec_train_features = pd.DataFrame(X_train)
vec_test_features = pd.DataFrame(X_test)

# Concatenate the autoencoder features (1.1) with the mol2vec features (1.2).
# NOTE: both blocks carry integer column labels starting at 0, so the combined
# header contains duplicate labels; address the columns by position.
final_pretrain_features = _join_feature_blocks(compressed_pretrain_features, vec_pretrain_features)
final_train_features = _join_feature_blocks(compressed_train_features, vec_train_features)
final_test_features = _join_feature_blocks(compressed_test_features, vec_test_features)

# Persist the combined feature tables without the row index.
for final_features, csv_name in (
    (final_pretrain_features, 'pretrain_features_new.csv'),
    (final_train_features, 'train_features_new.csv'),
    (final_test_features, 'test_features_new.csv'),
):
    final_features.to_csv(csv_name, index=False)