In [3]:
#Data preprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from rdkit import Chem
import os
os.chdir('C:/Users/sunoj/downloads')

In [4]:
df=pd.read_csv('fullPubchemMonoPhosphine.csv')

In [6]:
df.shape

(137610, 3)

In [7]:
df.columns

Index(['CID', 'MolecularWeight', 'CanonicalSMILES'], dtype='object')

In [8]:
df['mol_len']=[len(i) for i in df['CanonicalSMILES']]

In [10]:
smiles_list=df[df['mol_len']<=100]['CanonicalSMILES']

In [11]:
len(smiles_list)

98377

In [12]:
max_len=100

In [13]:
smile_padded = [i.ljust(max_len) for i in smiles_list]

In [19]:
#collect the unique characters; I will do the help of a function from guzik's code; its easy i can also do it
def smiles2one_hot_chars(smi_list):
    # get all the characters
    char_lists = [list(smi) for smi in smi_list]
    chars = list(set([char for sub_list in char_lists for char in sub_list]))
    chars.append(' ')
    return chars

In [20]:
unique_characters=smiles2one_hot_chars(smile_padded)

In [22]:
#check the number of unique characters and print the unique letter; NOTE the blankspace as unique character
print(len(unique_characters))
print(unique_characters)

62
[']', '(', 'o', 'l', 'Z', 'f', 'r', 'm', '3', 'N', 'd', 'n', '[', 'I', 'R', 'O', 'C', '1', 'F', '7', '5', 'c', 'h', 'B', 'e', 't', 'p', 'a', 'K', 'Y', 'L', 's', 'u', 'M', '0', '.', 'P', ' ', 'i', '+', '4', '2', '#', 'b', 'W', 'S', '-', '9', '8', '6', 'U', 'H', ')', 'g', '=', 'V', 'E', '%', 'T', 'G', 'A', ' ']


In [25]:
#now there are two way i can convert the smiles string to one hot encoded vector; 
#i will write both of them here one by one
def string_vectorizer(string, unique_characters):
    """given a string and the list of unique characters
    this function returns one hot encoded vector of the string"""
    vector = [[0 if char != letter else 1 for char in unique_characters] 
                  for letter in string]
    return vector

In [26]:
smile_ohe=[string_vectorizer(i, unique_characters) for i in smile_padded] #it returns a list of list

In [27]:
#convert the output of string_vectorizer to numpy array
smile_ohe_data=np.array(smile_ohe)

In [28]:
smile_ohe_data.shape

(98377, 100, 62)

In [31]:
smile_ohe_data[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [32]:
smile_padded[0]

'C1=CC=C(C=C1)P(=O)(C2=CC=CC=C2)C3=CC=CC=C3                                                          '

In [33]:
#this two things will be handy to get back to the smiles from one hot encoded vectors
char_to_int = dict((c, i) for i, c in enumerate(unique_characters))
int_to_char=dict((i, j) for i, j in enumerate(unique_characters))

In [34]:
#okay lets convert the first one hot encoded vector to the corresponding smile
def ohe_to_smile(ohe_vector):
    letterIndex=np.argmax(ohe_vector, axis=1)
    letterIndexList=list(letterIndex)
    letters=[int_to_char[i] for i in letterIndexList]
    smile=''.join(letters)
    mol=Chem.MolFromSmiles(smile)
    smile=Chem.MolToSmiles(mol)
    return smile

In [36]:
demo=ohe_to_smile(smile_ohe_data[1])

In [37]:
Chem.MolFromSmiles(demo)

<rdkit.Chem.rdchem.Mol at 0x27455418030>

In [38]:
demo

'c1ccc(P(c2ccccc2)c2ccccc2)cc1'

In [39]:
data_reshaped=np.reshape(smile_ohe_data, (98377, 6200))

In [40]:
data_reshaped[0]

array([0, 0, 0, ..., 0, 0, 1])

In [41]:
data_reshaped[0].shape

(6200,)

In [42]:
# Dataset parameters.
num_features = 6200 # data features (smile shape: 100*62).

# Training parameters.
batch_size = 128
epochs = 50

# Network Parameters
hidden_1 = 4000 # 1st layer num features.
hidden_2 = 1000 # 2nd layer num features (the latent dim).


In [43]:
inputs = keras.Input(shape=(num_features, ))
encoder = keras.layers.Dense(hidden_1, activation='sigmoid')(inputs)
encoder = keras.layers.Dense(hidden_2, activation='sigmoid')(encoder)
encoder_model = keras.Model(inputs, encoder, name='encoder')
encoder_model.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6200)]            0         
_________________________________________________________________
dense (Dense)                (None, 4000)              24804000  
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              4001000   
Total params: 28,805,000
Trainable params: 28,805,000
Non-trainable params: 0
_________________________________________________________________


In [44]:
latent_dim = keras.Input(shape=(hidden_2, ))
decoder = keras.layers.Dense(hidden_1, activation='sigmoid')(latent_dim)
decoder = keras.layers.Dense(num_features, activation='sigmoid')(decoder)
decoder_model = keras.Model(latent_dim, decoder, name='decoder')
decoder_model.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
dense_2 (Dense)              (None, 4000)              4004000   
_________________________________________________________________
dense_3 (Dense)              (None, 6200)              24806200  
Total params: 28,810,200
Trainable params: 28,810,200
Non-trainable params: 0
_________________________________________________________________


In [45]:
outputs = decoder_model(encoder_model(inputs))
ae_model = keras.Model(inputs, outputs )
ae_model.compile(optimizer='adam', loss='mse')
ae_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6200)]            0         
_________________________________________________________________
encoder (Functional)         (None, 1000)              28805000  
_________________________________________________________________
decoder (Functional)         (None, 6200)              28810200  
Total params: 57,615,200
Trainable params: 57,615,200
Non-trainable params: 0
_________________________________________________________________


In [47]:
ae_model.fit(x=data_reshaped, y=data_reshaped, batch_size=batch_size, shuffle=False, epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x276381465f8>

In [50]:
history=ae_model.history

In [53]:
history.on_batch_begin

<bound method Callback.on_batch_begin of <tensorflow.python.keras.callbacks.History object at 0x00000276381465F8>>

In [106]:
y_true=data_reshaped[100]
y_true=np.reshape(y_true, (100, 62))
ohe_to_smile(y_true)

'CCOC(=O)C[P+](c1ccccc1)(c1ccccc1)c1ccccc1.[Cl-]'

In [99]:
y_true

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [65]:
data_reshaped[0].shape

(6200,)

In [105]:
y_pred=ae_model.predict(data_reshaped[100:110]) #Predicting the first 10 training sample
y_pred.shape 

(10, 6200)

In [109]:
y_pred0=y_pred[0]
y_pred0=np.reshape(y_pred0, (100, 62))
letterIndex=[np.argmax(i) for i in y_pred0]
letters=[int_to_char[i] for i in letterIndex]
smile=''.join(letters)
smile

'CCOC(=O)C[P+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3.[Cl-]                                            '

In [None]:
new_smil