In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import os
from tensorflow import keras
from math import floor

In [2]:
# Setup
# Notebook-wide configuration: fingerprint file locations, pandas display
# limits and the NumPy random seed.

# Root directory and the sub-directory that holds the pickled fingerprint frames.
PATH = '.'
FINGERPRINTS = 'fingerprints'
fing_path = os.path.join(PATH, FINGERPRINTS)

# File names of the pickled frames, one per fingerprint type.
morg_2048_path = 'morgan_2048_df.p'
morg_1024_path = 'morgan_1024_df.p'
maccs = 'maccs_df.p'

# Full paths to the pickles.
# NOTE(review): `maccs` is rebound here from a bare file name to a full path,
# so re-running only the lines below without the assignment above nests the
# path a second time.
# NOTE(review): none of these three paths is referenced by the other visible
# cells; the loading cell builds its own path from FINGERPRINTS.
morg_2048_bit = os.path.join(fing_path, morg_2048_path)
morg_1024_bit = os.path.join(fing_path, morg_1024_path)
maccs = os.path.join(fing_path, maccs)


# Limit how much of a DataFrame is rendered when it is printed/displayed.
pd.options.display.max_rows = 14
pd.options.display.max_columns = 6
# Seeds NumPy's global RNG only. Presumably this is what drives the pandas
# `.sample()` calls used for batching (TODO confirm); no TensorFlow seed is set
# in the visible cells, so weight initialisation and dropout are not reproducible.
np.random.seed(2)

In [47]:
# Load the pickled fingerprint frame and split it into a training set
# (further separated by class) and a first validation set.
current_fingerprint = 'maccs'

# Fractions of the frame assigned to each split. The split is positional
# (rows are taken in file order, no shuffling is done here).
train_frac = .5
validation_frac = .15
# NOTE(review): test_frac is not used anywhere in this cell, and the three
# fractions add up to 0.85, so the last 15% of rows is never assigned.
test_frac = .2

# Build the path with os.path.join so it works on every OS (the previous
# hard-coded '\\' separator only resolved on Windows) and honours fing_path.
fingerprint_file = os.path.join(fing_path, '%s_df.p' % current_fingerprint)

# SECURITY: pickle.load can execute arbitrary code; only load files you
# produced yourself or otherwise trust.
# The context manager guarantees the file handle is closed after loading.
with open(fingerprint_file, 'rb') as fh:
    df = pickle.load(fh)

# Positional 0..n-1 index, so label-based lookups such as
# df_training['fingerprints'][0] work later on.
df.reset_index(drop=True, inplace=True)

df_len = len(df)
# Convert both columns to tuples so that a whole column can be compared
# against a tuple label such as (1,0) below.
df['Solubility'] = df['Solubility'].apply(tuple)
df['fingerprints'] = df['fingerprints'].apply(tuple)

# Row boundaries of the splits, computed once.
train_end = floor(df_len*train_frac)
validation_end = train_end + floor(df_len*validation_frac)

# Creating training and first validation set
# (1,0) marks an insoluble compound, (0,1) a soluble one.
df_training = df[:train_end]
df_training_insol = df_training[df_training['Solubility']==(1,0)]
df_training_sol = df_training[df_training['Solubility']==(0,1)]

validation_set1 = df[train_end:validation_end]

# Delete original set to reduce overhead.
del df


In [53]:
# NOTE(review): stale cell. `df` is deleted at the end of the data-loading
# cell and `df2` is not defined in any visible cell, so this line raises
# NameError on a fresh Restart & Run All. The recorded output below came from
# leftover kernel state; remove this cell or replace it with a `.head()` of a
# frame that still exists.
print(df, df2)

                                                  SMILES Solubility  \
0      O=C([C@H](CC1=CC=CC=C1)N1C(=O)C2=CC=CC=C2C1=O)...     [0, 1]   
1             CC1=C(C)CC2C(C1)C(=O)N(C2=O)C1=CC=CC(O)=C1     [0, 1]   
2             CC1=C(C)CC2C(C1)C(=O)N(C2=O)C1=CC=C(I)C=C1     [0, 1]   
3                             CCCCN1C(=O)NC2=C(CCC2)C1=O     [0, 1]   
4                  CC1=NC2=C(S1)C=CC1=C2C=CC2=C1N=C(C)S2     [1, 0]   
5                            CC(C)OC(=O)NC1=CC=C(Br)C=C1     [0, 1]   
6                             O=C(NC1=CC=CC=C1)OC1CCCCC1     [0, 1]   
...                                                  ...        ...   
50613          COC1=CC=CC(C=CC(=O)C2=CC=CC(=C2)C#N)=C1OC     [0, 1]   
50614    CC(=O)N1CCC2=CC(=CC=C12)C(=O)NC1CCOC2=CC=CC=C12     [0, 1]   
50615             ClC1=CC=C(COC(=O)C2=CC=C3OCOC3=C2)C=C1     [0, 1]   
50616        BrC1=CC=C(C=C1)C(=O)NC1=CC=C(C=C1)C1=CNC=N1     [0, 1]   
50617                 CC1=CC=C2N(CCCC2=C1)C(=O)C1=COCCO1     [0, 1]   
50618 

In [4]:
# Class balance of the training split: count each label once, then report.
solubility_labels = list(df_training['Solubility'])
n_insoluble = sum(1 for label in solubility_labels if label == (1,0))
n_soluble = sum(1 for label in solubility_labels if label == (0,1))

# NOTE(review): the first figure is the insoluble-to-soluble ratio (in %),
# not the insoluble share of the whole training set.
print(n_insoluble/n_soluble*100,'% insoluble')
print(n_insoluble, '# insoluble')
print(n_soluble, '# soluble')

5.634390651085141 % insoluble
1350 # insoluble
23960 # soluble


In [5]:
def batch(df_i, df_s, n_per_class=100):
    """
    Feed function for the model: draws a class-balanced (undersampled) minibatch.

    Randomly picks `n_per_class` insoluble and `n_per_class` soluble compounds,
    shuffles them together and returns them as arrays.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Insoluble compounds; must have 'fingerprints' and 'Solubility' columns.
    df_s : pandas.DataFrame
        Soluble compounds; same columns as `df_i`.
    n_per_class : int, default 100
        Number of rows drawn (without replacement) from each frame, so the
        returned batch has 2 * n_per_class rows.

    Returns
    -------
    fprints : numpy.ndarray of int32
        One row per sampled compound.
    labels : numpy.ndarray of int32
        Label rows, aligned with `fprints`.

    Raises
    ------
    ValueError
        Propagated from DataFrame.sample when a frame has fewer than
        `n_per_class` rows.
    """
    insol = df_i.sample(n=n_per_class)
    sol = df_s.sample(n=n_per_class)

    # sample(frac=1) returns every row in random order, i.e. it shuffles the
    # two classes together so the batch is not ordered by label.
    mixed = pd.concat([insol, sol]).sample(frac=1)

    fprints = np.array(list(mixed['fingerprints']), dtype='int32')
    labels = np.array(list(mixed['Solubility']), dtype='int32')

    return fprints, labels

def validation_input(valid_set):
    """
    Turn a validation frame into arrays that can be fed to the model.

    Reads the 'fingerprints' and 'Solubility' columns of `valid_set` and
    returns them, in that order, as two NumPy arrays with one row per
    frame row. No sampling or shuffling is done.
    """
    fingerprint_rows = list(valid_set['fingerprints'])
    label_rows = list(valid_set['Solubility'])
    return np.array(fingerprint_rows), np.array(label_rows)

def accuracy(predictions, labels):
    """
    Percentage of rows whose highest-scoring class matches the label's class.

    Both arguments are 2-D arrays with one row per sample; the class of a row
    is the index of its largest entry (arg-max along axis 1).
    """
    predicted_class = np.argmax(predictions, 1)
    true_class = np.argmax(labels, 1)
    n_correct = np.sum(predicted_class == true_class)
    return 100.0 * n_correct / predictions.shape[0]

def insol_accuracy(valid, labels_v):
    """
    Percentage of insoluble compounds that the model classifies as insoluble.

    Parameters
    ----------
    valid : array-like, shape (n_samples, 2)
        Model outputs (e.g. softmax probabilities). A row counts as predicted
        insoluble when its first entry is the arg-max, the same rule that
        `accuracy` uses. Comparing probabilities for exact equality with
        [1, 0] would only ever match fully saturated outputs.
    labels_v : array-like, shape (n_samples, 2)
        One-hot labels; a row equal to [1, 0] marks an insoluble compound.

    Returns
    -------
    float
        100 * (insoluble rows predicted insoluble) / (insoluble rows), or NaN
        when `labels_v` contains no insoluble rows (the ratio is undefined).
    """
    predicted = np.asarray(valid)
    actual = np.asarray(labels_v)

    # Nothing to score: avoid arg-max on an empty array / division by zero.
    if actual.size == 0:
        return float('nan')

    actual_insol = np.all(actual == [1, 0], axis=1)
    n_insol = int(np.sum(actual_insol))
    if n_insol == 0:
        return float('nan')

    # Class 0 is the insoluble class in the (1,0)/(0,1) one-hot encoding.
    predicted_insol = np.argmax(predicted, axis=1) == 0
    n_hit = int(np.sum(predicted_insol & actual_insol))

    return n_hit / n_insol * 100

In [21]:
# Neural network time!
# Builds a TensorFlow (1.x graph-mode API) classifier with a single hidden
# layer: fingerprints -> relu6 hidden layer -> dropout -> 2-class logits.

graph = tf.Graph()

# Input width = length of the first training fingerprint (label-indexed row 0).
n_inputs = len(df_training['fingerprints'][0])
layer1_nodes = 2300
# NOTE(review): layer2_nodes is not referenced anywhere in this cell.
layer2_nodes = 1300

# NOTE(review): batch_size is not referenced in this cell, and batch() draws a
# fixed 100 + 100 rows on its own, so changing this value has no effect here.
batch_size = 200
learning_rate = .0001

with graph.as_default():
    # Setting up tensorflow graph
    # Training data to be fed at runtime
    # Both placeholders are created without a shape; the same pair is fed with
    # minibatches and with the full validation set by the training cell.
    train_data = tf.placeholder(dtype=tf.float32, name='input_layer')
    train_labels = tf.placeholder(dtype=tf.float32, name='train_labels')

    
    # Weights
    # No bias terms are created. truncated_normal is called without a stddev
    # argument, so the library default applies.
    # NOTE(review): the recorded run starts at a loss of ~148.8, which suggests
    # the initial logits are very large; consider a smaller stddev - TODO confirm.
    layer1_weights = tf.Variable(tf.truncated_normal([n_inputs, layer1_nodes]), name='l1_weights')
    # The Python name says "layer4" but the graph name is 'l2_weights'; this is
    # the output projection from the hidden layer to the 2 classes.
    layer4_weights = tf.Variable(tf.truncated_normal([layer1_nodes, 2]), name='l2_weights')
    
    # Logits
    logit1 = tf.matmul(train_data, layer1_weights, name='logit1')
    relu_layer1 = tf.nn.relu6(logit1, name='relu_layer')
    # keep_prob is a hard-coded constant, so dropout is also active whenever
    # train_prediction is evaluated on validation data.
    dropout_ = tf.nn.dropout(x=relu_layer1, keep_prob=.9, name='dropout')
    logit4 = tf.matmul(dropout_, layer4_weights, name='logit4')
    
    # Loss tf.nn.softmax_cross_entropy_with_logits_v2
    # Mean softmax cross-entropy between the output logits and the fed labels.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit4, labels=train_labels), name='loss')
    # Optimizer
    # Plain gradient descent on `loss` with the fixed learning_rate above.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate,name='optimizer').minimize(loss)
    
    # Prediction
    # Softmax over the output logits; fetched for both training and validation.
    train_prediction = tf.nn.softmax(logit4, name='predictor')

In [22]:


# Train on class-balanced minibatches and report minibatch / validation
# metrics every 500 steps.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # writer = tf.summary.FileWriter('.', session.graph)

    # The validation arrays do not change between steps, so build them (and
    # their feed dict) once instead of at every reporting step.
    data_v, labels_v = validation_input(validation_set1)
    valid_feed_dict = {train_data: data_v, train_labels: labels_v}

    for step in range(10001):
        # One optimisation step on a freshly sampled balanced minibatch.
        data, labels = batch(df_training_insol, df_training_sol)
        feed_dict = {train_data: data, train_labels: labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

        if step % 500 == 0:
            # The optimizer op is not fetched here, so this pass does not
            # update the weights.
            # NOTE(review): dropout is part of the graph with a constant
            # keep_prob, so these validation predictions are still computed
            # with dropout applied. The validation loss is fetched but not
            # reported.
            _, valid = session.run([loss, train_prediction], feed_dict=valid_feed_dict)
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.2f}".format(accuracy(predictions, labels)))
            print("Insoluble minibatch accuracy: {:.2f}".format(insol_accuracy(predictions, labels)))
            print("Insoluble validation accuracy: {:.2f}".format(insol_accuracy(valid, labels_v)))
            # Despite the "Minibatch" wording this is computed on the whole
            # validation set.
            print("Minibatch validation accuracy: {:.2f}".format(accuracy(valid, labels_v)))
            print("")
        
        

Minibatch loss at step 0: 148.84068298339844
Minibatch accuracy: 50.00
Insoluble minibatch accuracy: 97.00
Insoluble validation accuracy: 98.03
Minibatch validation accuracy: 5.62



KeyboardInterrupt: 

[1, 2, 2, 3, 4]