In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import os
from tensorflow import keras
from math import floor

In [2]:
# Setup

PATH = '.'
FINGERPRINTS = 'fingerprints'
fing_path = os.path.join(PATH, FINGERPRINTS)

morg_2048_path = 'morgan_2048_df.p'
morg_1024_path = 'morgan_1024_df.p'
maccs = 'maccs_df.p'

morg_2048_bit = os.path.join(fing_path, morg_2048_path)
morg_1024_bit = os.path.join(fing_path, morg_1024_path)
maccs = os.path.join(fing_path, maccs)

train_frac = .5
validation_frac = .15
test_frac = .2

pd.options.display.max_rows = 14
pd.options.display.max_columns = 6
np.random.seed(2)

In [3]:
def combined_fp():
    current_fingerprint = 'pubchem'
    # Combining fingerprints
    df = pickle.load(open(FINGERPRINTS+'\\%s_df.p'%current_fingerprint, 'rb'))

    current_fingerprint = 'maccs'
    # Combining fingerprints
    df2 = pickle.load(open(FINGERPRINTS+'\\%s_df.p'%current_fingerprint, 'rb'))
    df['fingerprints'] = df['fingerprints'] + df2['fingerprints']
    del df2

    current_fingerprint = 'morgan_2048'
    # Combining fingerprints
    df3 = pickle.load(open(FINGERPRINTS+'\\%s_df.p'%current_fingerprint, 'rb'))
    df['fingerprints'] = df['fingerprints'] + df3['fingerprints']
    del df3
    
    pickle.dump(df, open('Fingerprints\\combined_df.p', 'wb+'))
# combined_fp()

In [13]:
current_fingerprint = 'morgan_1024'

# load fingerprints
df = pickle.load(open(FINGERPRINTS+'\\%s_df.p'%current_fingerprint, 'rb'))
# Randomize dataframe
df = df.sample(frac = 1)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

df_len = len(df)
df['Solubility'] = df['Solubility'].apply(tuple)
df['fingerprints'] = df['fingerprints'].apply(tuple)

# Creating training and first validation set
df_training = df[:floor(df_len*train_frac)]
df_training_insol = df_training[df_training['Solubility']==(1,0)]
df_training_sol = df_training[df_training['Solubility']==(0,1)]

validation_set1 = df[floor(df_len*train_frac):floor(df_len*train_frac)+floor(df_len*validation_frac)]

# Delete original set to reduce overhead.
del df

In [14]:
for item in range(100):
    print('ones: ',df_training['fingerprints'][item].count(1))
    print('zeroes: ',df_training['fingerprints'][item].count(0))

ones:  40
zeroes:  984
ones:  39
zeroes:  985
ones:  56
zeroes:  968
ones:  41
zeroes:  983
ones:  39
zeroes:  985
ones:  45
zeroes:  979
ones:  47
zeroes:  977
ones:  35
zeroes:  989
ones:  44
zeroes:  980
ones:  46
zeroes:  978
ones:  52
zeroes:  972
ones:  43
zeroes:  981
ones:  44
zeroes:  980
ones:  51
zeroes:  973
ones:  37
zeroes:  987
ones:  51
zeroes:  973
ones:  37
zeroes:  987
ones:  44
zeroes:  980
ones:  39
zeroes:  985
ones:  38
zeroes:  986
ones:  32
zeroes:  992
ones:  40
zeroes:  984
ones:  49
zeroes:  975
ones:  40
zeroes:  984
ones:  49
zeroes:  975
ones:  63
zeroes:  961
ones:  46
zeroes:  978
ones:  47
zeroes:  977
ones:  33
zeroes:  991
ones:  38
zeroes:  986
ones:  47
zeroes:  977
ones:  40
zeroes:  984
ones:  47
zeroes:  977
ones:  47
zeroes:  977
ones:  44
zeroes:  980
ones:  49
zeroes:  975
ones:  55
zeroes:  969
ones:  47
zeroes:  977
ones:  48
zeroes:  976
ones:  49
zeroes:  975
ones:  43
zeroes:  981
ones:  50
zeroes:  974
ones:  56
zeroes:  968
ones:  48
z

In [20]:
print(len([c for c, i in enumerate(df_training['Solubility']) if i == (1,0)])
/len([c for c, i in enumerate(df_training['Solubility']) if i == (0,1)])*100,'% insoluble')
print(len([c for c, i in enumerate(df_training['Solubility']) if i == (1,0)]), '# insoluble')
print(len([c for c, i in enumerate(df_training['Solubility']) if i == (0,1)]), '# soluble')

5.635096214050173 % insoluble
1350 # insoluble
23957 # soluble


In [8]:
def batch(df_i,df_s):
    """
    Feed function for model. Undersampled, randomly pickes 100 insoluble and 100 soluble compounds
    """
    insol = df_i.sample(n = 150)
    sol = df_s.sample(n = 150)
    
    df = pd.concat([insol, sol])
    df = df.sample(frac=1)
    
    fprints = np.array(list(df['fingerprints']), dtype='int32')
    labels = np.array(list(df['Solubility']), dtype='int32')
    return fprints, labels

def validation_input(valid_set):
    features = np.array(list(valid_set['fingerprints']))
    labels = np.array(list(valid_set['Solubility']))
    return features, labels

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

def insol_accuracy(valid, labels_v):
    valid = [list(i) for i in valid]
    labels_v = [list(i) for i in labels_v]
    neg_matching = [count for count in range(len(valid)) if valid[count] == [1,0] and labels_v[count] == [1,0]]
    return len(neg_matching)/len([count for count in range(len(labels_v)) if labels_v[count] == [1,0]])*100

In [15]:
# Neural network time!

graph = tf.Graph()

n_inputs = len(df_training['fingerprints'][0])
layer1_nodes = 4100
layer2_nodes = 3000
layer3_nodes = 2300

batch_size = 300
learning_rate = .001

with graph.as_default():
    # Setting up tensorflow graph
    # Training data to be fed at runtime
    train_data = tf.placeholder(dtype=tf.float32, name='input_layer')
    train_labels = tf.placeholder(dtype=tf.float32, name='train_labels')

    
    # Weights
    layer1_weights = tf.Variable(tf.truncated_normal([n_inputs, layer1_nodes]), name='l1_weights')
    layer2_weights = tf.Variable(tf.truncated_normal([layer1_nodes, layer2_nodes]), name='l2_weights')
    layer4_weights = tf.Variable(tf.truncated_normal([layer2_nodes, 2]), name='l2_weights')
    
    # Logits
    logit1 = tf.matmul(train_data, layer1_weights, name='logit1')
    relu_layer1 = tf.nn.relu(logit1, name='relu_layer')
    dropout_1 = tf.nn.dropout(x=relu_layer1, keep_prob=.5, name='dropout')
    
    logit2 = tf.matmul(dropout_1, layer2_weights, name='logit1')
    relu_layer2 = tf.nn.relu(logit2, name='relu_layer')
    dropout_2 = tf.nn.dropout(x=relu_layer2, keep_prob=.5, name='dropout')
    
    logit4 = tf.matmul(dropout_2, layer4_weights, name='logit4')
    
    # Loss tf.nn.softmax_cross_entropy_with_logits_v2
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit4, labels=train_labels), name='loss')
    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(learning_rate, name='optimizer').minimize(loss)
    
    # Prediction
    train_prediction = tf.nn.softmax(logit4, name='predictor')

In [16]:
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # writer = tf.summary.FileWriter('.', session.graph)
    for step in range(10001):
        data, labels = batch(df_training_insol, df_training_sol)
        feed_dict = {train_data: data,train_labels: labels }
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        # print(predictions)
        # print(l)
        if step % 500 == 0:
            data_v, labels_v = validation_input(validation_set1)
            feed_dict = {train_data: data_v,train_labels: labels_v}
            _, valid = session.run([loss, train_prediction], feed_dict=feed_dict)
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.2f}".format(accuracy(predictions, labels)))
            print("Insoluble minibatch accuracy: {:.2f}".format(insol_accuracy(predictions, labels)))
            print("Insoluble validation accuracy: {:.2f}".format(insol_accuracy(valid, labels_v)))
            print("Minibatch validation accuracy: {:.2f}".format(accuracy(valid, labels_v)))
            print("")

Minibatch loss at step 0: 19084.19140625
Minibatch accuracy: 49.00
Insoluble minibatch accuracy: 35.33
Insoluble validation accuracy: 100.00
Minibatch validation accuracy: 5.36

Minibatch loss at step 500: 1013.7991943359375
Minibatch accuracy: 71.00
Insoluble minibatch accuracy: 84.00
Insoluble validation accuracy: 46.19
Minibatch validation accuracy: 79.00

Minibatch loss at step 1000: 549.18115234375
Minibatch accuracy: 63.67
Insoluble minibatch accuracy: 56.00
Insoluble validation accuracy: 73.71
Minibatch validation accuracy: 57.03

Minibatch loss at step 1500: 473.5751647949219
Minibatch accuracy: 67.33
Insoluble minibatch accuracy: 40.67
Insoluble validation accuracy: 84.28
Minibatch validation accuracy: 45.43

Minibatch loss at step 2000: 436.3830871582031
Minibatch accuracy: 62.33
Insoluble minibatch accuracy: 26.00
Insoluble validation accuracy: 91.89
Minibatch validation accuracy: 32.05

Minibatch loss at step 2500: 359.8451232910156
Minibatch accuracy: 63.67
Insoluble minib

KeyboardInterrupt: 