In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import os
from tensorflow import keras
from math import floor
from sklearn.metrics import confusion_matrix

from tf_data_inputs import undersampled_data, entire_df_input_fprint, random_sample

In [None]:
# Setup: RNG seed, notebook display limits, descriptor file locations and
# dataset split fractions.

# Seed numpy's global RNG and keep DataFrame previews short.
np.random.seed(2)
pd.options.display.max_columns = 6
pd.options.display.max_rows = 14

# Directory holding the descriptor pickles, relative to PATH.
PATH = '.'
descriptors = 'Descriptors'
fing_path = os.path.join(PATH, descriptors)

# File names of the Morgan fingerprint pickles.
morg_1024_path = 'morgan_1024_df.p'
morg_2048_path = 'morgan_2048_df.p'

# Full path of each descriptor pickle.
morg_1024_bit = os.path.join(fing_path, morg_1024_path)
morg_2048_bit = os.path.join(fing_path, morg_2048_path)
maccs = os.path.join(fing_path, 'maccs_df.p')

# Split fractions.
# NOTE(review): these add up to 1.2, so train + validation already cover the
# whole frame and nothing is left for a test split -- confirm the intent.
train_frac, validation_frac, test_frac = .8, .2, .2

In [None]:
current_descriptor = 'maccs'
second_desc = None
sampling_method = 'random'

# Current options are fingerprint or combined
descriptor = 'fingerprints'
descriptor2 = 'combined'


def _load_descriptor_df(name):
    """Unpickle ``<name>_df.p`` from the descriptor directory (``fing_path``).

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only point this at trusted, locally generated files.
    """
    # os.path.join instead of a hard-coded '\\' so the path also resolves on
    # non-Windows systems; the ``with`` block closes the file handle.
    with open(os.path.join(fing_path, '%s_df.p' % name), 'rb') as handle:
        return pickle.load(handle)


# load descriptor
df = _load_descriptor_df(current_descriptor)

if second_desc:
    # Element-wise '+' of the two columns, stored back into the primary column.
    df2 = _load_descriptor_df(second_desc)
    df[descriptor] = df[descriptor] + df2[descriptor2]

# Randomize dataframe, drop incomplete rows and renumber from 0
df = df.sample(frac = 1).dropna().reset_index(drop=True)
df_len = len(df)

# Row positions delimiting the training split and the first validation split
train_end = floor(df_len*train_frac)
validation_end = train_end + floor(df_len*validation_frac)

if sampling_method == 'undersample':
    # Tuples make the cells comparable with '==' in the class filters below
    df[descriptor] = df[descriptor].apply(tuple)
    df['Solubility'] = df['Solubility'].apply(tuple)
    # Training split, additionally separated by class label
    df_training = df[:train_end]
    df_training_insol = df_training[df_training['Solubility']==(1,0)]
    df_training_sol = df_training[df_training['Solubility']==(0,1)]

# The validation split is taken once, for every sampling method. `df` is
# deliberately NOT deleted in the undersample branch any more: this statement
# and later cells still read it, so the old `del df` caused a NameError.
validation_set1 = df[train_end:validation_end]

In [None]:
# print(len([c for c, i in enumerate(df_training['Solubility']) if i == (1,0)])
# /len([c for c, i in enumerate(df_training['Solubility']) if i == (0,1)])*100,'% insoluble')
# print(len([c for c, i in enumerate(df_training['Solubility']) if i == (1,0)]), '# insoluble')
# print(len([c for c, i in enumerate(df_training['Solubility']) if i == (0,1)]), '# soluble')

In [None]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max column agrees.

    Args:
        predictions: 2-D array-like of per-class scores, one row per sample.
        labels: 2-D array with the same row count; must expose ``.shape``.

    Returns:
        100 * (number of rows where argmax(predictions) == argmax(labels))
        divided by the number of label rows.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    n_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * n_correct / labels.shape[0]

def insol_accuracy(valid, labels_v):
    """Percentage of ``[1, 0]`` label rows whose prediction row is also ``[1, 0]``.

    Args:
        valid: sequence of two-element prediction rows (compared exactly
            against ``[1, 0]``, so they must already be one-hot).
        labels_v: sequence of two-element label rows.

    Returns:
        matches / number of ``[1, 0]`` label rows * 100.

    Raises:
        ZeroDivisionError: if no label row equals ``[1, 0]``.
    """
    insoluble = [1, 0]
    predicted_rows = list(map(list, valid))
    actual_rows = list(map(list, labels_v))

    # Rows where prediction and label are both the insoluble one-hot vector.
    hits = sum(
        1
        for idx, row in enumerate(predicted_rows)
        if row == insoluble and actual_rows[idx] == insoluble
    )
    total_insoluble = actual_rows.count(insoluble)
    return hits / total_insoluble * 100

def perf_measure(y_hat, y_actual):
    """Count confusion-matrix cells for two-column predictions and labels.

    Column 1 (one-hot ``[0, 1]``) is the positive class and column 0
    (``[1, 0]``) the negative class. Each row is reduced to its arg-max
    column, so ``y_hat`` may hold one-hot rows or raw class probabilities.
    The previous exact list comparison never matched probability rows.

    Fixes the earlier FP/FN mix-up: a negative label predicted as positive is
    a false positive, and a positive label predicted as negative is a false
    negative.

    Args:
        y_hat: indexable sequence of two-element prediction rows.
        y_actual: indexable sequence of two-element label rows, at least as
            long as ``y_hat``.

    Returns:
        Tuple ``(TP, FP, TN, FN)`` of ints.
    """
    TP = FP = TN = FN = 0
    for i in range(len(y_hat)):
        predicted_positive = np.argmax(y_hat[i]) == 1
        actual_positive = np.argmax(y_actual[i]) == 1
        if predicted_positive and actual_positive:
            TP += 1
        elif predicted_positive:
            # predicted positive, actually negative
            FP += 1
        elif actual_positive:
            # predicted negative, actually positive
            FN += 1
        else:
            TN += 1
    return TP, FP, TN, FN

In [None]:
graph = tf.Graph()

# Network input width = length of one descriptor vector. `.iloc[0]` takes the
# first row by position, so it does not depend on an index label 0 existing.
if sampling_method == 'undersample':
    n_inputs = len(df_training[descriptor].iloc[0])
else:
    n_inputs = len(df[descriptor].iloc[0])

# Hidden layer widths (layer3_nodes is currently not wired into the graph)
layer1_nodes = 4100
layer2_nodes = 3000
layer3_nodes = 2300

# Optimisation hyper-parameters
batch_size = 300
learning_rate = .01
l1 = 0
l2 = .01
# Per-class loss weights, ordered like the label columns
training_weights = [2,.09]


with graph.as_default():
    # Setting up tensorflow graph
    # Inputs fed at runtime. Static shapes let TensorFlow reject mis-shaped
    # feeds at the placeholder instead of deeper in the graph.
    train_data = tf.placeholder(dtype=tf.float32, shape=[None, n_inputs], name='input_layer')
    train_labels = tf.placeholder(dtype=tf.float32, shape=[None, 2], name='train_labels')

    # Weights (the layers have no bias terms)
    layer1_weights = tf.Variable(tf.truncated_normal([n_inputs, layer1_nodes]), name='l1_weights')
    layer2_weights = tf.Variable(tf.truncated_normal([layer1_nodes, layer2_nodes]), name='l2_weights')
    # Previously also named 'l2_weights', which clashed with the layer above
    layer4_weights = tf.Variable(tf.truncated_normal([layer2_nodes, 2]), name='l4_weights')

    # Logits
    logit1 = tf.matmul(train_data, layer1_weights, name='logit1')
    relu_layer1 = tf.nn.relu(logit1, name='relu_layer1')
    # dropout_1 = tf.nn.dropout(x=relu_layer1, keep_prob=.5, name='dropout')

    logit2 = tf.matmul(relu_layer1, layer2_weights, name='logit2')
    relu_layer2 = tf.nn.relu(logit2, name='relu_layer2')
    # dropout_2 = tf.nn.dropout(x=relu_layer2, keep_prob=.5, name='dropout')

    logit4 = tf.matmul(relu_layer2, layer4_weights, name='logit4')

    # Class weighting is applied to the per-example loss, not to the logits.
    # Scaling the logits made the optimised distribution softmax(logit4 * w)
    # differ from the reported prediction softmax(logit4).
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logit4, labels=train_labels)
    # With one-hot labels this picks the weight of each example's true class
    example_weights = tf.reduce_sum(train_labels * training_weights, axis=1)
    loss = per_example_loss * example_weights

    total_loss = tf.reduce_mean(loss, name='loss')

    optimizer = tf.train.FtrlOptimizer(
        learning_rate,
        l2_regularization_strength=l2,
        l1_regularization_strength=l1,
        name='optimizer'
    ).minimize(total_loss)

    # Prediction
    # _dummy is kept because the training cell fetches it alongside other ops
    _dummy = tf.constant(5)
    train_prediction = tf.nn.softmax(logit4, name='predictor')

In [None]:
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    # writer = tf.summary.FileWriter('.', session.graph)

    # Draw training batches from the training split only. validation_set1 is
    # cut from the rows that follow it, so sampling from all of `df` would
    # train on the very rows used for validation.
    training_rows = df[:floor(df_len*train_frac)]

    # The validation inputs do not change between steps, so build them once.
    # NOTE(review): assumes entire_df_input_fprint has no per-call randomness
    # -- confirm against tf_data_inputs.
    valid_data, valid_labels = entire_df_input_fprint(validation_set1, descriptor)
    valid_feed = {train_data: valid_data, train_labels: valid_labels}

    for step in range(10001):
        data, labels = random_sample(training_rows, descriptor)
        session.run(optimizer, feed_dict={train_data: data, train_labels: labels})

        # Report validation metrics every 100 steps
        if step % 100 == 0:
            predictions, valid_loss = session.run([train_prediction, total_loss], feed_dict=valid_feed)
            # perf_measure compares rows against one-hot vectors, so convert
            # the softmax probabilities to hard one-hot predictions first.
            hard_predictions = np.eye(2)[np.argmax(predictions, 1)]
            print(perf_measure(hard_predictions, valid_labels))
            print('accuracy at step {}: {:.2f}'.format(step, accuracy(predictions, valid_labels)))
            print('loss at step {}: {:.2f}'.format(step,valid_loss))
            print('')
        

In [None]:
# Scratch check: evaluate tf.multiply of a 2x2 matrix with a length-2 vector
# and print the result.
with tf.Session(graph=graph) as session:
    broadcast_product = tf.multiply([[1,2],[4,5]],[2,2])
    print(session.run(broadcast_product))