In [1]:
# Load testdatabase into memory, ignore duplicate rows
# Import more data for better results
import sqlite3
import numpy as np
import math
from sklearn.model_selection import train_test_split
# Read every distinct (standard_inchi, protein_sequence, standard_value) row
# into memory. The try/finally guarantees the connection is released even when
# the query fails (for example, if the table is missing from the file).
conn = sqlite3.connect('testdatabase.sqlite3')
try:
    c = conn.cursor()
    c.execute('SELECT DISTINCT standard_inchi,protein_sequence,standard_value FROM small_interactions')
    d = c.fetchall()  # list of 3-tuples, one per distinct row
finally:
    conn.close()

In [90]:
# Encode every database row:
#   * InChI and protein sequence -> list of character code points (ord)
#   * standard_value             -> integer class label
# Labels: 1 if value < 50, 2 if < 100, 3 if < 200, 4 if < 500, otherwise 0.
# (An earlier variant used a single binary split at 200; the thresholds below
# are the knob to adjust for better results.)
IC50_BINS = ((50, 1), (100, 2), (200, 3), (500, 4))  # (exclusive upper bound, label)

din = []  # inchis: one list of code points per row
dpr = []  # proteins: one list of code points per row
dca = []  # categories: one integer label per row
for inchi, protein, value in d:
    din.append([ord(ch) for ch in inchi])
    dpr.append([ord(ch) for ch in protein])
    # First bin whose bound exceeds the value wins; anything >= 500 falls to 0.
    dca.append(next((label for bound, label in IC50_BINS if value < bound), 0))

In [91]:
# Right-pad every encoded row with zeros to one shared width: the longest row
# found in either list (InChIs or proteins). The two lists are concatenated
# column-wise in a later cell, and this keeps every row the same length.
# NOTE(review): if the two inputs stop being concatenated, pad each list to
# its own maximum instead of the shared one.
maxlen = max((len(seq) for seq in din + dpr), default=0)
for seq in din + dpr:
    # Extends the existing inner list in place; a non-positive count adds nothing.
    seq.extend([0] * (maxlen - len(seq)))

# Numpy arrays for Keras.
nin = np.array(din, dtype=float)  # INCHIS   (samples x character codes)
npr = np.array(dpr, dtype=float)  # PROTEINS (samples x character codes)
nca = np.array(dca, dtype=float)  # class label per sample (0..4)

In [92]:
# Scale character codes to floats in 0..1 by dividing each array by its own
# largest element. InChIs and proteins are scaled independently.
def _scale_by_peak(arr):
    """Divide ``arr`` in place by its largest element.

    Empty arrays, and arrays whose largest element is not positive, are left
    untouched so that all-padding input never causes a division by zero.
    Returns None because the array is modified in place.
    """
    if arr.size:
        peak = arr.max()  # one vectorised pass instead of a Python max() per row
        if peak > 0:
            arr /= peak


_scale_by_peak(nin)
_scale_by_peak(npr)

In [5]:
# Import Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.utils import np_utils

Using TensorFlow backend.


In [93]:
# Create Train and Test inputs for Keras.
# Each sample is the scaled InChI codes followed by the scaled protein codes
# (simple concatenation as a first try).
X = np.concatenate((nin, npr), axis=1)

# Derive the one-hot width from the FULL label vector. Encoding each split on
# its own would size the matrix from that split's largest label, so a split
# that happens to miss the highest class would come out one column short.
num_classes = int(nca.max()) + 1

X_train, X_test, y_train, y_test = train_test_split(X, nca, test_size=0.2, random_state=61)

# Conv1D expects (samples, steps, channels); add a single channel axis.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# One-hot encode the labels, as required by the categorical cross-entropy loss.
y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)

In [94]:
# Build the CNN:
#   Conv1D (48 filters, kernel size 12, relu)
#   -> Dropout(0.2) -> Flatten
#   -> Dense(128, relu)
#   -> Dense(num_classes, softmax)
# Things to experiment with:
#   * number of Conv1D layers, their filter count and kernel size
#   * adding MaxPooling1D after a convolution (and its pool size)
#   * number and size of Dense layers
model = Sequential([
    Conv1D(48, 12, input_shape=(X_train.shape[1], 1), activation='relu'),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Compile model: multi-class cross-entropy on the one-hot labels.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [95]:
# Train the network. The held-out split is passed as validation data, so Keras
# reports loss/accuracy on it after every epoch.
epochs = 3
batch_size = 128
model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 15201 samples, validate on 3801 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a4c4bf6a0>

In [96]:
# Generate class probabilities for all out-of-training-sample reactions.
# The model's last layer is a softmax over num_classes, so each row of
# `predictions` holds one probability per category for the matching X_test row.
# Note: X_test is the same split that was passed as validation_data during fit.
predictions = model.predict(X_test)

In [118]:
# Show the predicted class probabilities next to the one-hot truth for a
# single row of the test split.
x = 13  # row index into predictions / y_test -- change this to inspect another sample
np.set_printoptions(precision=3, suppress=True)  # 3 decimals, no scientific notation
print("Prediction Probabilities = ", predictions[x])
print("Actual Result = ", y_test[x])


Prediction Probabilities =  [0.42  0.455 0.048 0.031 0.047]
Actual Result =  [0. 1. 0. 0. 0.]


In [122]:
# Analyze each result category: compare the total predicted probability mass
# per category with the number of test samples that actually belong to it.
# Column sums are computed in one vectorised pass (accumulated in float64)
# rather than with nested Python loops, and use their own names so the sample
# index `x` from the previous cell is not overwritten.
prob_sums = predictions.sum(axis=0, dtype=np.float64)
actual_sums = y_test.sum(axis=0, dtype=np.float64)
for cat in range(len(prob_sums)):
    print("Category ",cat,": Sum(Probabilities) = ",prob_sums[cat],"  Sum(Results) = ",actual_sums[cat])

Category  0 : Sum(Probabilities) =  2943.8086913377047   Sum(Results) =  2905.0
Category  1 : Sum(Probabilities) =  517.6655498892387   Sum(Results) =  497.0
Category  2 : Sum(Probabilities) =  103.20330834126253   Sum(Results) =  103.0
Category  3 : Sum(Probabilities) =  77.2075070230203   Sum(Results) =  113.0
Category  4 : Sum(Probabilities) =  159.1149429625366   Sum(Results) =  183.0
