In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf


from sklearn.model_selection import train_test_split

### Encoder preparation

In [None]:
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/', '~',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@',
                'A', 'B', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 'd',
                't', 'u']

In [None]:
smi2index = dict( (c,i) for i,c in enumerate( SMILES_CHARS ) )
index2smi = dict( (i,c) for i,c in enumerate( SMILES_CHARS ) )

def smiles_encoder( smiles, maxlen=3000 ):
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X

def smiles_decoder( X ):
    smi = ''
    X = X.argmax( axis=-1 )
    for i in X:
        smi += index2smi[ i ]
    return smi

### Working with df

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Course_work/notebooks/data/df_smiles.csv')
df.drop(columns = ['Unnamed: 0'], inplace = True)
df.head(6)

Unnamed: 0,pdbcode,affinity,pocket_smiles
0,3zzf,0.4,CC(C)C[C@@H](C=O)NC(=O)[C@H](CO)NC(=O)[C@@H](N...
1,3gww,0.45,CC(C)C[C@H](N)C(=O)N[C@H](C=O)Cc1ccc(O)cc1.CC(...
2,1w8l,0.49,CC(C)C[C@@H](C=O)NC(=O)[C@@H](N)Cc1c[nH]c2cccc...
3,3fqa,0.49,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](C)...
4,1zsb,0.6,CC(C)C[C@@H](C=O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@...
5,6h9v,0.66,CC[C@H](C)[C@H](N)C(=O)N[C@@H](CCC(N)=O)C(=O)N...


### Train validation split

In [1]:
y = df.drop(columns = ['pocket_smiles']).copy()
x = df.drop(columns = ['affinity']).copy()

NameError: ignored

In [None]:
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.7)
#x_val, x_test, y_val, y_test = train_test_split(x_rem, y_rem, test_size=0.33)

In [None]:
partition = {'train': x_train.pdbcode.values,
             'val': x_val.pdbcode.values}
labels = {}
for code in df.pdbcode:
  labels.update({code: df[df.pdbcode == code].affinity.values[0]})

### Data Generator

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(59,3000), shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            smi = df[df.pdbcode == ID].pocket_smiles.values[0]
            X[i,] = smiles_encoder(smi)

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [None]:
training_generator = DataGenerator(partition['train'], labels)
validation_generator = DataGenerator(partition['val'], labels)

### Model

In [None]:
model_alexnet = keras.models.Sequential([
    keras.layers.Conv1D(filters=96, kernel_size=(11), strides=(4), activation='relu', input_shape=(59,3000)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool1D(pool_size=(3), strides=(2)),
    keras.layers.Conv1D(filters=256, kernel_size=(5), strides=(1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool1D(pool_size=(3), strides=(2)),
    keras.layers.Conv1D(filters=384, kernel_size=(3), strides=(1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=384, kernel_size=(3), strides=(1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv1D(filters=256, kernel_size=(3), strides=(1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool1D(pool_size=(3), strides=(2)),
    keras.layers.Flatten(),
    keras.layers.Dense(4096, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(4096, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='softmax')
])

ValueError: ignored