In [None]:
! pip install lime
! pip install tensorflow
! pip install keras

In [None]:
import numpy as np
import tensorflow as tf
from timeit import default_timer as timer
from lime import lime_tabular
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, AveragePooling1D, MaxPooling1D, TimeDistributed, LeakyReLU, BatchNormalization, Flatten
from keras import optimizers, callbacks
from lime.lime_text import LimeTextExplainer
import sklearn.feature_extraction
import re
#from keras.regularizers import l2

In [None]:
sequence_len = 700
total_features = 57
amino_acid_residues = 21
num_classes_orig = 8
cnn_width = 17


def get_dataset(path="dataset/cullpdb+profile_6133.npy"):
    dataset = np.load(path)
    dataset = np.reshape(dataset, (dataset.shape[0], sequence_len, total_features))
    ret = np.zeros((dataset.shape[0], dataset.shape[1], amino_acid_residues + num_classes_orig))
    ret[:, :, :amino_acid_residues] = dataset[:, :, 35:56]
    ret[:, :, amino_acid_residues:] = dataset[:, :, amino_acid_residues + 1:amino_acid_residues+ 1 + num_classes_orig]
    return ret


def split_dataset(dataset, seed=None):
    np.random.seed(seed)
    np.random.shuffle(dataset)
    train_split = int(dataset.shape[0]*0.8)
    test_val_split = int(dataset.shape[0]*0.1)
    train = dataset[:train_split, :, :]
    test = dataset[train_split:train_split+test_val_split, :, :]
    validation = dataset[train_split+test_val_split:, :, :]

    return train, test, validation


def reshape_data(X, y):
    # Reshape X
    padding = np.zeros((X.shape[0], X.shape[2], int(cnn_width/2)))
    X = np.dstack((padding, np.swapaxes(X, 1, 2), padding))
    X = np.swapaxes(X, 1, 2)
    res = np.zeros((X.shape[0], X.shape[1] - cnn_width + 1, cnn_width, amino_acid_residues))
    for i in range(X.shape[1] - cnn_width + 1):
        res[:, i, :, :] = X[:, i:i+cnn_width, :]
    res = np.reshape(res, (X.shape[0]*(X.shape[1] - cnn_width + 1), cnn_width, amino_acid_residues))
    X = res[np.count_nonzero(res, axis=(1, 2))>(int(cnn_width/2)*amino_acid_residues), :, :]

    # Reshape y
    y = np.reshape(y, (y.shape[0] * y.shape[1], y.shape[2]))
    y = y[~np.all(y == 0, axis=1)]

    return X, y


def get_helix_labels(labels):

    relevant_labels = labels[:, 3:6]
    reshaped_labels = np.zeros(relevant_labels.shape[0])
    reshaped_labels = np.any(relevant_labels, axis=1, out=reshaped_labels)

    return reshaped_labels


def get_alpha_helix_labels(labels):
    return labels[:, 5]


def get_dataset_reshaped(seed=100):
    dataset = get_dataset('dataset/cullpdb+profile_6133_filtered.npy')
    train, test, validation = split_dataset(dataset, seed)

    X_train, y_train = train[:, :, :amino_acid_residues], train[:, :, amino_acid_residues:]
    X_test, y_test = test[:, :, :amino_acid_residues], test[:, :, amino_acid_residues:]
    X_val, y_val = validation[:, :, :amino_acid_residues], validation[:, :, amino_acid_residues:]

    # Reshape data using the window width
    X_train, y_train = reshape_data(X_train, y_train)
    X_test, y_test = reshape_data(X_test, y_test)
    X_val, y_val = reshape_data(X_val, y_val)

    y_train = get_helix_labels(y_train)
    y_test = get_helix_labels(y_test)
    y_val = get_helix_labels(y_val)

    return X_train, y_train, X_test, y_test, X_val, y_val


In [None]:
X_train, y_train, X_test, y_test, X_val, y_val = get_dataset_reshaped(seed=100)

In [None]:
do_summary = False

LR = 0.0009  # maybe after some (10-15) epochs reduce it to 0.0008-0.0007
drop_out = 0.38
batch_dim = 64
nn_epochs = 1

loss = 'binary_crossentropy'
opt = optimizers.Adam(lr=LR)

early_stop = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='min')


filepath = "our_best_model.hdf5"
checkpoint = callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')


def CNN_model():
    m = Sequential()
    m.add(Conv1D(128, 5, padding='same', activation='relu', input_shape=(cnn_width, amino_acid_residues)))
    m.add(BatchNormalization())
    # m.add(MaxPooling1D(pool_size=2))
    m.add(Dropout(drop_out))
    m.add(Conv1D(128, 3, padding='same', activation='relu'))
    m.add(BatchNormalization())
    # m.add(MaxPooling1D(pool_size=2))
    m.add(Dropout(drop_out))
    m.add(Conv1D(64, 3, padding='same', activation='relu'))
    m.add(BatchNormalization())
    # m.add(MaxPooling1D(pool_size=2))
    m.add(Dropout(drop_out))
    # m.add(Conv1D(32, 3, padding='same', activation='relu'))
    # m.add(BatchNormalization())
    # m.add(MaxPooling1D(pool_size=2))
    # m.add(Dropout(drop_out))
    m.add(Flatten())
    m.add(Dense(128, activation='relu'))
    m.add(Dense(32, activation='relu'))
    m.add(Dense(8, activation='relu'))
    m.add(Dense(1, activation='sigmoid'))

    m.compile(optimizer=opt,
              loss=loss,
              metrics=['accuracy', 'mae'])

    if do_summary:
        print("\nHyper Parameters\n")
        print("Learning Rate: " + str(LR))
        print("Drop out: " + str(drop_out))
        print("Batch dim: " + str(batch_dim))
        print("Number of epochs: " + str(nn_epochs))
        print("\nLoss: " + loss + "\n")

        m.summary()

    return m

In [None]:
net = CNN_model()

In [None]:
start_time = timer()

history = net.fit(X_train, y_train, epochs=nn_epochs, batch_size=batch_dim, shuffle=True,
                        validation_data=(X_val, y_val), callbacks=[checkpoint])

end_time = timer()
print("\n\nTime elapsed training the model: " + "{0:.2f}".format((end_time - start_time)) + " s")

In [None]:
AA_LABELS = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', '-']

ohe_text_map = {}
def one_hot_decoder(item):
    one_hot_labels = AA_LABELS
    sequence = ''
    for aa in item:
        max_label = (0, 0)
        for i, val in enumerate(aa):
            if val > max_label[0]:
                max_label = (val, i)
        if max_label[0] != 0:
            sequence += one_hot_labels[max_label[1]]
    ohe_text_map[sequence] = item
    return sequence

def one_hot_decoder2(item):
    sequence = ''
    for aa in item:
        if np.amax(aa) != 0:
            index = np.argmax(aa)%21
            print('Index: ', index)
            sequence += AA_LABELS[index]
            print(sequence)
    return sequence

In [None]:
def tokenize_aa_data(dataset, vectorizer=sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b')):
    separated_dataset = []
    for item in dataset:
        separated_string = ' '.join(item)
        separated_string = re.sub(r'(\\ .)', r"\\\1", separated_string)
        separated_dataset.append(separated_string)
    return vectorizer.fit_transform(separated_dataset)

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=True, token_pattern=r'(?u)\b\w+\b')

X_train_text = [one_hot_decoder(x) for x in X_train]
X_test_text = [one_hot_decoder(x) for x in X_test]
train_vectors = tokenize_aa_data(X_train_text, vectorizer)
test_vectors = tokenize_aa_data(X_test_text, vectorizer)

In [None]:
#Y_train_text = [one_hot_decoder(y, x=False) for y in y_train]
#Y_train_text = one_hot_decoder(y_train, x=False)
#Y_test_text = one_hot_decoder(y_test, x=False)

#Y_test_text = [one_hot_decoder(y, x=False) for y in y_test]

In [35]:
def one_hot_encode(item):
    # Item received should be a single amino-acid sequence
    try:
        encoded_item = []
        length_of_padding= 17-len(item)
        for c in item:
            encoded_char = np.zeros(21)
            encoded_char[AA_LABELS.index(c)] = 1
            encoded_item.append(encoded_char)
        for i in range(length_of_padding):
            encoded_item.append(np.zeros(21))
        encoded_item = np.concatenate(encoded_item, axis=0 )
        encoded_item = encoded_item.reshape(1,17,21)
        return encoded_item
    except:
        print(item)
        
def predict_text_to_ohe(items):
    #item_ohe = ohe_text_map[item[0]]
    return net.predict([one_hot_encode(item) for item in items])
    #return net.predict(X_test)[idx]

#print(one_hot_encode(one_hot_decoder(X_test[8])))

In [36]:
def split_characters(item):
    separated_string = ' '.join(item)
    separated_string = re.sub(r'(\\ .)', r"\\\1", separated_string)
    print(separated_string)
    return separated_string.split()

In [37]:
idx=5

In [38]:
predict_text_to_ohe(X_test_text[idx])

ValueError: in user code:

    File "C:\Python310\lib\site-packages\keras\engine\training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "C:\Python310\lib\site-packages\keras\engine\training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Python310\lib\site-packages\keras\engine\training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Python310\lib\site-packages\keras\engine\training.py", line 2079, in predict_step
        return self(x, training=False)
    File "C:\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Python310\lib\site-packages\keras\engine\input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "sequential" expects 1 input(s), but it received 14 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:3' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:4' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:5' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:6' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:7' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:8' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:9' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:10' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:11' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:12' shape=(None, 17, 21) dtype=float32>, <tf.Tensor 'IteratorGetNext:13' shape=(None, 17, 21) dtype=float32>]


In [41]:
explainer = LimeTextExplainer(
    17,
    #X_train_text,
    #training_labels = y_train,
    #feature_names = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', '-'], 
    split_expression=split_characters,
    #bow being false replaces the removed feature with 'UNKWORDZ' instead of removing it,
    #It should be false because character order matters for us, but this breaks our encoder. We should use 'X' instead of 'UNKWORDZ'
    #bow=False,
    class_names=['Helix', 'Not helix'],
    char_level=true)

#array = one_hot_encode('AKHESLSHFDAMWF')
#print(array)
#new_arry = np.concatenate(array, axis=0 )
#print(new_arry.reshape(17,21))

#print(one_hot_encode('AKHESLSHFDAMWF'))
#print(one_hot_encode(X_test_text[idx])[0])
#print(X_test_text[idx])
#print(X_test[0])
#predict_text_to_ohe('AKHESLSHFDAMWF')

#predict_text_to_ohe(X_test_text[idx])
exp = explainer.explain_instance(X_test_text[idx], predict_text_to_ohe, num_features=17) #, labels=(1,))# labels=[0, 17])

#exp.show_in_notebook()
#print('Document id: %d' % idx)
#print('Predicted class =', net.predict(X_test[idx]))
#print('True class: ', y_test[idx])

#print('Explanation for class %s' % class_names[0])
#print('\n'.join(map(str, exp.as_list(label=0))))

#exp = explainer.explain_instance(X_test_text[idx], c.predict_proba, num_features=6, top_labels=2)
#print(exp.available_labels())
#exp.show_in_notebook(text=False)

#exp.show_in_notebook(text=X_test_text[idx], labels=(0,))
#exp.show_in_notebook()

NameError: name 'true' is not defined

In [None]:
print(one_hot_encode('AKHESLSHFDAMWF'))
'''
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
'''

print(X_test_text[idx])
'''
AKHESLSHFDAMWF
'''

print(one_hot_decoder(X_test[idx]))

'''
AKHESLSHFDAMWF
'''
print(X_test.shape)

#print(one_hot_decoder2(21*[X_test[8]]))

'''
DDDDDDDDDDDDDDDDDDDDD
'''

#predict_text_to_ohe('AKHESLSHFDAMWF')

#print(predict_text_to_ohe)

#print(one_hot_decoder2(X_test[8]))
#print(predict_text_to_ohe(one_hot_decoder2(X_test[8])))

#print(np.amax(X_test[8][0]))
#(X_test[8][0])

#predict_text_to_ohe(

#one_hot_encode('AKHESLSHFDAMWFEFN').

In [None]:
explainer = lime_tabular.RecurrentTabularExplainer(
    X_train, training_labels = y_train,
    feature_names = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', '-'],
    class_names = ['Helix', 'Not helix'])

In [None]:
print('Yeah, got the first part!')
exp1 = explainer.explain_instance(X_test[10], net.predict, num_features=10, labels=(0,))
exp1.show_in_notebook()

In [None]:
exp2 = explainer.explain_instance(X_test[10], net.predict, num_features=10, labels=(0,))

In [None]:
exp2.show_in_notebook(text=X_test[idx], labels=(0,))