In [1]:
PATH = "../input/antioxidant-codes"
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
import sys
sys.path.insert(1, PATH)
from vocab import Vocab
from helper_functions import *
from dataset import *

In [2]:
vocab_size=23
batch_size=64
embedding_dim=256
rnn_units=256

In [3]:
df_General_data = pd.read_csv("../input/antioxidant-generation/general_peptide_data.csv")
vocabulary, ready_data = Vocab.create_vocab(np.array(df_General_data.Sequence.values))
df_train, df_test = train_test_split(ready_data, random_state=42, test_size=0.10)

In [4]:
vocab = vocabulary.vocab
print(vocab)

{'!': 1, 'G': 8, 'D': 5, 'V': 20, 'K': 11, 'F': 7, 'S': 18, 'L': 12, '%': 2, 'H': 9, 'N': 14, 'Y': 22, 'I': 10, 'E': 6, 'Q': 16, 'C': 4, 'R': 17, 'W': 21, 'P': 15, 'A': 3, 'T': 19, 'M': 13, '+': 0}


In [5]:
generate_data = GenerateData(vocab=vocab)

In [6]:
train_encode = generate_data.encode_data_lst(df_train)
test_encode = generate_data.encode_data_lst(df_test)


train_pading = generate_data.padding(train_encode)
test_padding = generate_data.padding(test_encode)


x_train, y_train = generate_data.split_for_data(train_pading)
x_test, y_test = generate_data.split_for_data(test_padding)

In [7]:
x_train[1], y_train[1]

(array([ 1,  3,  9, 18, 18,  3,  9, 18, 18, 18,  3,  7,  2,  0,  0,  0,  0,
         0,  0], dtype=int32),
 array([ 3,  9, 18, 18,  3,  9, 18, 18, 18,  3,  7,  2,  0,  0,  0,  0,  0,
         0,  0], dtype=int32))

In [None]:
x = tf.keras.Input(shape=(19,))
x2 = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True
)(x)
h1 = layers.GRU(rnn_units, return_sequences=True, stateful=False)(x2)
h2 = layers.GRU(rnn_units, return_sequences=True, stateful=False)(h1)

yhat = tf.keras.layers.Dense(vocab_size)(h2)
train_model = tf.keras.Model(inputs=x, outputs=yhat)

In [9]:
train_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 19)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 19, 256)           5888      
_________________________________________________________________
gru (GRU)                    (None, 19, 256)           394752    
_________________________________________________________________
gru_1 (GRU)                  (None, 19, 256)           394752    
_________________________________________________________________
dense (Dense)                (None, 19, 23)            5911      
Total params: 801,303
Trainable params: 801,303
Non-trainable params: 0
_________________________________________________________________


In [10]:
checkpoint_filepath = 'saved_models/checkpoint'
best_model = ModelCheckpoint(filepath = checkpoint_filepath, monitor = 'loss', save_best_only=True, save_weights_only = True, mode='min')

rlr = ReduceLROnPlateau(monitor='loss', factor=0.5,patience=5, min_lr=0.000001, verbose=1, min_delta=1e-5)

In [11]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
train_model.compile(tf.optimizers.Adam(), loss=loss)
result = train_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=250, callbacks = [best_model, rlr])

Epoch 1/250


2022-09-24 01:42:21.679302: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-09-24 01:42:27.726224: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250

Epoch 00055: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250

Epoch 00113: ReduceLROnPlateau reducing learning rate to 0.0001250000059