In [1]:
# Directory that is added to the module search path below so that the
# project-local modules (vocab, helper_functions, dataset) can be imported.
PATH = "../input/antioxidant-codes"
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# NOTE(review): callbacks come from the standalone `keras` package while the
# model is built with `tf.keras` — confirm both resolve to the same
# implementation in the installed TensorFlow version.
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import sys
# Must run before the project-local imports that follow.
sys.path.insert(1, PATH)
from vocab import Vocab
# NOTE(review): the star imports hide where later names such as GenerateData
# are defined (presumably one of these two modules — verify).
from helper_functions import *
from dataset import *

In [2]:
# Model / training hyperparameters.
# vocab_size sizes both the embedding table and the output layer; the
# vocabulary printed further down has 23 entries (ids 0..22).
vocab_size = 23
# NOTE(review): the fit() call later in this notebook passes its own
# batch_size and does not read this value.
batch_size = 64
# Embedding width and GRU hidden size share the same value.
embedding_dim = rnn_units = 256

In [3]:
# Antioxidant peptides are the fine-tuning data; the general peptide table is
# only used here to build the vocabulary.
df_oxid_data = pd.read_csv(
    "../input/antioxidant-generation/df_antiox_peptide_data.csv"
)
df_General_data = pd.read_csv(
    "../input/antioxidant-generation/general_peptide_data.csv"
)

# The second value returned by create_vocab is not needed in this notebook.
vocabulary, _ = Vocab.create_vocab(
    np.array(df_General_data["Sequence"].values)
)

# Project helper that prepares the raw antioxidant sequences
# (exact transformation is defined in the vocab module — not visible here).
ready_data = Vocab.creat_data(df_oxid_data["Sequence"].values)

# Fixed seed so the 90/10 train/test split is reproducible.
df_train, df_test = train_test_split(
    ready_data,
    test_size=0.10,
    random_state=42,
)

In [4]:
# Token -> integer-id mapping held by the Vocab object, printed for reference.
# NOTE(review): judging from the encoded sample shown later, '!' (1) looks like
# a start marker, '%' (2) an end marker and '+' (0) the padding id — confirm
# against the Vocab implementation.
vocab = vocabulary.vocab
print(vocab)

{'!': 1, 'G': 8, 'D': 5, 'V': 20, 'K': 11, 'F': 7, 'S': 18, 'L': 12, '%': 2, 'H': 9, 'N': 14, 'Y': 22, 'I': 10, 'E': 6, 'Q': 16, 'C': 4, 'R': 17, 'W': 21, 'P': 15, 'A': 3, 'T': 19, 'M': 13, '+': 0}


In [5]:
# Helper object, built from the token->id mapping, that the following cells
# use to encode, pad and split the sequences.
generate_data = GenerateData(vocab=vocab)

In [6]:
# Step 1: encode the train/test sequences with the GenerateData helper
# (presumably characters -> integer ids using `vocab` — verify in the helper).
train_encode = generate_data.encode_data_lst(df_train)
test_encode = generate_data.encode_data_lst(df_test)


# Step 2: pad the encoded sequences. The sample displayed in the next cell is
# right-padded with zeros.
# NOTE(review): `train_pading` (sic) is spelled differently from `test_padding`.
train_pading = generate_data.padding(train_encode)
test_padding = generate_data.padding(test_encode)


# Step 3: split each padded array into model inputs (x) and targets (y).
# In the sample displayed in the next cell, y is x shifted one position to the
# left, i.e. next-token targets.
x_train, y_train = generate_data.split_for_data(train_pading)
x_test, y_test = generate_data.split_for_data(test_padding)

In [7]:
# Sanity check: show one (input, target) pair from the training split.
x_train[1], y_train[1]

(array([ 1, 12, 21,  6,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int32),
 array([12, 21,  6,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=int32))

In [8]:
# Length of one padded input sequence; the model's Input layer must match it.
x_train[0].shape

(19,)

In [9]:
# Rebuild the architecture of the pre-trained generator (embedding -> GRU ->
# GRU -> per-timestep logits over the vocabulary) and load its weights so it
# can be fine-tuned on the antioxidant data.

# Take the sequence length from the prepared data instead of hard-coding it,
# so the Input layer cannot drift out of sync with the padding step.
seq_len = int(x_train[0].shape[0])

x = tf.keras.Input(shape=(seq_len,))
# mask_zero=True: id 0 is treated as padding and masked in downstream layers.
x2 = layers.Embedding(
    input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True
)(x)
h1 = layers.GRU(rnn_units, return_sequences=True, stateful=False)(x2)
h2 = layers.GRU(rnn_units, return_sequences=True, stateful=False)(h1)

# No activation: the model outputs raw logits for every timestep.
yhat = layers.Dense(vocab_size)(h2)
Generateing_train_model = tf.keras.Model(inputs=x, outputs=yhat)

# TF-format checkpoints are matched to the model by object structure. Keep the
# returned status and verify that every weight-bearing layer actually received
# a value, so a mismatch fails here instead of fine-tuning from random weights.
load_status = Generateing_train_model.load_weights("../input/gru-base-weights/checkpoint")
load_status.assert_existing_objects_matched()

2022-09-24 07:36:20.931813: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-24 07:36:20.940982: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-24 07:36:20.941711: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-24 07:36:20.943182: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbe20036ad0>

In [10]:
# Show layer output shapes and parameter counts of the freshly loaded model.
Generateing_train_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 19)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 19, 256)           5888      
_________________________________________________________________
gru (GRU)                    (None, 19, 256)           394752    
_________________________________________________________________
gru_1 (GRU)                  (None, 19, 256)           394752    
_________________________________________________________________
dense (Dense)                (None, 19, 23)            5911      
Total params: 801,303
Trainable params: 801,303
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Freeze the first three layers (input, embedding and first GRU in the summary
# above) so that only the second GRU and the Dense head are fine-tuned.
for frozen_layer in Generateing_train_model.layers[:3]:
    frozen_layer.trainable = False

# Print the summary again to confirm the trainable / non-trainable split.
Generateing_train_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 19)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 19, 256)           5888      
_________________________________________________________________
gru (GRU)                    (None, 19, 256)           394752    
_________________________________________________________________
gru_1 (GRU)                  (None, 19, 256)           394752    
_________________________________________________________________
dense (Dense)                (None, 19, 23)            5911      
Total params: 801,303
Trainable params: 400,663
Non-trainable params: 400,640
_________________________________________________________________


In [12]:
# Training callbacks. All three watch the validation loss so that checkpoint
# selection, learning-rate decay and stopping agree on what "best" means.
checkpoint_filepath = 'GRU_TL/checkpoint'

# Keep only the weights of the epoch with the lowest validation loss.
# (Monitoring the training loss instead would keep the most-overfit weights.)
best_model = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    mode='min',
)

# Halve the learning rate after 5 epochs without a val_loss improvement of at
# least 1e-5, never going below 1e-6.
rlr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=0.000001,
    verbose=1,
    min_delta=1e-5,
)

# Stop after 6 epochs without a val_loss improvement of at least 0.002.
# restore_best_weights stays False: the best weights are already written to
# checkpoint_filepath by best_model.
early_stop = EarlyStopping(
    monitor="val_loss",
    min_delta=0.002,
    patience=6,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
)

In [13]:
# from_logits=True because the model's final Dense layer has no activation.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile after the layers were frozen so the trainable flags take effect.
Generateing_train_model.compile(tf.optimizers.Adam(), loss=loss)

# epochs is only an upper bound: early_stop ends training once val_loss stops
# improving. Without it in the callback list the run always executes all
# 10000 epochs.
# NOTE(review): batch_size is hard-coded to 10 here and does not use the
# `batch_size` constant from the configuration cell — confirm which is intended.
result = Generateing_train_model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10000,
    callbacks=[best_model, rlr, early_stop],
    batch_size=10,
)

Epoch 1/10000


2022-09-24 07:36:23.651465: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-09-24 07:36:29.903201: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoch 68/10000
Epo