In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.metrics import AUC, Accuracy, Precision
from keras.models import Model
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.metrics import Precision
from tensorflow.keras.models import Model
from helper_functions import get_model_name
from vocab import Vocab

In [None]:
# Load the raw table and keep only the peptides used for classification.
train_data = pd.read_csv("/data/Classification_data.csv")
Seq = []
labels = []

# Labelling rule (rows matching neither case are skipped, e.g. Chelator == 1
# together with FRS == 0):
#   FRS == 1 and Chelator in {0, 1}  -> label 1
#   FRS == 0 and Chelator == 0       -> label 0
# A labelled row is kept only when its sequence has 2 to 20 characters.
for chelator, frs, peptide in zip(train_data["Chelator"],
                                  train_data["FRS"],
                                  train_data["Sequence"]):
    if frs == 1 and (chelator == 0 or chelator == 1):
        label = 1
    elif frs == 0 and chelator == 0:
        label = 0
    else:
        continue

    if 1 < len(peptide) <= 20:
        Seq.append(peptide)
        labels.append(label)


In [None]:
# Convert the collected sequences/labels to NumPy arrays and shuffle both with
# the same permutation; the fixed random_state makes the order repeatable.
Xdata, Ydata = shuffle(np.array(Seq), np.array(labels), random_state=42)

In [None]:
# Character-level tokenizer: no text standardization, one token per character,
# integer token ids as output.
text_vectorizer = TextVectorization(standardize=None, split="character", output_mode="int")

# Build the vocabulary from every retained sequence.
text_vectorizer.adapt(Xdata)


In [None]:
def model(vocab_size=22, embedding_dim=256, conv_filters=128, kernel_size=3):
    """Build the (uncompiled) character-level CNN binary classifier.

    Pipeline: raw string -> ``text_vectorizer`` (module-level, must already be
    adapted) -> Embedding -> Conv1D -> global max pooling -> layer
    normalization -> single sigmoid unit.

    Args:
        vocab_size: ``input_dim`` of the embedding layer. It must be at least
            ``text_vectorizer.vocabulary_size()`` so that every token id the
            vectorizer can emit has an embedding row.
        embedding_dim: Size of each character embedding vector.
        conv_filters: Number of Conv1D filters.
        kernel_size: Width of the Conv1D window, in characters.

    Returns:
        A Keras ``Model`` mapping a batch of strings to a probability in
        [0, 1]. The caller is responsible for compiling it.
    """
    # Re-seed on every call so each model starts from the same seed state.
    tf.random.set_seed(42)

    # Named `raw_sequences` rather than `input` to avoid shadowing the builtin.
    raw_sequences = layers.Input(shape=(1,), dtype="string")
    token_ids = text_vectorizer(raw_sequences)

    embedded = layers.Embedding(vocab_size, embedding_dim)(token_ids)
    conv_features = layers.Conv1D(conv_filters, kernel_size)(embedded)
    pooled = layers.GlobalMaxPooling1D()(conv_features)
    normalized = tf.keras.layers.LayerNormalization(axis=-1)(pooled)

    yhat = layers.Dense(1, activation="sigmoid")(normalized)
    return Model(inputs=raw_sequences, outputs=yhat)


In [None]:
# K-fold cross-validation: train a fresh model per fold, checkpoint the best
# epoch, then score that checkpoint on the held-out split.
epochs = 80
batch_size = 10

name = "crossval"
n_splits = 5
save_dir = 'Classification_model/'  # checkpoint directory, relative to the working directory
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Held-out scores, one entry per fold, appended in fold order.
accuracy = []
auc = []
precision = []

for fold_no, (train, test) in enumerate(kfold.split(Xdata, Ydata), start=1):
    train_model = model()
    checkpoint_path = save_dir + get_model_name(fold_no, name)

    # AUC is a higher-is-better score, so the plateau direction is set
    # explicitly. With mode='auto' the Keras 2 implementation only maximizes
    # monitors whose name contains 'acc' and would treat 'val_auc' as a loss.
    rlr = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.5, patience=5,
                            min_lr=0.000001, verbose=1, min_delta=1e-5)

    # Explicit metric names keep the log keys ('val_auc', 'val_precision')
    # stable even if auto-numbered names (auc_1, ...) would otherwise appear,
    # e.g. when a previous run was interrupted before clear_session().
    train_model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.0003, decay=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), Precision(name='precision')],
    )
    print(f'training fold number {fold_no}')

    # Keep only the epoch with the best validation accuracy.
    # NOTE(review): the checkpoint is selected on the same held-out fold that
    # is scored below, so the reported per-fold metrics are optimistic.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    monitor='val_accuracy', verbose=1,
                                                    save_best_only=True, mode='max')

    callbacks_list = [checkpoint, rlr]

    history = train_model.fit(Xdata[train], Ydata[train], epochs=epochs,
                              validation_data=(Xdata[test], Ydata[test]),
                              batch_size=batch_size,
                              callbacks=callbacks_list)

    # Restore the best checkpoint before scoring the held-out split.
    train_model.load_weights(checkpoint_path)
    scores = train_model.evaluate(Xdata[test], Ydata[test], verbose=0)
    # evaluate() returns [loss, accuracy, auc, precision] in compile order.
    accuracy.append(scores[1])
    auc.append(scores[2])
    precision.append(scores[3])

    # Release graph/layer state so the next fold starts from a clean session.
    tf.keras.backend.clear_session()

In [None]:
# Report the held-out metrics of every fold as percentages.
print("score per fold:")
for fold, (fold_acc, fold_auc, fold_prec) in enumerate(zip(accuracy, auc, precision), start=1):
    print(f'Accuracy in fold {fold}: {fold_acc*100}, \nAUC in fold {fold}: {fold_auc*100},\n Precision in fold{fold}: {fold_prec*100}')

score per fold:
Accuracy in fold 1: 77.8181791305542, 
AUC in fold 1: 83.04196000099182,
 Precision in fold1: 79.71014380455017
Accuracy in fold 2: 77.0909070968628, 
AUC in fold 2: 83.80423188209534,
 Precision in fold2: 74.52229261398315
Accuracy in fold 3: 77.0909070968628, 
AUC in fold 3: 83.02387595176697,
 Precision in fold3: 76.37795209884644
Accuracy in fold 4: 74.45255517959595, 
AUC in fold 4: 81.0380756855011,
 Precision in fold4: 77.66990065574646
Accuracy in fold 5: 75.91241002082825, 
AUC in fold 5: 82.31560587882996,
 Precision in fold5: 74.40000176429749


In [None]:
# Mean of each metric across folds, as a percentage. The divisor follows the
# number of recorded folds instead of a hard-coded 5, so the averages stay
# correct if n_splits is changed.
print(f"Average Accuracy: {sum(accuracy)/len(accuracy) *100},\nAverage AUC: {sum(auc)/len(auc) *100},\n Average Precision: {sum(precision)/len(precision) *100}")

Average Accuracy: 76.4729917049408,
Average AUC: 82.64474987983704,
 Average Precision: 76.53605818748474


In [None]:
# Matthews correlation coefficient (MCC) of each saved fold model on its own
# held-out split.

# Must equal the number of folds used for training; the KFold below reuses the
# same shuffle/random_state so the splits line up with the saved models.
n_folds = 5

# Load the models into a list.
# The loop variable is called `fold_model` (not `model`) so the `model()`
# builder function defined earlier in the notebook is not overwritten and the
# training cell can still be re-run afterwards.
# NOTE(review): this absolute path differs from the relative
# 'Classification_model/' directory the training cell writes to - confirm
# where the checkpoints actually live.
# NOTE(review): only predict() is used below, so loading with compile=False
# would presumably make custom_objects unnecessary - confirm before changing.
models = []
for fold_no in range(1, n_folds + 1):
    model_path = f'/Classification_model/model_crossval{fold_no}.tf'
    fold_model = tf.keras.models.load_model(
        model_path,
        custom_objects={'Precision': Precision, 'AUC': AUC, 'accuracy': Accuracy},
    )
    models.append(fold_model)

# Perform k-fold cross-validation
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
mcc = []

for fold_index, (train_indices, test_indices) in enumerate(kfold.split(Xdata, Ydata)):
    # Only the held-out split is needed for scoring.
    X_test = Xdata[test_indices]
    y_test = Ydata[test_indices]

    # Make predictions using the model trained for the current fold
    predictions = models[fold_index].predict(X_test)

    # Threshold the sigmoid outputs at 0.5 and flatten to one label per sample
    binary_predictions = (predictions >= 0.5).astype(int).reshape(predictions.shape[0])

    # Calculate the Matthews correlation coefficient
    mcc_fold = metrics.matthews_corrcoef(y_test, binary_predictions)
    mcc.append(mcc_fold)

    print(f"MCC for Fold {fold_index + 1}: {mcc_fold}")

# Calculate the average MCC
average_mcc = sum(mcc) / len(mcc)
print(f"Average MCC: {average_mcc}")


Exception ignored in: <function _xla_gc_callback at 0x7b30857eacb0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


MCC for Fold 1: 0.5566673715036466
MCC for Fold 2: 0.5448356265763565
MCC for Fold 3: 0.5400313689369748
MCC for Fold 4: 0.4874453575400174
MCC for Fold 5: 0.5152038765644379
Average MCC: 0.5288367202242866
