In [None]:
# Mount Google Drive into the Colab runtime; the project directory used by the
# following cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Enter the project root on the mounted Drive and install its dependencies.
# %pip (rather than !pip) makes sure the install targets this kernel's environment.
%cd /content/drive/MyDrive/enz-eff-project
%pip install -r requirements.txt

/content/drive/.shortcut-targets-by-id/1iS6gSWfUE3cZnmrNWbbV9_W_zQH3vaiu/enz-eff-project
Collecting pandas<1.6,>=1.4 (from -r requirements.txt (line 1))
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from -r requirements.txt (line 4))
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fair-esm (from -r requirements.txt (line 5))
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting pickle5 (from -r requirements.txt (line 10))
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━

In [None]:
# Pinned to the release this notebook was last run with, and installed through
# %pip so the package lands in the kernel's own environment.
%pip install tensorflow-determinism==0.4.0

Collecting tensorflow-determinism
  Downloading tensorflow_determinism-0.4.0-py3-none-any.whl (3.9 kB)
Installing collected packages: tensorflow-determinism
Successfully installed tensorflow-determinism-0.4.0


In [None]:
# Absolute path so that re-running this cell is idempotent (a relative %cd
# fails the second time because the working directory has already changed).
%cd /content/drive/MyDrive/enz-eff-project/improved_code/model_training

/content/drive/.shortcut-targets-by-id/1iS6gSWfUE3cZnmrNWbbV9_W_zQH3vaiu/enz-eff-project/improved_code/model_training


In [None]:
# Show the contents of the current working directory.
%ls

# CNN
Hyperparameter tuning, model training, and saving the best model

In [None]:
# ---------------------------------------------------------------------------
# Reproducibility setup and imports
# ---------------------------------------------------------------------------
# The CUDA / TensorFlow / OpenMP switches are exported first:
# CUDA_VISIBLE_DEVICES is only honoured if it is set before the GPU runtime is
# initialised, and the remaining flags are safest when they are already in the
# environment at the time NumPy and TensorFlow are imported.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to the desired GPU device
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['TF_DETERMINISTIC_OPS'] = '1'

SEED = 42  # single seed shared by every random number generator below

import random
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)

# Reduce randomness due to GPU execution: no TF32 math, no XLA JIT.
tf.config.experimental.enable_tensor_float_32_execution(False)
tf.config.optimizer.set_jit(False)
tf.config.experimental.list_physical_devices('GPU')

import warnings
warnings.filterwarnings("ignore")  # NOTE: hides every warning category

import keras
keras.utils.set_random_seed(SEED)
import pandas as pd
from os.path import join
# Rebinds the name `keras` to tf.keras for the remainder of the notebook.
from tensorflow import keras
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import save_model
from tensorflow.keras.initializers import glorot_normal
from scipy.optimize import minimize
# Project-local helpers (utils.py in the current working directory).
from utils import (get_processed_data,
                   create_model,
                   save_best_params,
                   is_not_require_params,
                   train_model,
                   calculate_weighted_mean,
                   evaluate_model,
                   get_model_preds,
                   delete_file,
                   empty_directory,
                   )

# All pre-computed kcat splits live in one directory, two levels above the
# current working directory.
SPLITS_DIR = join("..", "..", "data", "kcat_data", "splits")

# Train / test data frames.
data_train = pd.read_pickle(join(SPLITS_DIR, "train_df_kcat_new.pkl"))
data_test = pd.read_pickle(join(SPLITS_DIR, "test_df_kcat_new.pkl"))

# Cross-validation index arrays; allow_pickle is required by np.load for
# object arrays, so only load files from a trusted source.
train_indices = list(
    np.load(join(SPLITS_DIR, "CV_train_indices.npy"), allow_pickle=True)
)
test_indices = list(
    np.load(join(SPLITS_DIR, "CV_test_indices.npy"), allow_pickle=True)
)

# Displayed as the cell output: (rows, columns) of each split.
data_train.shape, data_test.shape

((3391, 30), (874, 30))

## ESM1b + DRFP

### Hyperparameter tuning

In [None]:
# --- Random search configuration: ESM1b + DRFP -----------------------------
BEST_R2 = 0                 # best rounded test R2 seen so far in this run
BEST_HYPER_PARAMS = None    # configuration that produced BEST_R2
BEST_MODEL = "../../models/hyperparam_tune_models/cnn_best_esm1b_drfp.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/cnn_hyperparam_esm1b_drfp.txt"

# Colab GPU sessions can be interrupted. START is the first index of the loop
# below and TOTAL_ITERATION (500) its exclusive upper bound.
# NOTE(review): BEST_R2 and the list of evaluated configurations are reset
# whenever this cell runs, so raising START shortens the loop but does not
# restore earlier progress - confirm that this is the intended way to resume.
START = 0
TOTAL_ITERATION = 500

# Discrete search space; one value per key is drawn at every iteration.
PARAM_SPACE = {
    "filters_1": list(range(2, 15, 2)),
    "filters_2": list(range(4, 25, 2)),
    "filters_3": list(range(8, 35, 2)),
    "kernel_size_1": list(range(3, 19, 2)),
    "kernel_size_2": list(range(5, 17, 2)),
    "kernel_size_3": list(range(7, 15, 2)),
    "dense_units_1": [64, 128, 256, 512],
    "dense_units_2": [8, 16, 32, 64, 128, 256],
    "dropout_rate": [0.10, 0.2, 0.3, 0.4, 0.5],
    "optimizer": ["nadam", "adam", "rmsprop"],
    "batch_size": [8, 16, 24, 32, 64, 128],
}


if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["DRFP", "ESM1b_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # Configurations already evaluated during this run.
    seen_params = []
    for iteration in range(START, TOTAL_ITERATION):
        print(f"Iteration-{iteration}...")

        # Draw one value per hyperparameter, in PARAM_SPACE order.
        params = {}
        for key, value in PARAM_SPACE.items():
            params[key] = np.random.choice(value)

        # Skip repeats and combinations rejected by is_not_require_params.
        if params in seen_params or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)

        # NOTE(review): the test split decides which configuration is kept,
        # so the resulting test score is optimistic - consider a validation
        # split instead.
        test_pred = model.predict(test_X).reshape(-1)
        score = round(r2_score(test_Y, test_pred), 2)

        if score > BEST_R2:
            BEST_R2, BEST_HYPER_PARAMS = score, params

            # Persist the new best configuration and its trained model.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        seen_params.append(params)


### Training

In [None]:
# --- Ensemble training: ESM1b + DRFP ---------------------------------------
MODEL_DIR = "../../models/train_models/"  # handed to get_model_preds
TOTAL_MODELS = 5                          # handed to get_model_preds

# Previously tried configurations (kept for reference, not used):
#   filters=(8, 22, 28), kernels=(13, 15, 11), dense=(512, 32),
#       dropout=0.2, optimizer="rmsprop", batch_size=16
#   filters=(8, 24, 28), kernels=(3, 13, 7), dense=(128, 8),
#       dropout=0.1, optimizer="rmsprop", batch_size=32

# Configuration used for the models trained below.
HYPER_PARAMS = {
    'filters_1': 4,
    'filters_2': 8,
    'filters_3': 24,
    'kernel_size_1': 11,
    'kernel_size_2': 7,
    'kernel_size_3': 11,
    'dense_units_1': 512,
    'dense_units_2': 256,
    'dropout_rate': 0.2,
    'optimizer': 'rmsprop',
    'batch_size': 8,
}

if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["DRFP", "ESM1b_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # CNN architecture for the chosen hyperparameters.
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)

    # Per-model test predictions.
    model_preds = get_model_preds(
        MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
        model, train_X, train_Y, test_X, test_Y,
    )

    # NOTE(review): calculate_weighted_mean receives test_Y; if the ensemble
    # weights are fitted on it, the ensemble score is optimistic - confirm.
    esm1b_drfp_weighted_pred = calculate_weighted_mean(model_preds, test_Y)
    weighted_avg_pred = esm1b_drfp_weighted_pred

    print(f"ensemble output: {evaluate_model(weighted_avg_pred, test_Y)}")


Model-1 results {'mse': 0.49, 'R2 score': 0.48, 'pearson coefficient': 0.7}
Model-2 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
Model-3 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
Model-4 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
Model-5 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
ensemble output: {'mse': 0.49, 'R2 score': 0.49, 'pearson coefficient': 0.7}


Model-1 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
28/28 [==============================] - 0s 2ms/step
Model-2 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
28/28 [==============================] - 0s 2ms/step
Model-3 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
28/28 [==============================] - 0s 2ms/step
Model-4 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
28/28 [==============================] - 0s 3ms/step
Model-5 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
ensemble output: {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}

## ESM1b + Difference FP

### Hyperparameter optimization

In [None]:
# --- Random search configuration: ESM1b + difference fingerprint -----------
BEST_R2 = 0                 # best rounded test R2 seen so far in this run
BEST_HYPER_PARAMS = None    # configuration that produced BEST_R2
BEST_MODEL = "../../models/hyperparam_tune_models/esm1b_diff.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/esm1b_diff.txt"

# Colab GPU sessions can be interrupted. START is the first index of the loop
# below and TOTAL_ITERATION (500) its exclusive upper bound.
# NOTE(review): BEST_R2 and the list of evaluated configurations are reset
# whenever this cell runs, so raising START shortens the loop but does not
# restore earlier progress - confirm that this is the intended way to resume.
START = 0
TOTAL_ITERATION = 500

# Discrete search space; one value per key is drawn at every iteration.
PARAM_SPACE = {
    "filters_1": list(range(2, 15, 2)),
    "filters_2": list(range(4, 25, 2)),
    "filters_3": list(range(8, 35, 2)),
    "kernel_size_1": list(range(3, 19, 2)),
    "kernel_size_2": list(range(5, 17, 2)),
    "kernel_size_3": list(range(7, 15, 2)),
    "dense_units_1": [64, 128, 256, 512],
    "dense_units_2": [8, 16, 32, 64, 128, 256],
    "dropout_rate": [0.10, 0.2, 0.3, 0.4, 0.5],
    "optimizer": ["nadam", "adam", "rmsprop"],
    "batch_size": [8, 16, 24, 32, 64, 128],
}


if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["difference_fp", "ESM1b_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # Configurations already evaluated during this run.
    seen_params = []
    for iteration in range(START, TOTAL_ITERATION):
        print(f"Iteration-{iteration}...")

        # Draw one value per hyperparameter, in PARAM_SPACE order.
        params = {}
        for key, value in PARAM_SPACE.items():
            params[key] = np.random.choice(value)

        # Skip repeats and combinations rejected by is_not_require_params.
        if params in seen_params or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)

        # NOTE(review): the test split decides which configuration is kept,
        # so the resulting test score is optimistic - consider a validation
        # split instead.
        test_pred = model.predict(test_X).reshape(-1)
        score = round(r2_score(test_Y, test_pred), 2)

        if score > BEST_R2:
            BEST_R2, BEST_HYPER_PARAMS = score, params

            # Persist the new best configuration and its trained model.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        seen_params.append(params)


### Training

In [None]:
# --- Ensemble training: ESM1b + difference fingerprint ---------------------
MODEL_DIR = "../../models/train_models/"  # handed to get_model_preds
TOTAL_MODELS = 5                          # handed to get_model_preds

# Previously tried configurations (kept for reference, not used):
#   filters=(6, 6, 16), kernels=(13, 5, 13), dense=(256, 8),
#       dropout=0.5, optimizer="nadam", batch_size=24
#   filters=(2, 8, 12), kernels=(17, 9, 13), dense=(512, 256),
#       dropout=0.1, optimizer="rmsprop", batch_size=8
#   filters=(6, 12, 26), kernels=(15, 5, 9), dense=(256, 32),
#       dropout=0.1, optimizer="nadam", batch_size=8

# Configuration used for the models trained below.
HYPER_PARAMS = {
    'filters_1': 10,
    'filters_2': 10,
    'filters_3': 10,
    'kernel_size_1': 5,
    'kernel_size_2': 11,
    'kernel_size_3': 11,
    'dense_units_1': 512,
    'dense_units_2': 16,
    'dropout_rate': 0.5,
    'optimizer': 'nadam',
    'batch_size': 64,
}

if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["difference_fp", "ESM1b_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # CNN architecture for the chosen hyperparameters.
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)

    # Per-model test predictions.
    model_preds = get_model_preds(
        MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
        model, train_X, train_Y, test_X, test_Y,
    )

    # NOTE(review): calculate_weighted_mean receives test_Y; if the ensemble
    # weights are fitted on it, the ensemble score is optimistic - confirm.
    esm1b_diff_weighted_pred = calculate_weighted_mean(model_preds, test_Y)
    weighted_avg_pred = esm1b_diff_weighted_pred

    print(f"ensemble output: {evaluate_model(weighted_avg_pred, test_Y)}")

Model-1 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
Model-2 results {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}
Model-3 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
Model-4 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
Model-5 results {'mse': 0.5, 'R2 score': 0.47, 'pearson coefficient': 0.69}
ensemble output: {'mse': 0.5, 'R2 score': 0.48, 'pearson coefficient': 0.69}


## ESM1b_ts + DRFP

### Hyperparameter optimization

In [None]:
# --- Random search configuration: ESM1b_ts + DRFP --------------------------
BEST_R2 = 0                 # best rounded test R2 seen so far in this run
BEST_HYPER_PARAMS = None    # configuration that produced BEST_R2
BEST_MODEL = "../../models/hyperparam_tune_models/esm1b_ts_drfp.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/esm1b_ts_drfp.txt"

# Colab GPU sessions can be interrupted. START is the first index of the loop
# below and TOTAL_ITERATION (500) its exclusive upper bound.
# NOTE(review): BEST_R2 and the list of evaluated configurations are reset
# whenever this cell runs, so raising START shortens the loop but does not
# restore earlier progress - confirm that this is the intended way to resume.
START = 0
TOTAL_ITERATION = 500

# Discrete search space; one value per key is drawn at every iteration.
PARAM_SPACE = {
    "filters_1": list(range(2, 15, 2)),
    "filters_2": list(range(4, 25, 2)),
    "filters_3": list(range(8, 35, 2)),
    "kernel_size_1": list(range(3, 19, 2)),
    "kernel_size_2": list(range(5, 17, 2)),
    "kernel_size_3": list(range(7, 15, 2)),
    "dense_units_1": [64, 128, 256, 512],
    "dense_units_2": [8, 16, 32, 64, 128, 256],
    "dropout_rate": [0.10, 0.2, 0.3, 0.4, 0.5],
    "optimizer": ["nadam", "adam", "rmsprop"],
    "batch_size": [8, 16, 24, 32, 64, 128],
}


if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["DRFP", "ESM1b_ts_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # Configurations already evaluated during this run.
    seen_params = []
    for iteration in range(START, TOTAL_ITERATION):
        print(f"Iteration-{iteration}...")

        # Draw one value per hyperparameter, in PARAM_SPACE order.
        params = {}
        for key, value in PARAM_SPACE.items():
            params[key] = np.random.choice(value)

        # Skip repeats and combinations rejected by is_not_require_params.
        if params in seen_params or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)

        # NOTE(review): the test split decides which configuration is kept,
        # so the resulting test score is optimistic - consider a validation
        # split instead.
        test_pred = model.predict(test_X).reshape(-1)
        score = round(r2_score(test_Y, test_pred), 2)

        if score > BEST_R2:
            BEST_R2, BEST_HYPER_PARAMS = score, params

            # Persist the new best configuration and its trained model.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        seen_params.append(params)


### Training

In [None]:
# --- Ensemble training: ESM1b_ts + DRFP ------------------------------------
# Output directory handed to get_model_preds. It has to be bound to MODEL_DIR
# (the name the call below reads) so that this cell also runs on a fresh
# kernel and does not overwrite the tuning checkpoint path in BEST_MODEL.
MODEL_DIR = "../../models/train_models/"
TOTAL_MODELS = 5  # handed to get_model_preds

# Previously tried configuration (kept for reference, not used):
#   filters=(4, 14, 16), kernels=(13, 9, 11), dense=(512, 128),
#       dropout=0.3, optimizer="rmsprop", batch_size=8

# Configuration used for the models trained below.
HYPER_PARAMS = {
    'filters_1': 12,
    'filters_2': 22,
    'filters_3': 28,
    'kernel_size_1': 5,
    'kernel_size_2': 5,
    'kernel_size_3': 9,
    'dense_units_1': 64,
    'dense_units_2': 8,
    'dropout_rate': 0.2,
    'optimizer': 'rmsprop',
    'batch_size': 128
    }

if __name__ == "__main__":
    # apply processing on train and test dataset
    train_X, train_Y = get_processed_data([data_train["DRFP"],
                                          data_train["ESM1b_ts_norm"]],
                                          data_train["log10_kcat_norm"])

    test_X, test_Y = get_processed_data([data_test["DRFP"],
                                        data_test["ESM1b_ts_norm"]],
                                        data_test["log10_kcat_norm"])

    n_timesteps, n_features = train_X.shape[1], train_X.shape[2]
    # getting the CNN model architecture
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)
    # train and get model preds
    model_preds = get_model_preds(MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
                                  model, train_X, train_Y, test_X, test_Y)
    # calculate weighted mean model predicts
    # NOTE(review): calculate_weighted_mean receives test_Y; if the ensemble
    # weights are fitted on it, the ensemble score is optimistic - confirm.
    weighted_avg_pred = calculate_weighted_mean(model_preds, test_Y)
    # output preds
    print(f"final output{evaluate_model(weighted_avg_pred, test_Y)}")

    esm1b_ts_drfp_weighted_pred = weighted_avg_pred


Model-1 results {'mse': 0.51, 'R2 score': 0.46, 'pearson coefficient': 0.68}
Model-2 results {'mse': 0.51, 'R2 score': 0.46, 'pearson coefficient': 0.68}
Model-3 results {'mse': 0.51, 'R2 score': 0.46, 'pearson coefficient': 0.68}
Model-4 results {'mse': 0.51, 'R2 score': 0.46, 'pearson coefficient': 0.68}
Model-5 results {'mse': 0.51, 'R2 score': 0.46, 'pearson coefficient': 0.68}
final output{'mse': 0.51, 'R2 score': 0.47, 'pearson coefficient': 0.68}


## ESM1b_ts + Difference FP

### Hyperparameter optimization

In [None]:
# --- Random search configuration: ESM1b_ts + difference fingerprint --------
BEST_R2 = 0                 # best rounded test R2 seen so far in this run
BEST_HYPER_PARAMS = None    # configuration that produced BEST_R2
# Not referenced anywhere else in this cell.
PROCESSED_PARAMS = "../../models/hyperparam_tune_models/processed_params_esm1b_ts_diff.txt"
BEST_MODEL = "../../models/hyperparam_tune_models/esm1b_ts_diff.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/esm1b_ts_diff.txt"

# Colab GPU sessions can be interrupted. START is the first index of the loop
# below and TOTAL_ITERATION (500) its exclusive upper bound.
# NOTE(review): BEST_R2 and the list of evaluated configurations are reset
# whenever this cell runs, so raising START shortens the loop but does not
# restore earlier progress - confirm that this is the intended way to resume.
START = 0
TOTAL_ITERATION = 500

# Discrete search space; one value per key is drawn at every iteration.
PARAM_SPACE = {
    "filters_1": list(range(2, 15, 2)),
    "filters_2": list(range(4, 25, 2)),
    "filters_3": list(range(8, 35, 2)),
    "kernel_size_1": list(range(3, 19, 2)),
    "kernel_size_2": list(range(5, 17, 2)),
    "kernel_size_3": list(range(7, 15, 2)),
    "dense_units_1": [64, 128, 256, 512],
    "dense_units_2": [8, 16, 32, 64, 128, 256],
    "dropout_rate": [0.10, 0.2, 0.3, 0.4, 0.5],
    "optimizer": ["nadam", "adam", "rmsprop"],
    "batch_size": [8, 16, 24, 32, 64, 128],
}


if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["difference_fp", "ESM1b_ts_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # Configurations already evaluated during this run.
    seen_params = []
    for iteration in range(START, TOTAL_ITERATION):
        print(f"Iteration-{iteration}...")

        # Draw one value per hyperparameter, in PARAM_SPACE order.
        params = {}
        for key, value in PARAM_SPACE.items():
            params[key] = np.random.choice(value)

        # Skip repeats and combinations rejected by is_not_require_params.
        if params in seen_params or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)

        # NOTE(review): the test split decides which configuration is kept,
        # so the resulting test score is optimistic - consider a validation
        # split instead.
        test_pred = model.predict(test_X).reshape(-1)
        score = round(r2_score(test_Y, test_pred), 2)

        if score > BEST_R2:
            BEST_R2, BEST_HYPER_PARAMS = score, params

            # Persist the new best configuration and its trained model.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        seen_params.append(params)


Iteration-0...
Iteration-1...
New best R2 score: 0.43
New Best hyperparameters: {'filters_1': 8, 'filters_2': 20, 'filters_3': 28, 'kernel_size_1': 11, 'kernel_size_2': 13, 'kernel_size_3': 13, 'dense_units_1': 64, 'dense_units_2': 8, 'dropout_rate': 0.1, 'optimizer': 'rmsprop', 'batch_size': 128}
Iteration-2...
Iteration-3...
Iteration-4...
Iteration-5...
New best R2 score: 0.44
New Best hyperparameters: {'filters_1': 10, 'filters_2': 20, 'filters_3': 20, 'kernel_size_1': 15, 'kernel_size_2': 9, 'kernel_size_3': 9, 'dense_units_1': 256, 'dense_units_2': 64, 'dropout_rate': 0.4, 'optimizer': 'rmsprop', 'batch_size': 128}
Iteration-6...
New best R2 score: 0.45
New Best hyperparameters: {'filters_1': 10, 'filters_2': 10, 'filters_3': 10, 'kernel_size_1': 5, 'kernel_size_2': 11, 'kernel_size_3': 11, 'dense_units_1': 512, 'dense_units_2': 16, 'dropout_rate': 0.5, 'optimizer': 'nadam', 'batch_size': 64}
Iteration-7...
Iteration-8...
New best R2 score: 0.46
New Best hyperparameters: {'filter

### Training

In [None]:
# --- Ensemble training: ESM1b_ts + difference fingerprint ------------------
MODEL_DIR = "../../models/train_models/"  # handed to get_model_preds
TOTAL_MODELS = 5                          # handed to get_model_preds

# Previously tried configuration (kept for reference, not used):
#   filters=(14, 14, 24), kernels=(3, 9, 9), dense=(512, 128),
#       dropout=0.1, optimizer="adam", batch_size=8
#       (a third dense layer, "dense_units_3": 16, was already disabled)

# Configuration used for the models trained below.
HYPER_PARAMS = {
    'filters_1': 12,
    'filters_2': 12,
    'filters_3': 14,
    'kernel_size_1': 13,
    'kernel_size_2': 15,
    'kernel_size_3': 9,
    'dense_units_1': 512,
    'dense_units_2': 8,
    'dropout_rate': 0.5,
    'optimizer': 'adam',
    'batch_size': 32,
}

if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["difference_fp", "ESM1b_ts_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # CNN architecture for the chosen hyperparameters.
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)

    # Per-model test predictions.
    model_preds = get_model_preds(
        MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
        model, train_X, train_Y, test_X, test_Y,
    )

    # NOTE(review): calculate_weighted_mean receives test_Y; if the ensemble
    # weights are fitted on it, the ensemble score is optimistic - confirm.
    esm1b_ts_diff_weighted_pred = calculate_weighted_mean(model_preds, test_Y)
    weighted_avg_pred = esm1b_ts_diff_weighted_pred

    print(f"final output{evaluate_model(weighted_avg_pred, test_Y)}")

Model-1 results {'mse': 0.56, 'R2 score': 0.41, 'pearson coefficient': 0.64}
Model-2 results {'mse': 0.52, 'R2 score': 0.46, 'pearson coefficient': 0.68}
Model-3 results {'mse': 0.52, 'R2 score': 0.45, 'pearson coefficient': 0.67}
Model-4 results {'mse': 0.52, 'R2 score': 0.45, 'pearson coefficient': 0.67}
Model-5 results {'mse': 0.52, 'R2 score': 0.45, 'pearson coefficient': 0.67}
final output{'mse': 0.51, 'R2 score': 0.47, 'pearson coefficient': 0.68}


## ESM1b + Structural FP

### Hyperparameter Tuning

In [None]:
# --- Random search configuration: ESM1b + structural fingerprint -----------
BEST_R2 = 0                 # best rounded test R2 seen so far in this run
BEST_HYPER_PARAMS = None    # configuration that produced BEST_R2
BEST_MODEL = "../../models/hyperparam_tune_models/esm1b_struct.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/esm1b_struct.txt"

# Colab GPU sessions can be interrupted. START is the first index of the loop
# below and TOTAL_ITERATION (500) its exclusive upper bound.
# NOTE(review): BEST_R2 and the list of evaluated configurations are reset
# whenever this cell runs, so raising START shortens the loop but does not
# restore earlier progress - confirm that this is the intended way to resume.
START = 0
TOTAL_ITERATION = 500

# Discrete search space; one value per key is drawn at every iteration.
PARAM_SPACE = {
    "filters_1": list(range(2, 15, 2)),
    "filters_2": list(range(4, 25, 2)),
    "filters_3": list(range(8, 35, 2)),
    "kernel_size_1": list(range(3, 19, 2)),
    "kernel_size_2": list(range(5, 17, 2)),
    "kernel_size_3": list(range(7, 15, 2)),
    "dense_units_1": [64, 128, 256, 512],
    "dense_units_2": [8, 16, 32, 64, 128, 256],
    "dropout_rate": [0.10, 0.2, 0.3, 0.4, 0.5],
    "optimizer": ["nadam", "adam", "rmsprop"],
    "batch_size": [8, 16, 24, 32, 64, 128],
}


if __name__ == "__main__":
    # apply processing on train and test dataset
    train_X, train_Y = get_processed_data([data_train["structural_fp"],
                                          data_train["ESM1b_norm"]],
                                          data_train["log10_kcat_norm"])

    # The feature columns are passed as ONE list followed by the target,
    # exactly as for the train split above.
    test_X, test_Y = get_processed_data([data_test["structural_fp"],
                                        data_test["ESM1b_norm"]],
                                        data_test["log10_kcat_norm"])

    n_timesteps, n_features = train_X.shape[1], train_X.shape[2]

    # Configurations already evaluated during this run.
    processed_params = []
    for iteration in range(START, TOTAL_ITERATION):
        print(f"Iteration-{iteration}...")
        # randomly select the params from params space
        params = {
            key: np.random.choice(value) for key, value in PARAM_SPACE.items()
            }

        # Skip repeats and combinations rejected by is_not_require_params.
        # (iteration is never below START inside range(START, ...), so no
        # extra index check is needed.)
        if (params in processed_params) or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)

        # NOTE(review): the test split decides which configuration is kept,
        # so the resulting test score is optimistic - consider a validation
        # split instead.
        y_pred = model.predict(test_X).reshape(-1)
        curr_r2 = round(r2_score(test_Y, y_pred), 2)

        if curr_r2 > BEST_R2:
            BEST_R2 = curr_r2
            BEST_HYPER_PARAMS = params

            # Persist the new best configuration and its trained model.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        processed_params.append(params)


### Training

In [None]:
# --- Ensemble training: ESM1b + structural fingerprint ---------------------
MODEL_DIR = "../../models/train_models/"  # handed to get_model_preds
TOTAL_MODELS = 5                          # handed to get_model_preds

# Previously tried configurations (kept for reference, not used):
#   filters=(6, 6, 22), kernels=(9, 15, 13), dense=(256, 128),
#       dropout=0.2, optimizer="adam", batch_size=128
#   filters=(6, 18, 22), kernels=(5, 9, 7), dense=(64, 8),
#       dropout=0.3, optimizer="nadam", batch_size=24

# Configuration used for the models trained below.
HYPER_PARAMS = {
    'filters_1': 8,
    'filters_2': 12,
    'filters_3': 20,
    'kernel_size_1': 9,
    'kernel_size_2': 15,
    'kernel_size_3': 7,
    'dense_units_1': 512,
    'dense_units_2': 64,
    'dropout_rate': 0.3,
    'optimizer': 'rmsprop',
    'batch_size': 32,
}

if __name__ == "__main__":
    # Both splits use the same input columns and regression target.
    feature_cols = ["structural_fp", "ESM1b_norm"]
    target_col = "log10_kcat_norm"

    train_X, train_Y = get_processed_data(
        [data_train[col] for col in feature_cols], data_train[target_col]
    )
    test_X, test_Y = get_processed_data(
        [data_test[col] for col in feature_cols], data_test[target_col]
    )

    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # CNN architecture for the chosen hyperparameters.
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)

    # Per-model test predictions.
    model_preds = get_model_preds(
        MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
        model, train_X, train_Y, test_X, test_Y,
    )

    # NOTE(review): calculate_weighted_mean receives test_Y; if the ensemble
    # weights are fitted on it, the ensemble score is optimistic - confirm.
    esm1b_struct_weighted_pred = calculate_weighted_mean(model_preds, test_Y)
    weighted_avg_pred = esm1b_struct_weighted_pred

    print(f"final output{evaluate_model(weighted_avg_pred, test_Y)}")


Model-1 results {'mse': 0.6, 'R2 score': 0.37, 'pearson coefficient': 0.61}
Model-2 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
Model-3 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
Model-4 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
Model-5 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
final output{'mse': 0.56, 'R2 score': 0.41, 'pearson coefficient': 0.64}


## ESM1b_ts + Structural FP

### Hyperparameter tuning

In [None]:
# Running best of the random search; both are overwritten by the search loop
# whenever an iteration scores a higher R2 than BEST_R2.
BEST_R2 = 0
BEST_HYPER_PARAMS = None

# File locations used by the ESM1b_ts + structural FP search.
PROCESSED_PARAMS = "../../models/hyperparam_tune_models/processed_params_esm1b_ts_struct.txt"
BEST_MODEL = "../../models/hyperparam_tune_models/esm1b_ts_struct.h5"
BEST_HYPER_PARAMS_FILE = "../../hyperparameters/esm1b_ts_struct.txt"

# Colab GPU sessions are time-limited, so the search may stop part-way.
# START is the iteration index to resume from (0 = run from the beginning).
START = 0
# Exclusive upper bound of the iteration index, i.e. 500 iterations in total.
TOTAL_ITERATION = 500

# Candidate values per hyperparameter; the search loop draws one value per key
# at random on every iteration.
PARAM_SPACE = dict(
    filters_1=[*range(2, 15, 2)],
    filters_2=[*range(4, 25, 2)],
    filters_3=[*range(8, 35, 2)],
    kernel_size_1=[*range(3, 19, 2)],
    kernel_size_2=[*range(5, 17, 2)],
    kernel_size_3=[*range(7, 15, 2)],
    dense_units_1=[64, 128, 256, 512],
    dense_units_2=[8, 16, 32, 64, 128, 256],
    dropout_rate=[0.10, 0.2, 0.3, 0.4, 0.5],
    optimizer=["nadam", "adam", "rmsprop"],
    batch_size=[8, 16, 24, 32, 64, 128],
)


if __name__ == "__main__":
    # Run the shared preprocessing on the `structural_fp` + `ESM1b_ts_norm`
    # columns, with `log10_kcat_norm` as the target, for each split.
    train_X, train_Y = get_processed_data([data_train["structural_fp"],
                                          data_train["ESM1b_ts_norm"]],
                                          data_train["log10_kcat_norm"])

    test_X, test_Y = get_processed_data([data_test["structural_fp"],
                                        data_test["ESM1b_ts_norm"]],
                                        data_test["log10_kcat_norm"])

    n_timesteps, n_features = train_X.shape[1], train_X.shape[2]

    # Parameter sets already trained in this run, so duplicates are skipped.
    processed_params = []

    # Walk the whole range (not range(START, ...)) so that the draws belonging
    # to iterations below START are still consumed on resume. Previously the
    # loop started at START, which made the `iteration < START` guard dead
    # code: a resumed run on a freshly seeded kernel re-drew the sequence from
    # its beginning instead of continuing where it stopped.
    # NOTE(review): the replay is only exact if nothing else consumes NumPy's
    # global RNG between draws - confirm for train_model/is_not_require_params.
    for iteration in range(TOTAL_ITERATION):
        # randomly select one value per hyperparameter from the search space
        params = {
            key: np.random.choice(value) for key, value in PARAM_SPACE.items()
            }

        # Resume support: skip (without training) everything before START.
        if iteration < START:
            continue

        print(f"Iteration-{iteration}...")

        # Skip duplicates and combinations rejected by is_not_require_params.
        if (params in processed_params) or is_not_require_params(params):
            continue

        model = create_model(n_timesteps, n_features, **params)
        model = train_model(model, train_X, train_Y, test_X, test_Y, params)
        y_pred = model.predict(test_X).reshape(-1)
        # R2 is rounded to two decimals before comparison, so improvements
        # smaller than 0.01 do not replace the current best.
        # NOTE(review): candidates are scored on the test split, so the best
        # R2 is selected on the same data it is reported on - consider a
        # separate validation split.
        curr_r2 = round(r2_score(test_Y, y_pred), 2)

        if curr_r2 > BEST_R2:
            BEST_R2 = curr_r2
            BEST_HYPER_PARAMS = params

            # delete_file(BEST_MODEL)
            # delete_file(BEST_HYPER_PARAMS_FILE)

            # NOTE(review): BEST_R2 starts from its configured value (0) on
            # every run, so after a resume the first model with R2 > 0 is
            # saved again, presumably replacing the earlier best - verify.
            save_best_params(BEST_HYPER_PARAMS_FILE, BEST_HYPER_PARAMS, BEST_R2)
            save_model(model, BEST_MODEL)
            print(f"New best R2 score: {BEST_R2}")
            print(f"New Best hyperparameters: {BEST_HYPER_PARAMS}")

        processed_params.append(params)


Iteration-0...
Iteration-1...
Iteration-2...
Iteration-3...
New best R2 score: 0.41
New Best hyperparameters: {'filters_1': 6, 'filters_2': 22, 'filters_3': 30, 'kernel_size_1': 9, 'kernel_size_2': 15, 'kernel_size_3': 9, 'dense_units_1': 256, 'dense_units_2': 256, 'dropout_rate': 0.3, 'optimizer': 'rmsprop', 'batch_size': 32}
Iteration-4...
Iteration-5...
Iteration-6...
Iteration-7...
Iteration-8...
Iteration-9...
Iteration-10...
Iteration-11...
New best R2 score: 0.43
New Best hyperparameters: {'filters_1': 6, 'filters_2': 8, 'filters_3': 8, 'kernel_size_1': 7, 'kernel_size_2': 13, 'kernel_size_3': 9, 'dense_units_1': 256, 'dense_units_2': 16, 'dropout_rate': 0.1, 'optimizer': 'rmsprop', 'batch_size': 8}
Iteration-12...
Iteration-13...
Iteration-14...
Iteration-15...
New best R2 score: 0.44
New Best hyperparameters: {'filters_1': 8, 'filters_2': 10, 'filters_3': 34, 'kernel_size_1': 11, 'kernel_size_2': 13, 'kernel_size_3': 11, 'dense_units_1': 256, 'dense_units_2': 128, 'dropout_rat

### Training

In [None]:
MODEL_DIR = "../../models/train_models/"
TOTAL_MODELS = 5

# Earlier hyperparameter sets, kept disabled for reference:
#   1) filters_1/2/3 = 14/24/24, kernel_size_1/2/3 = 15/7/9,
#      dense_units_1/2 = 512/128, dropout_rate = 0.3,
#      optimizer = rmsprop, batch_size = 8
#   2) filters_1/2/3 = 14/16/28, kernel_size_1/2/3 = 13/11/13,
#      dense_units_1/2 = 512/8, dropout_rate = 0.3,
#      optimizer = rmsprop, batch_size = 24

# Active hyperparameter set: unpacked into create_model(...) and also handed
# to get_model_preds(...) by the training run that follows in this cell.
HYPER_PARAMS = dict(
    filters_1=4,
    filters_2=14,
    filters_3=16,
    kernel_size_1=13,
    kernel_size_2=9,
    kernel_size_3=11,
    dense_units_1=512,
    dense_units_2=128,
    dropout_rate=0.3,
    optimizer='rmsprop',
    batch_size=8,
)

if __name__ == "__main__":
    # Run the shared preprocessing on the `structural_fp` + `ESM1b_ts_norm`
    # columns, with `log10_kcat_norm` as the target, for each split.
    esm1b_ts_struct_train_inputs = [data_train["structural_fp"],
                                    data_train["ESM1b_ts_norm"]]
    train_X, train_Y = get_processed_data(esm1b_ts_struct_train_inputs,
                                          data_train["log10_kcat_norm"])

    esm1b_ts_struct_test_inputs = [data_test["structural_fp"],
                                   data_test["ESM1b_ts_norm"]]
    test_X, test_Y = get_processed_data(esm1b_ts_struct_test_inputs,
                                        data_test["log10_kcat_norm"])

    # The model's input dimensions come from axes 1 and 2 of the training array.
    n_timesteps = train_X.shape[1]
    n_features = train_X.shape[2]

    # Build the CNN from the active hyperparameters, then let
    # get_model_preds produce the predictions of the TOTAL_MODELS models.
    model = create_model(n_timesteps, n_features, **HYPER_PARAMS)
    model_preds = get_model_preds(MODEL_DIR, HYPER_PARAMS, TOTAL_MODELS,
                                  model, train_X, train_Y, test_X, test_Y)

    # Collapse the per-model predictions into one weighted mean and report it.
    # NOTE(review): test_Y is passed to calculate_weighted_mean, so the weights
    # appear to be derived from test labels - confirm this is intended.
    weighted_avg_pred = calculate_weighted_mean(model_preds, test_Y)
    print(f"ensemble output: {evaluate_model(weighted_avg_pred, test_Y)}")

    # Stored under a dedicated name so the final ensemble cell can read it.
    esm1b_ts_struct_weighted_pred = weighted_avg_pred


Model-1 results {'mse': 0.56, 'R2 score': 0.42, 'pearson coefficient': 0.64}
Model-2 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.64}
Model-3 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
Model-4 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
Model-5 results {'mse': 0.57, 'R2 score': 0.4, 'pearson coefficient': 0.63}
ensemble output: {'mse': 0.56, 'R2 score': 0.42, 'pearson coefficient': 0.64}


# Ensemble

In [None]:
# Weighted-average predictions of the six feature combinations, one entry per
# training section of this notebook. Every name must already exist in the
# kernel, i.e. all six training cells have to be run before this one.
ensemble_member_preds = (
    esm1b_drfp_weighted_pred,
    esm1b_diff_weighted_pred,
    esm1b_ts_drfp_weighted_pred,
    esm1b_ts_diff_weighted_pred,
    esm1b_struct_weighted_pred,
    esm1b_ts_struct_weighted_pred,
)
combined_weighted_preds = np.array(ensemble_member_preds)

# NOTE(review): test_Y is not defined in this cell; it is whatever the most
# recently executed training/tuning cell assigned - confirm every member was
# predicted against the same test targets.
weighted_avg_pred = calculate_weighted_mean(combined_weighted_preds, test_Y)
print(f"ensemble output: {evaluate_model(weighted_avg_pred, test_Y)}")

ensemble output: {'mse': 0.44, 'R2 score': 0.53, 'pearson coefficient': 0.73}


ensemble output: {'mse': 0.44, 'R2 score': 0.54, 'pearson coefficient': 0.73}
