In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
%cd /content/drive/MyDrive/enz-eff-project
!pip install -r requirements.txt

/content/drive/.shortcut-targets-by-id/1iS6gSWfUE3cZnmrNWbbV9_W_zQH3vaiu/enz-eff-project


In [21]:
%cd improved_code/model_training

/content/drive/.shortcut-targets-by-id/1iS6gSWfUE3cZnmrNWbbV9_W_zQH3vaiu/enz-eff-project/improved_code/model_training


# CNN
Hyperparameter tuning +  Model training + saving best Model

In [22]:
import random
import numpy as np
import tensorflow as tf


# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

from os.path import join
import pandas as pd

from sklearn.metrics import r2_score
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import load_model, save_model
from tensorflow.keras import regularizers, initializers, optimizers, models, layers
from tensorflow.keras.losses import MSE
from tensorflow.keras.activations import relu
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow import keras
from scipy.optimize import minimize


data_train = pd.read_pickle(
    join("..", "..", "data", "kcat_data", "splits", "train_df_kcat_new.pkl")
)
data_test = pd.read_pickle(
    join("..", "..", "data", "kcat_data", "splits", "test_df_kcat_new.pkl")
)


## ESM1b + DRFP

In [23]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1b_drfp_r2_48.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["DRFP"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["DRFP"]))
    test_X = np.concatenate([test_X, np.array(list(data_test["ESM1b_norm"]))], axis=1)
    test_Y = np.array(list(data_test["log10_kcat_norm"]))

    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1b_drfp_pred_Y = model.predict(test_X)
    evaluate_model(esm1b_drfp_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.49158225407714273, 'R2 score': 0.48304191371349237, 'pearson coefficient': 0.6950121680326844}


## ESM1b + Difference

In [24]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1b_diff_r2_47.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["difference_fp"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["difference_fp"]))
    test_X = np.concatenate([test_X, np.array(list(data_test["ESM1b_norm"]))], axis=1)
    test_Y = np.array(list(data_test["log10_kcat_norm"]))

    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1b_diff_pred_Y = model.predict(test_X)
    evaluate_model(esm1b_diff_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.5009479778263921, 'R2 score': 0.47319272451688577, 'pearson coefficient': 0.6878900526369645}


## ESM1b_ts + DRFP

In [25]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1bts_norm_drfp_r2_478.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["DRFP"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_ts_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["DRFP"]))
    test_X = np.concatenate(
        [test_X, np.array(list(data_test["ESM1b_ts_norm"]))], axis=1
    )
    test_Y = np.array(list(data_test["log10_kcat_norm"]))

    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1bts_drfp_pred_Y = model.predict(test_X)
    evaluate_model(esm1bts_drfp_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.49625629003819605, 'R2 score': 0.4781266006288153, 'pearson coefficient': 0.6914669917131369}


## ESM1bts + Difference FP

In [26]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1bts_norm_diff_r2_48.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["difference_fp"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_ts_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["difference_fp"]))
    test_X = np.concatenate(
        [test_X, np.array(list(data_test["ESM1b_ts_norm"]))], axis=1
    )
    test_Y = np.array(list(data_test["log10_kcat_norm"]))

    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1bts_diff_pred_Y = model.predict(test_X)
    evaluate_model(esm1bts_diff_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.4928141962415819, 'R2 score': 0.4817463778017257, 'pearson coefficient': 0.6940795183563089}


## ESM1b + Structural FP

In [27]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1b_struct_r2_46.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["structural_fp"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["structural_fp"]))
    test_X = np.concatenate([test_X, np.array(list(data_test["ESM1b_norm"]))], axis=1)
    test_Y = np.array(list(data_test["log10_kcat_norm"]))
    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1b_struct_pred_Y = model.predict(test_X)
    evaluate_model(esm1b_struct_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.5110446376248611, 'R2 score': 0.46257486782248347, 'pearson coefficient': 0.6801285671271892}


## ESM1b_ts + Structural FP

In [28]:
BEST_MODEL = "../../models/best_models/cnn/cnn_esm1b_struct_r2_457.h5"

def get_input_dims(train_X):
    sample_size = train_X.shape[0]  # number of samples in train set
    time_steps = train_X.shape[1]  # number of features in train set
    input_dimension = 1  # each feature is represented by 1 number
    train_data_reshaped = train_X.reshape(sample_size, time_steps, input_dimension)

    return train_data_reshaped


def evaluate_model(pred_Y, test_Y):
    mse = mean_squared_error(test_Y, pred_Y)
    r2 = r2_score(test_Y,  pred_Y)
    output = {"mse": mse, "R2 score": r2, "pearson coefficient": np.sqrt(r2)}
    print("Output: ", output)

if __name__ == "__main__":
    # create input matrices:
    train_X = np.array(list(data_train["structural_fp"]))
    train_X = np.concatenate(
        [train_X, np.array(list(data_train["ESM1b_ts_norm"]))], axis=1
    )
    train_Y = np.array(list(data_train["log10_kcat_norm"]))

    test_X = np.array(list(data_test["structural_fp"]))
    test_X = np.concatenate(
        [test_X, np.array(list(data_test["ESM1b_ts_norm"]))], axis=1
    )
    test_Y = np.array(list(data_test["log10_kcat_norm"]))
    train_data_reshaped = get_input_dims(train_X)
    test_data_reshaped = get_input_dims(test_X)

    n_timesteps = train_data_reshaped.shape[1]
    n_features = train_data_reshaped.shape[2]

    model = load_model(BEST_MODEL)
    esm1bts_struct_pred_Y = model.predict(test_X)
    evaluate_model(esm1bts_struct_pred_Y, test_Y)
    # save_model(model, BEST_MODEL)


Output:  {'mse': 0.5158389164383227, 'R2 score': 0.45753310486221566, 'pearson coefficient': 0.6764119342990746}


In [29]:
def evaluate_models_weight(weights, model_preds, true_values):
    weighted_avg = np.average(model_preds, weights=weights, axis=0)
    r2 = r2_score(true_values, weighted_avg)
    return -r2  # We want to maximize R2, so minimize -R2



In [30]:
def calculate_weighted_mean(model_preds, true_values):
    num_models = model_preds.shape[0]
    # Initial weights (for illustration, you can adjust this as needed)
    initial_weights = np.ones(num_models) / num_models
    # Constraint: the sum of weights must be 1
    constraints = {"type": "eq", "fun": lambda w: np.sum(w) - 1.0}
    # Bounds for weights (between 0 and 1)
    bounds = [(0, 1)] * num_models
    result = minimize(
        evaluate_models_weight,
        initial_weights,
        args=(model_preds, true_values),
        method="SLSQP",
        bounds=bounds,
        constraints=constraints,
    )
    best_weights = result.x
    return best_weights

## Ensemble

In [31]:
models_pred = np.array(
    [esm1b_drfp_pred_Y,
    esm1b_diff_pred_Y,
    esm1bts_drfp_pred_Y,
    esm1bts_diff_pred_Y,
    esm1b_struct_pred_Y,
    esm1bts_struct_pred_Y ])
best_weights = calculate_weighted_mean(np.array(models_pred), test_Y)
weighted_avg_pred = np.average(models_pred, weights=best_weights, axis=0)

evaluate_model(weighted_avg_pred, test_Y)

Output:  {'mse': 0.43146945690247107, 'R2 score': 0.5462577770425836, 'pearson coefficient': 0.7390925361837878}
