In [1]:
#import libraries
import pathlib
import torch
import esm
from esm import pretrained
from esm import FastaBatchedDataset
from tqdm import tqdm

In [2]:
import re, os
import pandas as pd
import numpy as np
import random
import glob
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import torch
from tensorflow.keras.models import load_model

# sort filenames in order
def numeric_sort(file_list):
    """Return the names in ``file_list`` ordered by the first integer each contains.

    Names without any digits are given an infinite key, so they end up after
    every numbered name. The sort is stable: names with equal keys keep their
    relative input order.
    """
    first_number = re.compile(r'(\d+)')

    def sort_key(name):
        found = first_number.search(name)
        if found is None:
            return float('inf')
        return int(found.group(1))

    ordered = list(file_list)
    ordered.sort(key=sort_key)
    return ordered

# Load the query sequence representations
def load_protein_representations(folder_path, files, layer=36):
    """Load saved mean representations and collect them into one tensor.

    Args:
        folder_path: Directory that contains the saved representation files.
        files: Iterable of file names (relative to ``folder_path``), in the
            order the rows should appear in the result.
        layer: Key looked up in each file's ``'mean_representations'`` mapping.
            Defaults to 36, the value that used to be hard-coded here.

    Returns:
        A ``torch.Tensor`` built from the loaded vectors, one entry per file
        that exists. Missing files are reported on stdout and skipped, so the
        result can have fewer rows than ``files`` has names.
    """
    queryproteinrep = []
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            print(f"File {file_path} not found.")
            continue
        # NOTE: torch.load deserialises arbitrary saved objects; only point
        # this at files from a trusted source.
        rep_changes = torch.load(file_path)['mean_representations'][layer]
        queryproteinrep.append(rep_changes.tolist())
    return torch.tensor(queryproteinrep)


# The project root is taken to be the parent of the current working directory.
project_root = pathlib.Path().resolve().parent
current_path = project_root

# Sequence representation files live in <project_root>/embeddings; list them
# in numeric order.
folder_path = project_root / "embeddings"
files_test = numeric_sort(os.listdir(folder_path))

# Test features: positive rows first, negative rows stacked underneath.
# NOTE(review): these are absolute, machine-specific paths that do not use
# project_root — confirm where the CSVs sit relative to the project.
pos, neg = (
    pd.read_csv(csv_path, header=None).values
    for csv_path in (
        "/media/nafiislam/T7/LLPSEmbed_new/averaged_features/ESM2_3B/embeddings_test_positive.csv",
        "/media/nafiislam/T7/LLPSEmbed_new/averaged_features/ESM2_3B/embeddings_test_negative.csv",
    )
)
query_rep = np.vstack((pos, neg))

def create_model():
    """Build and compile the fully connected binary classifier.

    The network takes 2560-wide input vectors, passes them through ReLU
    layers of 128, 64, 32 and 16 units, and ends in a single sigmoid unit.
    It is compiled with Adam (learning rate 0.001), binary cross-entropy
    loss and accuracy as the tracked metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Only the first layer declares the input width.
    layers = [Dense(128, activation='relu', input_shape=(2560,))]
    for units in (64, 32, 16):
        layers.append(Dense(units, activation='relu'))
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Build a fresh compiled network with create_model(). The visible code never
# fits this object or loads weights into it.
# NOTE(review): confirm that an untrained `model` is intended here.
model = create_model()

# Load all saved models for ensemble predictions
import glob

# Every "dataset_*.h5" file under <project_root>/models is one ensemble member.
models_path = project_root / "models"
model_files = sorted(models_path.glob("dataset_*.h5"))

# Stop here with a clear message if nothing matched; otherwise the voting and
# metric code further down fails on empty arrays with a much less obvious error.
if not model_files:
    raise FileNotFoundError(
        f"No model files matching 'dataset_*.h5' found in {models_path}"
    )

# Per-model hard labels and raw outputs, in model_files order.
loaded_predictions = []
loaded_probabilities = []

for model_file in model_files:
    # compile=False: the loaded models are only used for predict() here.
    loaded_model = load_model(model_file, compile = False)
    # Score the test set and threshold the outputs at 0.5 for 0/1 labels.
    y_pred_proba = loaded_model.predict(query_rep, verbose=0)
    y_pred = (y_pred_proba > 0.5).astype(int)
    loaded_predictions.append(y_pred)
    loaded_probabilities.append(y_pred_proba)

# Stack along a new leading axis: one slice per model.
ensemble_predictions = np.array(loaded_predictions)
ensemble_probabilities = np.array(loaded_probabilities)

# Using the model to make predictions
# NOTE(review): `model` comes straight from create_model() and the visible code
# never fits it or loads weights into it, so these look like outputs of a
# randomly initialised network. Neither `predictions` nor `predicted_labels`
# is read again in the visible code; the ensemble results are computed from
# the loaded model files only. Confirm whether this call can be dropped.
predictions = model.predict(query_rep)
# Threshold the outputs at 0.5 to get 0/1 labels.
predicted_labels = (predictions > 0.5).astype(int)

# Combine the per-model outputs: majority vote for the labels, mean for the
# probabilities. Axis 0 indexes the ensemble members.
ensemble_predictions = np.asarray(ensemble_predictions)
ensemble_probabilities = np.asarray(ensemble_probabilities)
n_models = ensemble_predictions.shape[0]

# Majority voting for final predictions: positive when strictly more than half
# of the models vote 1.
votes = np.sum(ensemble_predictions, axis=0)
majority_decision = (2 * votes > n_models).astype(int)

# Handle ties (if any) with the mean probability. A tie means exactly half of
# the models voted 1, which can only happen with an even number of models.
# Compare 2 * votes with n_models rather than votes with n_models // 2: for an
# odd ensemble, votes == n_models // 2 is a losing minority (e.g. 2 of 5), not
# a tie, and must not be overturned by the mean probability.
avg_probabilities = ensemble_probabilities.mean(axis=0)
ties = (2 * votes == n_models)
if np.any(ties):
    majority_decision[ties] = (avg_probabilities[ties] >= 0.5).astype(int)

# Final ensemble predictions
final_predictions = majority_decision
final_probabilities = avg_probabilities

# Ground-truth labels in the row order of query_rep: one 1 per positive row,
# followed by one 0 per negative row.
true_labels = [1] * len(pos)
true_labels.extend([0] * len(neg))
y_true = np.array(true_labels)

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, matthews_corrcoef,
)
from sklearn.metrics import balanced_accuracy_score


def find_metrics(y_test, y_predict, y_proba):
    """Compute binary-classification metrics from hard labels and scores.

    Args:
        y_test: True labels.
        y_predict: Predicted hard labels, compared against ``y_test``.
        y_proba: Predicted scores, used for ROC AUC and average precision.

    Returns:
        A 10-tuple in this order: sensitivity, specificity, accuracy,
        precision, F1, MCC, ROC AUC, ``y_proba`` (passed through unchanged),
        balanced accuracy, average precision.
    """
    # Row 0 of the 2x2 confusion matrix is (tn, fp); row 1 is (fn, tp).
    (tn, fp), (fn, tp) = confusion_matrix(y_test, y_predict)

    recall = tp / (tp + fn)
    true_negative_rate = tn / (tn + fp)

    accuracy = accuracy_score(y_test, y_predict)
    precision = tp / (tp + fp)
    # Harmonic mean of precision and recall.
    f1 = 2 * precision * recall / (precision + recall)
    mcc = matthews_corrcoef(y_test, y_predict)
    roc_auc = roc_auc_score(y_test, y_proba)
    avg_precision = average_precision_score(y_test, y_proba)
    balanced = balanced_accuracy_score(y_test, y_predict)

    return (
        recall,
        true_negative_rate,
        accuracy,
        precision,
        f1,
        mcc,
        roc_auc,
        y_proba,
        balanced,
        avg_precision,
    )

# Score the ensemble's final labels and mean probabilities against y_true.
(
    sensitivity, specificity, acc, prec, f1_score_1,
    mcc, auc, y_proba, bal_acc, auPR,
) = find_metrics(y_true, final_predictions, final_probabilities)

# One metric per line, three decimals each.
print(
    f"Accuracy: {acc:.3f}",
    f"Sensitivity: {sensitivity:.3f}",
    f"Specificity: {specificity:.3f}",
    f"Precision: {prec:.3f}",
    f"F1 Score: {f1_score_1:.3f}",
    f"MCC: {mcc:.3f}",
    f"AUC: {auc:.3f}",
    f"auPR: {auPR:.3f}",
    f"Balanced Accuracy: {bal_acc:.3f}",
    sep="\n",
)

# Persist the ensemble scores, true labels and final labels as .npy files in
# the current working directory.
for npy_name, array in (
    ("y_proba.npy", final_probabilities),
    ("y_true.npy", y_true),
    ("y_predict.npy", final_predictions),
):
    np.save(npy_name, array)

from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
import csv

def _write_column_csv(csv_path, values):
    """Write ``values`` to ``csv_path`` as a one-column CSV, one value per row.

    The file is opened in a ``with`` block so it is closed even if writing
    raises part-way through.
    """
    with open(csv_path, "w", newline="") as handle:
        csv.writer(handle).writerows([value] for value in values)


# Make sure the output folder exists before any curve file is opened.
os.makedirs("./outputs_ensemble", exist_ok=True)

# ROC curve points: false-positive rate and true-positive rate per threshold.
fpr, tpr, _ = roc_curve(np.array(y_true), np.array(final_probabilities))
_write_column_csv("./outputs_ensemble/fpr.csv", fpr)
_write_column_csv("./outputs_ensemble/tpr.csv", tpr)

# Precision-recall curve points.
precision, recall, _ = precision_recall_curve(np.array(y_true), np.array(final_probabilities))
_write_column_csv("./outputs_ensemble/precision.csv", precision)
_write_column_csv("./outputs_ensemble/recall.csv", recall)

2026-01-26 21:22:57.767346: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-26 21:22:57.871931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769440977.914515    6709 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769440977.926495    6709 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769440978.016302    6709 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Accuracy: 0.774
Sensitivity: 0.743
Specificity: 0.793
Precision: 0.689
F1 Score: 0.715
MCC: 0.530
AUC: 0.831
auPR: 0.731
Balanced Accuracy: 0.768
