In [1]:
# %%
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
import joblib

# --- Parameters ---
# Input sequence handed to every embedding model below. Despite the name it is
# a bare residue string: the literal carries no FASTA header line.
SEQ_FASTA = "MPISSSSSSSTKSMRRAASELERSDSVTSPRFIGRRQSLIEDARKEREAAAAAAEAAEATEQIVFEEEDGKALLNLFFTLRSSKTPALSRSLKVFETFEAKIHHLETRPCRKPRDSLEGLEYFVRCEVHLSDVSTLISSIKRIAEDVKTTKEVKFHWFPKKISELDRCHHLITKFDPDLDQEHPGFTDPVYRQRRKMIGDIAFRYKQGEPIPRVEYTEEEIGTWREVYSTLRDLYTTHACSEHLEAFNLLERHCGYSPENIPQLEDVSRFLRERTGFQLRPVAGLLSARDFLASLAFRVFQCTQYIRHASSPMHSPEPDCVHELLGHVPILADRVFAQFSQNIGLASLGASEEDIEKLSTLYWFTVEFGLCKQGGIVKAYGAGLLSSYGELVHALSDEPERREFDPEAAAIQPYQDQNYQSVYFVSESFTDAKEKLRSYVAGIKRPFSVRFDPYTYSIEVLDNPLKIRGGLESVKDELKMLTDALNVLA"
# How many of the highest-scoring GO terms the final ranking prints.
TOP_N = 10

# --- 1. Split a sequence into chunks (512 for ProtBERT and ProtBERT-BFD, 1024 for ESM2) ---
def slice_sequence(seq, chunk_size):
    """Split ``seq`` into consecutive, non-overlapping pieces.

    Args:
        seq: Any sliceable object with a length (a residue string in this
            notebook).
        chunk_size: Maximum length of each piece; must be >= 1.

    Returns:
        A list of slices of ``seq`` in their original order. Every piece has
        ``chunk_size`` items except possibly the last, which holds the
        remainder. An empty ``seq`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check a
            negative size would silently produce an empty list.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [seq[start:start + chunk_size] for start in range(0, len(seq), chunk_size)]

# --- 2. Mean embedding over sequence chunks ---
def get_embedding_mean(model_name, seq, chunk_size):
    """Embed ``seq`` chunk by chunk and average the per-chunk vectors.

    The sequence is split into pieces of at most ``chunk_size`` residues. Each
    piece is tokenized with one residue per whitespace-separated token and run
    through the model; the hidden state at position 0 of the last layer is
    kept as that chunk's embedding. The chunk embeddings are then averaged
    with equal weight, so a short trailing chunk counts as much as a full one.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        seq: Residue string to embed; must not be empty.
        chunk_size: Maximum number of residues per chunk; must be >= 1.

    Returns:
        A NumPy array of shape ``(1, dim)`` holding the mean embedding.

    Raises:
        ValueError: If ``seq`` is empty or ``chunk_size`` yields no chunks.
            Checked before the model is loaded, so bad input fails fast
            instead of returning NaN from an empty mean.
    """
    if not seq:
        raise ValueError("seq must be a non-empty sequence")

    chunks = slice_sequence(seq, chunk_size)
    if not chunks:
        raise ValueError(f"chunk_size={chunk_size!r} produced no chunks")

    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    # NOTE(review): the ProtBERT model cards recommend mapping the rare
    # residues U/Z/O/B to X before tokenizing. That is not done here; confirm
    # it matches how the downstream classifiers were trained.
    embeddings = []
    with torch.no_grad():
        for chunk in chunks:
            spaced_chunk = " ".join(chunk)
            # No truncation: each chunk already holds at most chunk_size
            # residues. The tokenizer's special tokens come on top of that.
            # NOTE(review): confirm chunk_size plus special tokens fits the
            # model's position limit.
            inputs = tokenizer(spaced_chunk,
                               return_tensors="pt",
                               truncation=False,
                               padding=False)
            hidden = model(**inputs).last_hidden_state
            # Batch item 0, token position 0 (treated as the CLS vector).
            embeddings.append(hidden[0, 0, :].numpy())

    return np.mean(embeddings, axis=0, keepdims=True)   # (1, dim)

# One averaged embedding per language model. The chunk size is the per-model
# residue limit used in this notebook (512 for the ProtBERT variants, 1024 for ESM2).
print("A gerar embeddings por chunks...")
emb_pb = get_embedding_mean(
    model_name="Rostlab/prot_bert",
    seq=SEQ_FASTA,
    chunk_size=512,
)
emb_bfd = get_embedding_mean(
    model_name="Rostlab/prot_bert_bfd",
    seq=SEQ_FASTA,
    chunk_size=512,
)
emb_esm = get_embedding_mean(
    model_name="facebook/esm2_t33_650M_UR50D",
    seq=SEQ_FASTA,
    chunk_size=1024,
)

# --- 3. Load the base MLPs ---
# One saved Keras model per embedding source; each is applied below to the
# embedding with the matching suffix (pb / bfd / esm). Paths are relative to
# the current working directory.
mlp_pb  = load_model("models/protbert_mlp.keras")
mlp_bfd = load_model("models/protbertbfd_mlp.keras")
mlp_esm = load_model("models/esm2_mlp.keras")

# --- 4. Base predictions (each must be exactly 597 columns wide) ---
N_GO_TERMS = 597  # column count expected from every base model
print("A fazer predições base...")
y_pb  = mlp_pb.predict(emb_pb)[:, :N_GO_TERMS]
y_bfd = mlp_bfd.predict(emb_bfd)[:, :N_GO_TERMS]
y_esm = mlp_esm.predict(emb_esm)[:, :N_GO_TERMS]

# Slicing can only drop surplus columns; it cannot add missing ones. Fail
# loudly here rather than hand the stacking model a misaligned matrix.
_base_widths = {
    "protbert": y_pb.shape[1],
    "protbertbfd": y_bfd.shape[1],
    "esm2": y_esm.shape[1],
}
if any(width != N_GO_TERMS for width in _base_widths.values()):
    raise ValueError(
        f"every base model must output at least {N_GO_TERMS} columns, got {_base_widths}"
    )

# --- 5. Concatenate for stacking ---
# Column order (protbert, protbertbfd, esm2) is the stacking model's input layout.
X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)

# --- 6. Load the stacking model ---
# The meta-model consumes the concatenated base predictions in X_stack.
stacking = load_model("models/modelo_ensemble_stacking.keras")
y_pred = stacking.predict(X_stack)

# --- 7. Load the binarizer (597 GO terms) ---
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load an artifact from a trusted source.
mlb = joblib.load("data/mlb_597.pkl")
# classes_ provides the label printed for each column of y_pred.
# Assumes the binarizer's class order matches the model's output columns; TODO confirm.
go_terms = mlb.classes_

# --- 8. Show results ---
# First: every GO term whose predicted probability reaches the 0.5 cut-off.
print("\n GO terms com prob ≥ 0.5:")
above_cutoff = (y_pred >= 0.5).astype(int)
predicted_terms = mlb.inverse_transform(above_cutoff)
# An empty result is falsy, so the fallback message is printed instead.
print(predicted_terms[0] or "Nenhum GO term acima de 0.5")

# Second: the TOP_N highest-scoring terms, best first (negated scores make
# argsort's ascending order a descending ranking).
print(f"\n Top {TOP_N} GO terms mais prováveis:")
scores = y_pred[0]
top_idx = np.argsort(-scores)[:TOP_N]
for i in top_idx:
    print(f"{go_terms[i]} : {scores[i]:.4f}")


A gerar embeddings por chunks...


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


A fazer predições base...

 GO terms com prob ≥ 0.5:
('GO:0003674', 'GO:0003824', 'GO:0005488', 'GO:0016491', 'GO:0036094', 'GO:0043167')

 Top 10 GO terms mais prováveis:
GO:0003674 : 0.9975
GO:0003824 : 0.9156
GO:0036094 : 0.6652
GO:0043167 : 0.6336
GO:0016491 : 0.6327
GO:0005488 : 0.5595
GO:0043169 : 0.4801
GO:0140096 : 0.4790
GO:0051213 : 0.4551
GO:0046872 : 0.4098
