In [None]:
# import modules
import torch
from tqdm import tqdm
from sklearn import metrics
import numpy as np
import pandas as pd
import sys
sys.path.append('../')
from model.modeling_UniBioseq import UniBioseqForEmbedding, UniBioseqForSequenceClassification_convbert, UniBioseqForTokenClassification_convbert
from model.tokenization_UniBioseq import UBSLMTokenizer

  from .autonotebook import tqdm as notebook_tqdm


[2025-01-15 03:02:19,980] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# SolP

In [None]:
# SolP test data: read the full table, keep only rows whose 'split' column
# equals 'test', and renumber the index from zero.
df = (
    pd.read_csv("your_path/SolP/SolP.csv")
    .loc[lambda frame: frame['split'] == 'test']
    .reset_index(drop=True)
)
# Plain Python lists, positionally aligned (row i -> sequences[i], labels[i]).
sequences, labels = df['sequence'].tolist(), df['labels'].tolist()

In [None]:
# Predict SolP labels for every test sequence and report accuracy.
#
# Sequences are scored one at a time (no batching). The first logit of each
# model output is passed through a sigmoid and thresholded at 0.5, giving one
# boolean prediction per sequence.
model_file = "your_path/SolP_result/model_file"
# Use the GPU when one is visible; otherwise fall back to the CPU so the cell
# still runs instead of failing when the model/tensors are moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UniBioseqForSequenceClassification_convbert.from_pretrained(model_file)
tokenizer = UBSLMTokenizer.from_pretrained(model_file)
model.to(device)
model.eval()
preds = []
# Gradients are never needed for evaluation, so disable them for the whole loop.
with torch.no_grad():
    for input_seq in tqdm(sequences):
        input_ids = tokenizer(input_seq, return_tensors="pt")['input_ids'].to(device)
        # [0][0]: first element of the first (only) item in the batch.
        preds.append(torch.sigmoid(model(input_ids)['logits'][0][0].cpu()).item() > 0.5)

# `labels` comes from the SolP data-loading cell and is aligned with `sequences`.
print(metrics.accuracy_score(labels, preds))

No vocab_list supplied for folding model, assuming the UBL vocabulary!
100%|██████████| 2001/2001 [01:36<00:00, 20.68it/s]

0.768615692153923





# LocP

In [None]:
# LocP test data: this CSV is read as-is (no 'split' filtering is applied here).
df = pd.read_csv("your_path/LocP/df_test.csv")
# Plain Python lists, positionally aligned (row i -> sequences[i], labels[i]).
sequences, labels = df['seq'].tolist(), df['labels'].tolist()

In [None]:
# Predict LocP classes for every test sequence and report accuracy.
#
# Sequences are scored one at a time (no batching). The predicted class is the
# argmax over the flattened logits of each single-sequence forward pass.
model_file = "your_path/LocP_result/model_file"
# Use the GPU when one is visible; otherwise fall back to the CPU so the cell
# still runs instead of failing when the model/tensors are moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UniBioseqForSequenceClassification_convbert.from_pretrained(model_file)
tokenizer = UBSLMTokenizer.from_pretrained(model_file)
model.to(device)
model.eval()
preds = []
# Gradients are never needed for evaluation, so disable them for the whole loop.
with torch.no_grad():
    for input_seq in tqdm(sequences):
        input_ids = tokenizer(input_seq, return_tensors="pt")['input_ids'].to(device)
        # np.argmax without `axis` works on the flattened array; with a single
        # sequence per call this is the index of the largest logit.
        # NOTE(review): assumes logits are (1, num_classes) - confirm against the model.
        preds.append(np.argmax(model(input_ids)['logits'].cpu().numpy()))

# `labels` comes from the LocP data-loading cell and is aligned with `sequences`.
print(metrics.accuracy_score(labels, preds))

No vocab_list supplied for folding model, assuming the UBL vocabulary!
100%|██████████| 1842/1842 [02:37<00:00, 11.68it/s]

0.8371335504885994





# FoldP

In [None]:
# FoldP test data: read the full table, keep only rows whose 'split' column
# equals 'test', and renumber the index from zero.
df = (
    pd.read_csv("your_path/FoldP.csv")
    .loc[lambda frame: frame['split'] == 'test']
    .reset_index(drop=True)
)
# Plain Python lists, positionally aligned (row i -> sequences[i], labels[i]).
sequences, labels = df['sequence'].tolist(), df['labels'].tolist()

In [None]:
# Predict FoldP classes for every test sequence and report accuracy.
#
# Sequences are scored one at a time (no batching). The predicted class is the
# argmax over the flattened logits of each single-sequence forward pass.
model_file = "your_path/FoldP_result/model_file"
# Use the GPU when one is visible; otherwise fall back to the CPU so the cell
# still runs instead of failing when the model/tensors are moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UniBioseqForSequenceClassification_convbert.from_pretrained(model_file)
tokenizer = UBSLMTokenizer.from_pretrained(model_file)
model.to(device)
model.eval()
preds = []
# Gradients are never needed for evaluation, so disable them for the whole loop.
with torch.no_grad():
    for input_seq in tqdm(sequences):
        input_ids = tokenizer(input_seq, return_tensors="pt")['input_ids'].to(device)
        # np.argmax without `axis` works on the flattened array; with a single
        # sequence per call this is the index of the largest logit.
        # NOTE(review): assumes logits are (1, num_classes) - confirm against the model.
        preds.append(np.argmax(model(input_ids)['logits'].cpu().numpy()))

# `labels` comes from the FoldP data-loading cell and is aligned with `sequences`.
print(metrics.accuracy_score(labels, preds))

No vocab_list supplied for folding model, assuming the UBL vocabulary!
100%|██████████| 1254/1254 [00:45<00:00, 27.50it/s]

0.6850079744816587





# SSP3

In [None]:
# SSP3 test data: one pickled table holding several benchmark sets, told apart
# by the value of the 'split' column.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
df = pd.read_pickle("your_path/ssp3.pkl")

# One sub-frame per benchmark set, in the order the names are listed.
df_casp12, df_casp14, df_cb513, df_ts115 = (
    df[df['split'] == split_name]
    for split_name in ('casp12', 'casp14', 'cb513', 'ts115')
)

# For each set: the sequences and their labels as positionally aligned lists.
sequences_casp12, labels_casp12 = df_casp12['sequence'].tolist(), df_casp12['labels'].tolist()
sequences_casp14, labels_casp14 = df_casp14['sequence'].tolist(), df_casp14['labels'].tolist()
sequences_cb513, labels_cb513 = df_cb513['sequence'].tolist(), df_cb513['labels'].tolist()
sequences_ts115, labels_ts115 = df_ts115['sequence'].tolist(), df_ts115['labels'].tolist()
def get_labels(labels):
    """Flatten per-sequence labels, drop ignored positions and remap label ids.

    Args:
        labels: iterable of per-sequence label iterables. Positions holding
            -100 are treated as "ignore" and removed from the scored labels.

    Returns:
        A ``(labels_cal, labels_index)`` tuple:
        * ``labels_cal`` - 1-D array of the kept labels after remapping
          3 -> 2, then 1 -> 0, then 4 -> 1 (applied in that order).
        * ``labels_index`` - boolean mask over the flattened labels, True
          wherever the label is not -100; use it to select the matching
          predictions.
    """
    flat = np.array([tag for per_seq in labels for tag in per_seq])
    keep_mask = flat != -100
    # Boolean indexing yields a copy, so the in-place remap below never
    # touches `flat`.
    kept = flat[keep_mask]
    # Order matters: 1 -> 0 must run before 4 -> 1, otherwise the freshly
    # written 1s would be collapsed to 0 as well.
    for old_id, new_id in ((3, 2), (1, 0), (4, 1)):
        kept[kept == old_id] = new_id
    return kept, keep_mask

In [None]:
# Predict per-token SSP3 labels for the casp12 set and report token accuracy.
#
# Sequences are scored one at a time (no batching). For each sequence the
# per-position argmax over the logits is appended to one flat prediction list,
# which is then filtered with the mask returned by get_labels().
model_file = "your_path/ssp3_result/model_file"
# Use the GPU when one is visible; otherwise fall back to the CPU so the cell
# still runs instead of failing when the model/tensors are moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UniBioseqForTokenClassification_convbert.from_pretrained(model_file)
tokenizer = UBSLMTokenizer.from_pretrained(model_file)
model.to(device)
model.eval()
preds = []
# Gradients are never needed for evaluation, so disable them for the whole loop.
with torch.no_grad():
    for input_seq in tqdm(sequences_casp12):
        input_ids = tokenizer(input_seq, return_tensors="pt")['input_ids'].to(device)
        # [0]: logits of the first (only) sequence in the batch.
        outputs = model(input_ids)['logits'][0].cpu().numpy()
        # One predicted class per row of `outputs`.
        pred = np.argmax(outputs, axis=1).tolist()
        preds += pred
preds = np.array(preds)
# Call get_labels() once and reuse both results (it was previously evaluated
# twice, flattening and remapping the same labels two times).
labels_cal, labels_index = get_labels(labels_casp12)
# The mask must have exactly one entry per predicted position.
# NOTE(review): this relies on the stored labels having the same length as the
# tokenized input (with -100 at positions that are not scored) - confirm.
preds_cal = preds[labels_index]
print(metrics.accuracy_score(labels_cal, preds_cal))

No vocab_list supplied for folding model, assuming the UBL vocabulary!
100%|██████████| 20/20 [00:01<00:00, 18.11it/s]

0.8174550299800133



