In [5]:
import torch
from torch import nn

# Model class (must be identical to the class definition used when the model was saved)
class CNNLSTM(nn.Module):
    """Embedding -> 1-D convolution -> LSTM -> linear head -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (Conv1d input channels).
        cnn_filters: Conv1d output channels, which is also the LSTM input size.
        lstm_hidden: Hidden size of the LSTM.
        num_classes: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes):
        super().__init__()
        # Attribute names are part of the state-dict keys; do not rename them.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, cnn_filters, kernel_size=20)
        self.lstm = nn.LSTM(cnn_filters, lstm_hidden, batch_first=True)
        self.fc = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        """Return sigmoid scores for a batch of integer-encoded sequences.

        Args:
            x: Integer index tensor of shape ``(batch, seq_len)``. The
                convolution uses ``kernel_size=20`` with no padding, so
                ``seq_len`` must be at least 20.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with values from
            ``torch.sigmoid``.
        """
        embedded = self.embedding(x)
        # Conv1d wants channels first: (batch, embedding_dim, seq_len).
        channels_first = embedded.permute(0, 2, 1)
        conv_out = self.conv(channels_first)
        # The LSTM is batch_first, so move the time axis back to position 1.
        lstm_out, _ = self.lstm(conv_out.permute(0, 2, 1))
        # Only the output at the last time step feeds the classifier.
        logits = self.fc(lstm_out[:, -1, :])
        return torch.sigmoid(logits)

# Load saved weights into a model
def load_model(model_path, model):
    """Load a saved state dict into ``model`` and return that same model.

    Args:
        model_path: Path of the file passed to ``torch.load``.
        model: Module instance whose ``load_state_dict`` receives the weights.

    Returns:
        The ``model`` object that was passed in, now holding the loaded weights.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    cpu = torch.device('cpu')
    state_dict = torch.load(model_path, map_location=cpu)
    model.load_state_dict(state_dict)
    return model

# Mapping from amino-acid letters to integer ids (0 is reserved for padding)
aa_to_int = {'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, 'E': 6, 'Q': 7, 'G': 8, 'H': 9, 'I': 10,
             'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, 'S': 16, 'T': 17, 'W': 18, 'Y': 19,
             'V': 20, 'U': 21, 'X': 22}  # X is assumed to stand for every unknown amino acid


# Encoding function
def encode_sequence(seq, max_length):
    """Encode an amino-acid string as a ``(1, max_length)`` tensor of ids.

    Surrounding whitespace is stripped and letters are upper-cased before the
    lookup, so ``"arn"`` and ``"ARN\\n"`` encode the same as ``"ARN"``. Any
    character missing from ``aa_to_int`` is encoded as 22 (the id of ``'X'``).
    Sequences longer than ``max_length`` are truncated; shorter ones are
    right-padded with 0.

    Args:
        seq: Amino-acid sequence as a string.
        max_length: Exact length of the encoded sequence.

    Returns:
        A ``torch.long`` tensor of shape ``(1, max_length)`` (a batch of one).

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    # Normalise first: lower-case letters or stray whitespace would otherwise
    # all collapse to the unknown id. Truncate after upper-casing so the
    # result can never exceed max_length.
    residues = seq.strip().upper()[:max_length]
    encoded_seq = [aa_to_int.get(aa, 22) for aa in residues]  # unknown residues -> 22
    padding = [0] * (max_length - len(encoded_seq))  # 0 is the padding value
    return torch.tensor([encoded_seq + padding], dtype=torch.long)  # batch of one

# Prediction function
def predict(model, sequence, max_length):
    """Score one sequence with ``model`` and return the result as a percentage.

    The model is switched to eval mode, the sequence is encoded with
    ``encode_sequence`` and the forward pass runs without gradient tracking.

    Args:
        model: Callable module; its output must hold exactly one element
            because it is read with ``.item()``.
        sequence: Amino-acid sequence passed to ``encode_sequence``.
        max_length: Length argument passed to ``encode_sequence``.

    Returns:
        The model output multiplied by 100, as a Python float.
    """
    model.eval()
    batch = encode_sequence(sequence, max_length)
    with torch.no_grad():
        score = model(batch).item()
    return score * 100  # convert to a percentage

# Inference settings
model_path = r"model.pth"  # path of the saved model weights
max_length = 1000  # should equal the maximum length used when training the model

# Hyperparameters used to instantiate the model
vocab_size = len(aa_to_int) + 1  # +1 because the padding id 0 also counts as a "token"
embedding_dim = 8
cnn_filters = 64
lstm_hidden = 128
num_classes = 1  # the model outputs a single probability value

# Create the model instance
model = CNNLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    cnn_filters=cnn_filters,
    lstm_hidden=lstm_hidden,
    num_classes=num_classes,
)

# Load the saved weights
model = load_model(model_path, model)

# Placeholder for an unknown amino-acid sequence
unknown_sequence = "sequence"

# Run the prediction and report it
percentage = predict(model, unknown_sequence, max_length)
print(f"该氨基酸序列为抗氧化蛋白的可能性为：{percentage:.2f}%")

该氨基酸序列为抗氧化蛋白的可能性为：100.00%
