In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
import numpy as np

# 假设你已经有了一个氨基酸到整数的映射字典
aa_to_int = {'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'E':6, 'Q':7, 'G':8, 'H':9, 'I':10, 
             'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 'S':16, 'T':17, 'W':18, 'Y':19, 
             'V':20, 'U':21, 'X':22}

# 函数将氨基酸序列转换为整数张量
def encode_sequence(seq, max_length):
    encoded_seq = [aa_to_int.get(aa, 22) for aa in seq]  # 22为未知氨基酸的编码
    padding = [0] * (max_length - len(encoded_seq))  # 0 作为padding值
    return torch.tensor(encoded_seq + padding, dtype=torch.long)

class ProteinDataset(Dataset):
    def __init__(self, positive_file, negative_file, max_length):
        self.sequences = []
        self.labels = []
        self.max_length = max_length
        
        # 从正面样本文件读取数据
        with open(positive_file, 'r') as file:
            for line in file:
                seq = line.strip()
                if len(seq) <= self.max_length:
                    self.sequences.append(encode_sequence(seq, self.max_length))
                    self.labels.append(1)  # 抗氧化为1

        # 从负面样本文件读取数据
        with open(negative_file, 'r') as file:
            for line in file:
                seq = line.strip()
                if len(seq) <= self.max_length:
                    self.sequences.append(encode_sequence(seq, self.max_length))
                    self.labels.append(0)  # 非抗氧化为0

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# 确定氨基酸序列的最大长度
max_length = 1000

# 实例化数据集
positive_file = r"C:\Users\Administrator\Desktop\AI\AOP_MDL\Dataset\MIXperoxiredoxin1000.txt"
negative_file = r"C:\Users\Administrator\Desktop\AI\AOP_MDL\Dataset\NGS0_1000.txt"
dataset = ProteinDataset(positive_file, negative_file, max_length)

# 创建数据加载器
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

class CNNLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes):
        super(CNNLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, cnn_filters, kernel_size=20)
        self.lstm = nn.LSTM(cnn_filters, lstm_hidden, batch_first=True)
        self.fc = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x, _ = self.lstm(x.permute(0, 2, 1))
        x = self.fc(x[:, -1, :])
        return torch.sigmoid(x)

# 实例化模型
cnn_filters = 64
lstm_hidden = 128
embedding_dim = 8
vocab_size = len(aa_to_int) + 1  # 加1是因为padding的0也算一个"词"
num_classes = 1  # 输出一个概率值

model = CNNLSTM(vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes)

# 选择设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=1e-4)

# 训练模型
num_epochs = 10  # 根据需要设定

for epoch in range(num_epochs):
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device, dtype=torch.float32)
        
        # 前向传播
        outputs = model(inputs).squeeze()  # 输出需要squeeze
        loss = criterion(outputs, targets)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

torch.save(model.state_dict(), 'CLSTM1000.pth')

Epoch 1/10, Loss: 0.4419744610786438
Epoch 2/10, Loss: 0.050105981528759
Epoch 3/10, Loss: 0.020713387057185173
Epoch 4/10, Loss: 0.022814620286226273
Epoch 5/10, Loss: 0.05312681570649147
Epoch 6/10, Loss: 0.0201401486992836
Epoch 7/10, Loss: 0.000878523220308125
Epoch 8/10, Loss: 0.00021546988864429295
Epoch 9/10, Loss: 0.0002585463225841522
Epoch 10/10, Loss: 0.00013791306992061436
