In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
import numpy as np

# 假设你已经有了一个氨基酸到整数的映射字典
aa_to_int = {'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'E':6, 'Q':7, 'G':8, 'H':9, 'I':10, 
             'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 'S':16, 'T':17, 'W':18, 'Y':19, 
             'V':20, 'U':21, 'X':22}

# 函数将氨基酸序列转换为整数张量
def encode_sequence(seq, max_length):
    encoded_seq = [aa_to_int.get(aa, 22) for aa in seq]  # 22为未知氨基酸的编码
    padding = [0] * (max_length - len(encoded_seq))  # 0 作为padding值
    return torch.tensor(encoded_seq + padding, dtype=torch.long)

class ProteinDataset(Dataset):
    def __init__(self, positive_file, negative_file, max_length):
        self.sequences = []
        self.labels = []
        self.max_length = max_length
        
        # 从正面样本文件读取数据
        with open(positive_file, 'r') as file:
            for line in file:
                seq = line.strip()
                if len(seq) <= self.max_length:
                    self.sequences.append(encode_sequence(seq, self.max_length))
                    self.labels.append(1)  # 抗氧化为1

        # 从负面样本文件读取数据
        with open(negative_file, 'r') as file:
            for line in file:
                seq = line.strip()
                if len(seq) <= self.max_length:
                    self.sequences.append(encode_sequence(seq, self.max_length))
                    self.labels.append(0)  # 非抗氧化为0

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

# 确定氨基酸序列的最大长度
max_length = 1500

# 实例化数据集
positive_file = r"C:\Users\Administrator\Desktop\AI\AIP_MDL\Dateset\NGR_inflammatory response\NGR_inflammatory response_1500.txt"
negative_file = r"C:\Users\Administrator\Desktop\AI\AIP_MDL\Dateset\NGR_inflammatory response\NOT_NGR_inflammatory response_1500.txt"
dataset = ProteinDataset(positive_file, negative_file, max_length)

# 创建数据加载器
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

class CNNLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes):
        super(CNNLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, cnn_filters, kernel_size=20)
        self.lstm = nn.LSTM(cnn_filters, lstm_hidden, batch_first=True)
        self.fc = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x, _ = self.lstm(x.permute(0, 2, 1))
        x = self.fc(x[:, -1, :])
        return torch.sigmoid(x)

# 实例化模型
cnn_filters = 64
lstm_hidden = 128
embedding_dim = 8
vocab_size = len(aa_to_int) + 1  # 加1是因为padding的0也算一个"词"
num_classes = 1  # 输出一个概率值

# 根据你的需求设定模型保存的间隔
save_interval = 10

model = CNNLSTM(vocab_size, embedding_dim, cnn_filters, lstm_hidden, num_classes)

# 选择设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=1e-4)

# 训练模型并保存
num_epochs = 60  # 总训练次数

for epoch in range(num_epochs):
    for batch_index, (inputs, targets) in enumerate(data_loader):
        inputs, targets = inputs.to(device), targets.to(device, dtype=torch.float32)
        
        # 前向传播
        outputs = model(inputs).squeeze()  # 输出需要squeeze
        loss = criterion(outputs, targets)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 每训练save_interval次数保存一次模型
    if (epoch + 1) % save_interval == 0:
        model_file_name = f'NGR_inflammatory_1500_{epoch + 1}.pth'
        torch.save(model.state_dict(), model_file_name)
        print(f'Saved model as {model_file_name}')
        
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# 最后，保存训练完成后的模型
model_file_name = f'NGR_inflammatory_1500_{num_epochs}.pth'
torch.save(model.state_dict(), model_file_name)
print(f'Final model saved as {model_file_name}')


Epoch 1/60, Loss: 0.011540639214217663
Epoch 2/60, Loss: 0.011555289849638939
Epoch 3/60, Loss: 0.011375408619642258
Epoch 4/60, Loss: 0.00943007506430149
Epoch 5/60, Loss: 0.01046448852866888
Epoch 6/60, Loss: 0.010176164098083973
Epoch 7/60, Loss: 0.011979208327829838
Epoch 8/60, Loss: 0.01130638737231493
Epoch 9/60, Loss: 0.010721628554165363
Saved model as NGR_inflammatory_1500_10.pth
Epoch 10/60, Loss: 0.011061758734285831
Epoch 11/60, Loss: 0.010932221077382565
Epoch 12/60, Loss: 0.009462952613830566
Epoch 13/60, Loss: 0.006808241829276085
Epoch 14/60, Loss: 0.007138532120734453
Epoch 15/60, Loss: 0.31622254848480225
Epoch 16/60, Loss: 0.0007180493557825685
Epoch 17/60, Loss: 0.0121309207752347
Epoch 18/60, Loss: 0.000853040546644479
Epoch 19/60, Loss: 0.011740284040570259
Saved model as NGR_inflammatory_1500_20.pth
Epoch 20/60, Loss: 0.0012258735951036215
Epoch 21/60, Loss: 0.0018055916298180819
Epoch 22/60, Loss: 0.0008922812412492931
Epoch 23/60, Loss: 0.0031616734340786934
Ep