In [None]:
import pandas as pd
import numpy as np
import pickle

# Path of the preprocessed pickle file.
file_path = '/content/drive/MyDrive/MLRG/processed_data.pkl'

# Deserialize the stored object.
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
with open(file_path, mode='rb') as pickle_handle:
    data = pickle.load(pickle_handle)

# Preview the first rows as a quick sanity check of the columns.
preview = data.head()
print(preview)


                                             species  \
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                         upstream200    stress  \
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.033641   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.013922   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,... -0.806374   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,... -0.026784   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.922333   

                         stress_name  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]  
0  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
0  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]  
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
0  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]  


In [None]:

# Restore species integer values (one-hot vector -> class index).
# The scalar guard makes this cell safe to re-run: np.argmax of an
# already-decoded scalar is always 0, so without it a second execution would
# silently map every row to species 0.
data['species'] = data['species'].apply(
    lambda x: int(x) if np.ndim(x) == 0 else int(np.argmax(x))
)

# Base encodings: one-hot row -> nucleotide letter
base_encodings = {
    (1, 0, 0, 0): "A",
    (0, 1, 0, 0): "T",
    (0, 0, 1, 0): "C",
    (0, 0, 0, 1): "G"
}

def decode_sequence(encoded_seq):
    """Decode a one-hot encoded sequence into a string of base letters.

    Args:
        encoded_seq: iterable of length-4 rows (lists, tuples or arrays). Rows
            that are not a key of ``base_encodings`` (for example the all-zero
            row ``[0, 0, 0, 0]``) are omitted from the result. A ``str`` is
            treated as an already-decoded sequence.

    Returns:
        str: the decoded letters in input order; ``''`` for empty input.
    """
    # Already decoded: return it unchanged so that applying the function twice
    # does not wipe the sequence (iterating a str yields 1-char tuples that
    # never match a 4-tuple key, which previously produced '').
    if isinstance(encoded_seq, str):
        return encoded_seq

    letters = []
    for base in encoded_seq:
        # Single tuple conversion and dictionary lookup per position.
        letter = base_encodings.get(tuple(base))
        if letter is not None:
            letters.append(letter)
    return ''.join(letters)

# Build the human-readable view in a separate frame instead of overwriting
# `data`. The model cells further down stack data['upstream200'] into a numeric
# array, which fails once that column holds decoded strings, so `data` keeps
# its original encoding for those two columns.
data_restored = data.assign(
    # Restore upstream200 sequences
    upstream200=data['upstream200'].apply(decode_sequence),
    # Restore stress_name integer values; scalars are passed through because
    # np.argmax of an already-decoded scalar is always 0.
    stress_name=data['stress_name'].apply(
        lambda x: int(x) if np.ndim(x) == 0 else int(np.argmax(x))
    ),
)

# Show the restored data
print(data_restored.head())


                                             species  \
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                         upstream200    stress  \
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.033641   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.013922   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,... -0.806374   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,... -0.026784   
0  [[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...  0.922333   

                         stress_name  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]  
0  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
0  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]  
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
0  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]  
   species                      

# LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert nested lists to NumPy arrays (assumes the cells already hold lists/arrays)
data['species'] = data['species'].apply(lambda x: np.array(x))
data['upstream200'] = data['upstream200'].apply(lambda x: np.array(x))
data['stress_name'] = data['stress_name'].apply(lambda x: np.array(x))

# Assemble the model inputs and the regression target
X_species = np.vstack(data['species'].values)
X_upstream200 = np.stack(data['upstream200'].values)
X_stress_name = np.vstack(data['stress_name'].values)
y_stress = data['stress'].values

# Split into train and test sets BEFORE fitting any preprocessing, so that
# statistics of the test rows cannot leak into the training features.
X_train_species, X_test_species, X_train_upstream200, X_test_upstream200, X_train_stress_name, X_test_stress_name, y_train, y_test = train_test_split(
    X_species, X_upstream200, X_stress_name, y_stress, test_size=0.2, random_state=42
)

# Standardize upstream200 per channel (last axis). The scaler is fitted on the
# training split only and then applied unchanged to the test split.
upstream200_shape = X_upstream200.shape
n_channels = upstream200_shape[-1]
scaler = StandardScaler()
X_train_upstream200 = scaler.fit_transform(
    X_train_upstream200.reshape(-1, n_channels)
).reshape(X_train_upstream200.shape)
X_test_upstream200 = scaler.transform(
    X_test_upstream200.reshape(-1, n_channels)
).reshape(X_test_upstream200.shape)

# Convert to tensors
X_train_species_tensor = torch.tensor(X_train_species, dtype=torch.long)
X_test_species_tensor = torch.tensor(X_test_species, dtype=torch.long)
X_train_upstream200_tensor = torch.tensor(X_train_upstream200, dtype=torch.float32)
X_test_upstream200_tensor = torch.tensor(X_test_upstream200, dtype=torch.float32)
X_train_stress_name_tensor = torch.tensor(X_train_stress_name, dtype=torch.long)
X_test_stress_name_tensor = torch.tensor(X_test_stress_name, dtype=torch.long)
# Targets become column vectors so they line up with the model's (batch, 1) output
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Create the data loaders
batch_size = 64
train_dataset = TensorDataset(X_train_species_tensor, X_train_upstream200_tensor, X_train_stress_name_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_species_tensor, X_test_upstream200_tensor, X_test_stress_name_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class SimpleLSTMWithEmbedding(nn.Module):
    """LSTM regressor over species / stress-condition embeddings plus the flattened upstream sequence.

    The two embeddings and the flattened ``upstream200`` features are
    concatenated into one vector per sample and passed to the LSTM as a
    sequence of length 1 (``unsqueeze(1)`` with ``batch_first=True``); the LSTM
    output at that single step feeds a linear head.

    Args:
        species_dim: number of rows of the species embedding table.
        stress_name_dim: number of rows of the stress-name embedding table.
        embedding_dim: size of each embedding vector.
        input_dim_upstream200: number of features of the flattened upstream200 input.
        hidden_dim: LSTM hidden size.
        output_dim: size of the final linear output.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, species_dim, stress_name_dim, embedding_dim, input_dim_upstream200, hidden_dim, output_dim, num_layers):
        super().__init__()
        # int() accepts plain ints as well as 0-d tensors such as `tensor.max() + 1`.
        self.species_embedding = nn.Embedding(int(species_dim), embedding_dim)
        self.stress_name_embedding = nn.Embedding(int(stress_name_dim), embedding_dim)
        self.lstm = nn.LSTM(embedding_dim * 2 + input_dim_upstream200, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    @staticmethod
    def _to_class_index(codes):
        """Return one class index per sample, shape ``(batch,)``.

        A 2-D tensor with more than one column is treated as one-hot rows and
        reduced with ``argmax``; anything else (``(batch,)`` or ``(batch, 1)``)
        is taken to hold the indices already. Feeding one-hot 0/1 values
        straight into ``nn.Embedding`` would look up only rows 0 and 1, and
        after summing every sample would get the same vector.
        """
        if codes.dim() == 2 and codes.size(1) > 1:
            return codes.argmax(dim=1)
        return codes.reshape(-1)

    def forward(self, species, upstream200, stress_name):
        """Compute predictions of shape ``(batch, output_dim)``.

        Args:
            species: integer tensor, one-hot rows or class indices.
            upstream200: float tensor whose trailing dims flatten to
                ``input_dim_upstream200`` features per sample.
            stress_name: integer tensor, one-hot rows or class indices.
        """
        species_embedded = self.species_embedding(self._to_class_index(species))
        stress_name_embedded = self.stress_name_embedding(self._to_class_index(stress_name))
        upstream_flat = upstream200.reshape(upstream200.size(0), -1)
        # One feature vector per sample, presented to the LSTM as a single time step.
        x = torch.cat((species_embedded, upstream_flat, stress_name_embedded), dim=1).unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        x = self.fc(lstm_out[:, -1, :])
        return x


def _count_categories(*code_tensors):
    """Embedding-table size needed for the given categorical tensors.

    One-hot input (2-D, more than one column) has one category per column;
    index input needs ``max index + 1`` rows, taken over all tensors passed in
    so that a category present only in the test split is still in range.
    """
    reference = code_tensors[0]
    if reference.dim() == 2 and reference.size(1) > 1:
        return int(reference.size(1))
    return max(int(t.max()) for t in code_tensors) + 1


# Make sure species_dim and stress_name_dim exceed the largest class index
species_dim = _count_categories(X_train_species_tensor, X_test_species_tensor)
stress_name_dim = _count_categories(X_train_stress_name_tensor, X_test_stress_name_tensor)
embedding_dim = 32  # embedding size
input_dim_upstream200 = X_train_upstream200_tensor.shape[1] * X_train_upstream200_tensor.shape[2]
hidden_dim = 256  # larger LSTM hidden size
output_dim = 1
num_layers = 3  # more LSTM layers

# Instantiate the model
model = SimpleLSTMWithEmbedding(species_dim, stress_name_dim, embedding_dim, input_dim_upstream200, hidden_dim, output_dim, num_layers)


In [None]:
# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

best_test_loss = float('inf')
# NOTE(review): the transformer training cell writes to this same path, so
# running both overwrites the checkpoint of whichever model ran first.
best_model_path = '/content/drive/MyDrive/MLRG/best_model_weights.pth'

# Train the model
num_epochs = 300
# Sample counts are loop-invariant; they turn summed losses into per-sample means.
n_train_samples = len(train_loader.dataset)
n_test_samples = len(test_loader.dataset)
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch_species, batch_upstream200, batch_stress_name, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_species, batch_upstream200, batch_stress_name)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        epoch_loss += loss.item() * batch_y.size(0)

    # Print the average training loss of this epoch
    epoch_loss /= n_train_samples
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {epoch_loss:.4f}')

    # Evaluate on the test set
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for batch_species, batch_upstream200, batch_stress_name, batch_y in test_loader:
            outputs = model(batch_species, batch_upstream200, batch_stress_name)
            loss = criterion(outputs, batch_y)
            test_loss += loss.item() * batch_y.size(0)

    test_loss /= n_test_samples
    print(f'Epoch [{epoch + 1}/{num_epochs}], Test Loss: {test_loss:.4f}')

    # Keep the weights with the lowest test loss seen so far.
    # NOTE(review): selecting the checkpoint on the test set makes the reported
    # test loss optimistic; a separate validation split would avoid that.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), best_model_path)
        print(f'Best model weights saved to {best_model_path}')


Epoch [1/300], Average Loss: 0.7780
Epoch [1/300], Test Loss: 0.7771
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [2/300], Average Loss: 0.7779
Epoch [2/300], Test Loss: 0.7771
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [3/300], Average Loss: 0.7779
Epoch [3/300], Test Loss: 0.7771
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [4/300], Average Loss: 0.7779
Epoch [4/300], Test Loss: 0.7771
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [5/300], Average Loss: 0.7779
Epoch [5/300], Test Loss: 0.7774
Epoch [6/300], Average Loss: 0.7779
Epoch [6/300], Test Loss: 0.7772
Epoch [7/300], Average Loss: 0.7779
Epoch [7/300], Test Loss: 0.7771


# Transformer

In [None]:
def _one_hot_to_index(value):
    """Return the class index of a one-hot vector; pass scalars through unchanged.

    np.argmax of a scalar is always 0, so without the scalar branch a second
    run of this cell (or a run after the columns were already decoded) would
    map every row to class 0.
    """
    if np.ndim(value) == 0:
        return int(value)
    return int(np.argmax(value))


# Collapse the one-hot columns to integer class indices.
data['species'] = data['species'].apply(_one_hot_to_index)
data['stress_name'] = data['stress_name'].apply(_one_hot_to_index)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim


# Make sure every cell of the three feature columns is a NumPy array.
for feature_column in ('species', 'upstream200', 'stress_name'):
    data[feature_column] = data[feature_column].apply(lambda cell: np.array(cell))

# Stack the per-row arrays into model inputs; the target is the 'stress' column.
X_species = np.vstack(data['species'].values)
X_upstream200 = np.stack(data['upstream200'].values)
X_stress_name = np.vstack(data['stress_name'].values)
y_stress = data['stress'].values

# 80/20 train/test split with a fixed seed; all four arrays are split with the same row selection.
split_arrays = train_test_split(
    X_species,
    X_upstream200,
    X_stress_name,
    y_stress,
    test_size=0.2,
    random_state=42,
)
(
    X_train_species, X_test_species,
    X_train_upstream200, X_test_upstream200,
    X_train_stress_name, X_test_stress_name,
    y_train, y_test,
) = split_arrays

# Tensors for the training split.
X_train_species_tensor = torch.tensor(X_train_species, dtype=torch.long)
X_train_upstream200_tensor = torch.tensor(X_train_upstream200, dtype=torch.float32)
X_train_stress_name_tensor = torch.tensor(X_train_stress_name, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

# Tensors for the test split.
X_test_species_tensor = torch.tensor(X_test_species, dtype=torch.long)
X_test_upstream200_tensor = torch.tensor(X_test_upstream200, dtype=torch.float32)
X_test_stress_name_tensor = torch.tensor(X_test_stress_name, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pickle

# Re-read the preprocessed dataset from the pickle file.
# NOTE: pickle can execute arbitrary code while loading; only open files you trust.
file_path = '/content/drive/MyDrive/MLRG/processed_data.pkl'
with open(file_path, mode='rb') as pickle_handle:
    data = pickle.load(pickle_handle)

# Convert nested lists to NumPy arrays (assumes the cells already hold lists).
data['species'] = data['species'].apply(lambda row: np.array(row))
data['upstream200'] = data['upstream200'].apply(lambda row: np.array(row))
data['stress_name'] = data['stress_name'].apply(lambda row: np.array(row))

# Inputs and target.
species_rows = data['species'].values
upstream_rows = data['upstream200'].values
stress_name_rows = data['stress_name'].values
X_species = np.vstack(species_rows)
X_upstream200 = np.stack(upstream_rows)
X_stress_name = np.vstack(stress_name_rows)
y_stress = data['stress'].values

# Train/test split (20% test, fixed seed).
(X_train_species, X_test_species,
 X_train_upstream200, X_test_upstream200,
 X_train_stress_name, X_test_stress_name,
 y_train, y_test) = train_test_split(
    X_species, X_upstream200, X_stress_name, y_stress,
    test_size=0.2, random_state=42,
)

# Categorical inputs as int64 tensors.
X_train_species_tensor = torch.tensor(X_train_species, dtype=torch.long)
X_test_species_tensor = torch.tensor(X_test_species, dtype=torch.long)
X_train_stress_name_tensor = torch.tensor(X_train_stress_name, dtype=torch.long)
X_test_stress_name_tensor = torch.tensor(X_test_stress_name, dtype=torch.long)

# Sequence features as float32 tensors.
X_train_upstream200_tensor = torch.tensor(X_train_upstream200, dtype=torch.float32)
X_test_upstream200_tensor = torch.tensor(X_test_upstream200, dtype=torch.float32)

# Targets as float32 column vectors.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)




In [None]:
# Build the data loaders
batch_size = 128

# Each dataset item is (species, upstream200, stress_name, target).
train_tensors = (
    X_train_species_tensor,
    X_train_upstream200_tensor,
    X_train_stress_name_tensor,
    y_train_tensor,
)
test_tensors = (
    X_test_species_tensor,
    X_test_upstream200_tensor,
    X_test_stress_name_tensor,
    y_test_tensor,
)
train_dataset = TensorDataset(*train_tensors)
test_dataset = TensorDataset(*test_tensors)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class SimpleTransformerWithEmbedding(nn.Module):
    """Transformer-encoder regressor over species / stress-condition embeddings plus the flattened upstream sequence.

    The two embeddings and the flattened ``upstream200`` features are
    concatenated into one vector of size
    ``embedding_dim * 2 + input_dim_upstream200`` per sample, passed through
    the encoder, and mapped to the output by a linear head.

    Args:
        species_dim: number of rows of the species embedding table.
        stress_name_dim: number of rows of the stress-name embedding table.
        embedding_dim: size of each embedding vector.
        input_dim_upstream200: number of features of the flattened upstream200 input.
        output_dim: size of the final linear output.
        nhead: number of attention heads of the encoder layer.
        num_layers: number of encoder layers.
        dim_feedforward: hidden size of the encoder's feed-forward sub-layer.
    """

    def __init__(self, species_dim, stress_name_dim, embedding_dim, input_dim_upstream200, output_dim, nhead, num_layers, dim_feedforward):
        super().__init__()
        # int() accepts plain ints as well as 0-d tensors such as `tensor.max() + 1`.
        self.species_embedding = nn.Embedding(int(species_dim), embedding_dim)
        self.stress_name_embedding = nn.Embedding(int(stress_name_dim), embedding_dim)
        model_dim = embedding_dim * 2 + input_dim_upstream200
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(model_dim, output_dim)

    @staticmethod
    def _to_class_index(codes):
        """Return one class index per sample, shape ``(batch,)``.

        A 2-D tensor with more than one column is treated as one-hot rows and
        reduced with ``argmax``; anything else (``(batch,)`` or ``(batch, 1)``)
        is taken to hold the indices already. Feeding one-hot 0/1 values
        straight into ``nn.Embedding`` would look up only rows 0 and 1, and
        after summing every sample would get the same vector.
        """
        if codes.dim() == 2 and codes.size(1) > 1:
            return codes.argmax(dim=1)
        return codes.reshape(-1)

    def forward(self, species, upstream200, stress_name):
        """Compute predictions of shape ``(batch, output_dim)``.

        Args:
            species: integer tensor, one-hot rows or class indices.
            upstream200: float tensor whose trailing dims flatten to
                ``input_dim_upstream200`` features per sample.
            stress_name: integer tensor, one-hot rows or class indices.
        """
        species_embedded = self.species_embedding(self._to_class_index(species))
        stress_name_embedded = self.stress_name_embedding(self._to_class_index(stress_name))
        upstream_flat = upstream200.reshape(upstream200.size(0), -1)
        x = torch.cat((species_embedded, upstream_flat, stress_name_embedded), dim=1)
        # The encoder layer is built with its default (sequence-first) layout,
        # so the added leading axis is a sequence axis of length 1.
        x = self.transformer_encoder(x.unsqueeze(0)).squeeze(0)
        x = self.fc(x)
        return x


def _count_categories(*code_tensors):
    """Embedding-table size needed for the given categorical tensors.

    One-hot input (2-D, more than one column) has one category per column;
    index input needs ``max index + 1`` rows, taken over all tensors passed in
    so that a category present only in the test split is still in range.
    """
    reference = code_tensors[0]
    if reference.dim() == 2 and reference.size(1) > 1:
        return int(reference.size(1))
    return max(int(t.max()) for t in code_tensors) + 1


# Make sure species_dim and stress_name_dim exceed the largest class index
species_dim = _count_categories(X_train_species_tensor, X_test_species_tensor)
stress_name_dim = _count_categories(X_train_stress_name_tensor, X_test_stress_name_tensor)
embedding_dim = 32  # reduced embedding size
input_dim_upstream200 = X_train_upstream200_tensor.shape[1] * X_train_upstream200_tensor.shape[2]
output_dim = 1
nhead = 1
num_layers = 1  # fewer Transformer layers
dim_feedforward = 64  # reduced feed-forward size

# Instantiate the model
model = SimpleTransformerWithEmbedding(species_dim, stress_name_dim, embedding_dim, input_dim_upstream200, output_dim, nhead, num_layers, dim_feedforward)




In [None]:
# Report the largest value found in each categorical training array.
max_species_index = X_train_species.max()
max_stress_name_index = X_train_stress_name.max()
print(f"Max species index: {max_species_index}")
print(f"Max stress_name index: {max_stress_name_index}")


Max species index: 29
Max stress_name index: 10


In [None]:
# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

best_test_loss = float('inf')
# NOTE(review): the LSTM training cell writes to this same path, so running
# both overwrites the checkpoint of whichever model ran first.
best_model_path = '/content/drive/MyDrive/MLRG/best_model_weights.pth'

# Train the model
num_epochs = 300
# Sample counts are loop-invariant; they turn summed losses into per-sample means.
n_train_samples = len(train_loader.dataset)
n_test_samples = len(test_loader.dataset)
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch_species, batch_upstream200, batch_stress_name, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_species, batch_upstream200, batch_stress_name)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not count as much as a full one. This also
        # makes the figure comparable with a per-sample MSE over the dataset.
        epoch_loss += loss.item() * batch_y.size(0)

    # Print the average training loss of this epoch
    epoch_loss /= n_train_samples
    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Loss: {epoch_loss:.4f}')

    # Evaluate on the test set
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for batch_species, batch_upstream200, batch_stress_name, batch_y in test_loader:
            outputs = model(batch_species, batch_upstream200, batch_stress_name)
            loss = criterion(outputs, batch_y)
            test_loss += loss.item() * batch_y.size(0)

    test_loss /= n_test_samples
    print(f'Epoch [{epoch + 1}/{num_epochs}], Test Loss: {test_loss:.4f}')

    # Keep the weights with the lowest test loss seen so far.
    # NOTE(review): selecting the checkpoint on the test set makes the reported
    # test loss optimistic; a separate validation split would avoid that.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), best_model_path)
        print(f'Best model weights saved to {best_model_path}')


Epoch [1/300], Average Loss: 0.7166
Epoch [1/300], Test Loss: 0.7329
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [2/300], Average Loss: 0.7165
Epoch [2/300], Test Loss: 0.7322
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [3/300], Average Loss: 0.7148
Epoch [3/300], Test Loss: 0.7335
Epoch [4/300], Average Loss: 0.7138
Epoch [4/300], Test Loss: 0.7263
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [5/300], Average Loss: 0.7134
Epoch [5/300], Test Loss: 0.7262
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [6/300], Average Loss: 0.7140
Epoch [6/300], Test Loss: 0.7332
Epoch [7/300], Average Loss: 0.7123
Epoch [7/300], Test Loss: 0.7236
Best model weights saved to /content/drive/MyDrive/MLRG/best_model_weights.pth
Epoch [8/300], Average Loss: 0.7120
Epoch [8/300], Test Loss: 0.7327
Epoch [9/300], Average Loss: 0.7107
Epoch [9/300], Te

KeyboardInterrupt: 

In [None]:
def _collect_predictions(loader):
    """Run the global `model` over `loader`; return (targets, predictions) as lists of per-sample arrays."""
    targets, predictions = [], []
    for species_b, upstream_b, stress_name_b, target_b in loader:
        batch_out = model(species_b, upstream_b, stress_name_b)
        predictions += list(batch_out.numpy())
        targets += list(target_b.numpy())
    return targets, predictions


# Inference only: switch to eval mode and disable gradient tracking.
model.eval()
with torch.no_grad():
    y_true_train, y_pred_train = _collect_predictions(train_loader)
    y_true_test, y_pred_test = _collect_predictions(test_loader)

# Score the in-memory model on both splits.
train_mse = mean_squared_error(y_true_train, y_pred_train)
train_r2 = r2_score(y_true_train, y_pred_train)
test_mse = mean_squared_error(y_true_test, y_pred_test)
test_r2 = r2_score(y_true_test, y_pred_test)

print(f'Train MSE: {train_mse:.4f}, Train R²: {train_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, Test R²: {test_r2:.4f}')


Train MSE: 0.6530, Train R²: 0.1606
Test MSE: 0.7088, Test R²: 0.0879
