In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
import os

In [None]:
# Select the compute device: prefer Apple MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Seed PyTorch's global RNG so that layer initialisation and DataLoader
# shuffling are repeatable from one "Restart & Run All" to the next.
seed = 42
torch.manual_seed(seed)

# Training hyperparameters.
batch_size = 1    # samples per optimisation step
lr = 5e-5         # learning rate passed to the optimizer
max_length = 128  # tokenizer truncation limit (in tokens)
epochs = 5        # number of passes over the training set

In [None]:
# Pretrained checkpoint name and the local directory used as its download cache.
model_name = "facebook/esm2_t33_650M_UR50D"
model_path = "./models"
# Create the cache directory if it is missing. exist_ok=True replaces the
# separate exists() check and is safe to re-run.
os.makedirs(model_path, exist_ok=True)
# Load the tokenizer and the pretrained encoder, caching files under model_path.
Tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_path)
model = AutoModel.from_pretrained(model_name, cache_dir=model_path)

In [None]:
# Load the dataset.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
file_path = "./data/df_combined_filtered_grouped.pkl"
df = pd.read_pickle(file_path)
# Keep only the rows whose 'activity_ecoli' value is not null.
# (The variable name mentions "toxicity", but the filter is on 'activity_ecoli'.)
df_non_null_toxicity = df[df['activity_ecoli'].notna()]
data = df_non_null_toxicity[['sequence', 'activity_ecoli']]
data_dict = {
    "sequence": data['sequence'].tolist(),
    "activity_ecoli": data['activity_ecoli'].tolist(),
}

# Fail early if the filter left nothing to train on.
# A previous run reported 1865 sequences and 1865 labels at this point.
assert len(data_dict['sequence']) > 0, "no rows with a non-null 'activity_ecoli' value"

In [None]:
# Dataset class
class SequenceActivityEcoliDataset(Dataset):
    """Pairs tokenized sequences with their ``activity_ecoli`` regression targets.

    Every sequence is tokenized once, at construction time, with
    ``padding=True``, ``truncation=True`` and the given ``max_length``.
    Targets are stored as a float32 tensor.

    Args:
        sequences: the sequences to tokenize.
        activity_ecoli: one numeric target per sequence.
        tokenizer: callable used to encode ``sequences``.
        max_length: truncation limit forwarded to the tokenizer.
    """

    def __init__(self, sequences, activity_ecoli, tokenizer=Tokenizer, max_length=max_length):
        self.sequences = sequences
        self.activity_ecoli = torch.tensor(activity_ecoli, dtype=torch.float32)
        tokenizer_options = dict(
            return_tensors="pt", padding=True, truncation=True, max_length=max_length
        )
        self.encoded_sequences = tokenizer(sequences, **tokenizer_options)

    def __len__(self):
        """Number of sequences held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the encoded fields and the target for position ``idx``."""
        encoded_item = {}
        for field_name, field_tensor in self.encoded_sequences.items():
            encoded_item[field_name] = field_tensor[idx]
        target = self.activity_ecoli[idx]
        return {'sequence': encoded_item, 'activity_ecoli': target}


In [None]:
# Train/validation split: hold out 20% of the pairs, with a fixed random_state
# so the partition is the same on every run.
(train_sequences,
 val_sequences,
 train_activity_ecoli,
 val_activity_ecoli) = train_test_split(
    data_dict['sequence'], data_dict['activity_ecoli'], test_size=0.2, random_state=42
)

# Tokenized datasets, one per partition.
train_dataset = SequenceActivityEcoliDataset(train_sequences, train_activity_ecoli, tokenizer=Tokenizer)
val_dataset = SequenceActivityEcoliDataset(val_sequences, val_activity_ecoli, tokenizer=Tokenizer)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Model definition
class Activity_ecoliPredictor(nn.Module):
    """Encoder plus a single linear layer that regresses one scalar per sequence.

    Args:
        base_model: encoder module exposing ``config.hidden_size`` and returning
            an object with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        hidden_dim = base_model.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map the first token's hidden state to one value per row."""
        encoder_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.regressor(first_token_state)

In [None]:
# Instantiate the regression model and move it to the selected device.
Activity_ecoli_model = Activity_ecoliPredictor(model)
Activity_ecoli_model.to(device)

# Loss function and optimizer.
criterion = nn.MSELoss()
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated. eps and weight_decay are pinned to the values the transformers
# implementation used by default, so the hyperparameters stay the same.
# NOTE(review): once this is in place, the `from transformers import AdamW`
# import at the top of the notebook is no longer needed.
optimizer = torch.optim.AdamW(
    Activity_ecoli_model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
)

In [None]:
# Training and validation
save_dir = "./autodl-tmp/"
# Create the output directory once, up front, so the per-epoch checkpoints and
# the final save both have a target even if the loop body never runs.
os.makedirs(save_dir, exist_ok=True)

for epoch in range(epochs):
    # ---- Training phase ----
    Activity_ecoli_model.train()
    train_loss = 0.0
    train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} (Training)")
    for step, batch in enumerate(train_progress, start=1):
        input_ids = batch['sequence']['input_ids'].to(device)
        attention_mask = batch['sequence']['attention_mask'].to(device)
        labels = batch['activity_ecoli'].to(device)

        optimizer.zero_grad()
        outputs = Activity_ecoli_model(input_ids, attention_mask)
        # squeeze(-1) removes only the trailing size-1 output dimension, so the
        # batch dimension survives and the shape matches `labels` even when a
        # batch holds a single sample (a bare squeeze() would yield a 0-d tensor).
        loss = criterion(outputs.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        # Running mean over the batches processed so far in this epoch.
        avg_batch_loss = train_loss / step
        train_progress.set_postfix({"Batch Loss": batch_loss, "Avg Loss": avg_batch_loss})
    avg_train_loss = train_loss / len(train_dataloader)

    # Save a checkpoint of the weights after every epoch.
    cheakpoints_save_path = f"{save_dir}activity_ecoli_epoch{epoch+1}.pth"
    torch.save(Activity_ecoli_model.state_dict(), cheakpoints_save_path)

    # ---- Validation phase ----
    Activity_ecoli_model.eval()
    val_loss = 0.0
    val_progress = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} (Validation)")
    with torch.no_grad():
        for step, batch in enumerate(val_progress, start=1):
            input_ids = batch['sequence']['input_ids'].to(device)
            attention_mask = batch['sequence']['attention_mask'].to(device)
            labels = batch['activity_ecoli'].to(device)

            outputs = Activity_ecoli_model(input_ids, attention_mask)
            loss = criterion(outputs.squeeze(-1), labels)

            batch_loss = loss.item()
            val_loss += batch_loss
            # Running mean over the validation batches processed so far.
            avg_val_batch_loss = val_loss / step
            val_progress.set_postfix({"Batch Loss": batch_loss, "Avg Loss": avg_val_batch_loss})
    avg_val_loss = val_loss / len(val_dataloader)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Save the final model weights.
save_path = f"{save_dir}activity_ecoli_model.pth"
torch.save(Activity_ecoli_model.state_dict(), save_path)
print(f"Model saved to {save_path}")

In [None]:
# Evaluate the model on the validation set.
Activity_ecoli_model.eval()
predictions = []
true_values = []
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        input_ids = batch['sequence']['input_ids'].to(device)
        attention_mask = batch['sequence']['attention_mask'].to(device)
        # Labels are only collected for the metrics below, so they are not
        # moved to the compute device.
        labels = batch['activity_ecoli']

        outputs = Activity_ecoli_model(input_ids, attention_mask)

        # Flatten to one value per sample before collecting.
        predictions.extend(outputs.view(-1).cpu().numpy())
        true_values.extend(labels.view(-1).cpu().numpy())

# Report the metrics once (the original computed and printed them twice).
mse = mean_squared_error(true_values, predictions)
r2 = r2_score(true_values, predictions)
print(f"Mean Squared Error: {mse:.4f}, R^2 Score: {r2:.4f}")

In [None]:
# Predict activity_ecoli for a single sequence.
sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"
# Apply the same truncation limit that the dataset class uses during training.
encoded_sequence = Tokenizer(sequence, return_tensors="pt", truncation=True, max_length=max_length)
input_ids = encoded_sequence['input_ids'].to(device)
attention_mask = encoded_sequence['attention_mask'].to(device)
# Switch to eval mode explicitly so this cell does not depend on an earlier
# cell having done it, and skip autograd bookkeeping for inference.
Activity_ecoli_model.eval()
with torch.no_grad():
    activity_ecoli = Activity_ecoli_model(input_ids, attention_mask).item()
print(f"Predicted activity_ecoli: {activity_ecoli:.4f}")