In [13]:
# Read sequences from a CSV file, embed each one with ESMC, and save the result as a tensor file

import glob
import os

import pandas as pd
import torch
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig

In [14]:
# Loaded ESMC clients keyed by (model name, device), so the weights are read
# from disk once per kernel session instead of once per sequence.
_ESMC_CLIENTS = {}


def _get_esmc_client(model_name: str = "esmc_600m", device: str = "cpu"):
    """Return a cached ESMC client, loading it on first use."""
    key = (model_name, device)
    if key not in _ESMC_CLIENTS:
        # Change `device` to "cuda" to run on a GPU.
        _ESMC_CLIENTS[key] = ESMC.from_pretrained(model_name).to(device)
    return _ESMC_CLIENTS[key]


def ESMC_embedding(sequence: str):
    '''Embed a protein sequence with the ESMC model.

    Args:
        sequence: protein sequence string.

    Returns:
        The ``hidden_states`` attribute of the model's logits output.
        Observed in this notebook: size [36, 1, 37, 1152] for a 35-residue
        sequence, so the third dimension varies with sequence length.
    '''
    protein = ESMProtein(sequence=sequence)
    # Reuse the already-loaded model; reloading it per call dominated runtime.
    client = _get_esmc_client("esmc_600m", "cpu")
    protein_tensor = client.encode(protein)
    logits_output = client.logits(
        protein_tensor,
        LogitsConfig(sequence=True, return_embeddings=True, return_hidden_states=True),
    )
    return logits_output.hidden_states

In [15]:
# Input CSV of protein pairs and the directory where per-sequence tensors are stored.
ProteinPairsData_csv_path = "/home/users/hcdai/AI-peptide/Seq2Score/SiameseNetWork/abc.csv"
output_path = "/home/users/hcdai/AI-peptide/Seq2Score/SiameseNetWork/Seq2Tensor"

# Collect the names (without the ".pt" suffix) of tensors that already exist in
# the output directory, so those sequences are not embedded again.
# The pattern is "*.pt" rather than "*pt" so that only real ".pt" files match.
roteinPairsData_file_pattern = os.path.join(output_path, "*.pt")
roteinPairsData_Tensors = {
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob(roteinPairsData_file_pattern)
}

In [16]:
def seq2tensor(seq):
    """Embed `seq` and save the result as ``<output_path>/<seq>.pt`` unless already cached.

    Args:
        seq: protein sequence string; it is also used as the file name.

    Returns:
        True once the tensor for `seq` is known to exist on disk.

    Raises:
        ValueError: if `seq` is not a non-empty string (e.g. NaN from an
            empty CSV cell).
    """
    # Fail early with a clear message instead of deep inside the model or
    # at the string concatenation for the file name.
    if not isinstance(seq, str) or not seq:
        raise ValueError(f"expected a non-empty sequence string, got {seq!r}")

    if seq in roteinPairsData_Tensors:
        return True

    seq_tensor = ESMC_embedding(seq)

    os.makedirs(output_path, exist_ok=True)
    final_path = os.path.join(output_path, seq + '.pt')
    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated "<seq>.pt" that a later run would treat as done.
    tmp_path = final_path + '.tmp'
    torch.save(seq_tensor, tmp_path)
    os.replace(tmp_path, final_path)

    roteinPairsData_Tensors.add(seq)
    return True


In [17]:

# Load the protein-pair table; the first two columns hold the sequences.
data = pd.read_csv(ProteinPairsData_csv_path)

# Walk the table row by row and make sure both sequences of every pair
# have a saved tensor (seq2tensor skips those already on disk).
n_rows = len(data)
for row in range(n_rows):
    for col in (0, 1):
        seq2tensor(data.iloc[row, col])



In [18]:
# Sanity check: reload one saved tensor and report its shape, then report
# how many sequences are currently tracked as already embedded.
example_tensor_path = '/home/users/hcdai/AI-peptide/Seq2Score/SiameseNetWork/Seq2Tensor/DIEKLKEAASSIGLSSIQLGIALTQHYSELTNIFG.pt'
y = torch.load(example_tensor_path, weights_only=True)
print(y.shape)
n_cached = len(roteinPairsData_Tensors)
print(n_cached)

torch.Size([36, 1, 37, 1152])
719
