In [4]:
import os

# Point the HF_ENDPOINT environment variable at the hf-mirror.com mirror.
# This runs before the transformers import in the next cell.
os.environ.update(HF_ENDPOINT='https://hf-mirror.com')

# Echo the variable back to confirm the assignment took effect.
print(os.environ['HF_ENDPOINT'])

https://hf-mirror.com


In [5]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import Dataset
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import numpy as np

2025-03-08 21:39:21.221333: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 21:39:21.234451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-08 21:39:21.249804: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-08 21:39:21.254488: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 21:39:21.266225: I tensorflow/core/platform/cpu_feature_guar

In [6]:
# Read the test sequences from test_sequences.csv (path is relative to the
# working directory) and show the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer='test_sequences.csv')
df

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,R1107,GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...,2022-05-28,CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...,>7QR4_1|Chain A|U1 small nuclear ribonucleopro...
1,R1108,GGGGGCCACAGCAGAAGCGUUCACGUCGCGGCCCCUGUCAGCCAUU...,2022-05-27,CPEB3 ribozyme\nChimpanzee\nChimpanzee CPEB3 H...,">7QR3_1|Chains A, B|U1 small nuclear ribonucle..."
2,R1116,CGCCCGGAUAGCUCAGUCGGUAGAGCAGCGGCUAAAACAGCUCUGG...,2022-06-04,Cloverleaf RNA\nPoliovirus\nCrystal Structure ...,">8S95_1|Chain A[auth C]|Lysine tRNA scaffold,P..."
3,R1117v2,UUGGGUUCCCUCACCCCAAUCAUAAAAAGG,2022-06-03,PreQ1 class I type III riboswitch\nK. pneumoni...,">8FZA_1|Chains A, B|PreQ1 Riboswitch (30-MER)|..."
4,R1126,GGAAUCUCGCCCGAUGUUCGCAUCGGGAUUUGCAGGUCCAUGGAUU...,2022-06-11,Traptamer\nSynthetic\nAdditional Information: ...,>8TVZ_1|Chain A[auth C]|RNA (363-MER)|syntheti...
5,R1128,GGAAUAUCGUCAUGGUGAUUCGUCACCAUGAGGCUAGAUCUCAUAU...,2022-06-10,6WJ\nSingle-stranded Paranemic Crossover RNA T...,>8BTZ_1|Chain A|RNA Paranemic croosover triang...
6,R1136,GGAUACGUCUACGCUCAGUGACGGACUCUCUUCGGAGAGUCUGACA...,2022-06-18,Apta-FRET\nAdditional Information: Info...,>7ZJ4_1|Chain A[auth E]|brocolli-pepper aptame...
7,R1138,GGGAGAGUACUAUUCAGAUGCAGACCGCAAGUUCAGAGCGGUUUGC...,2022-06-24,6HBC-Young\nAdditional Information: Thi...,>7PTK_1|Chain A[auth B]|RNA|synthetic construc...
8,R1149,GGACACGAGUAACUCGUCUAUCUUCUGCAGGCUGCUUACGGUUUCG...,2022-07-02,SARS-CoV-2 SL5\nAdditional Information: ...,>8UYS_1|Chain A|SARS-CoV-2 RNA SL5 domain.|Sev...
9,R1156,GGAGCAUCGUGUCUCAAGUGCUUCACGGUCACAAUAUACCGUUUCG...,2022-07-07,BtCoV-HKU5 SL5\nBtCoV-HKU5 5 proximal stem-loo...,>8UYE_1|Chain A|BtCoV-HKU5 5' proximal stem-lo...


In [7]:
# Print the sequence and then the target id stored under index label 0.
for column in ("sequence", "target_id"):
    print(df.loc[0, column])

GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUUGCACUCCGGCUGCGAAUUCUGCU
R1107


In [9]:
# Load the fine-tuned sequence-classification checkpoint and its tokenizer
# by checkpoint name (nothing here comes from a Trainer object).

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v0_rna3d_ft")
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gene_eng_gpt2_v0_rna3d_ft")
model = model.to(device)
model.eval()  # switch to evaluation mode

def get_rna_pos(seq):
    """
    Predict the 3D coordinate of the last residue of a sequence.

    The sequence is tokenized (truncated / padded to 256 tokens), run through
    the module-level ``model`` on ``device`` without gradient tracking, and the
    logits are returned as a NumPy array.

    Args:
        seq: Sequence string to feed to the tokenizer.

    Returns:
        NumPy array holding the model logits with the leading batch axis
        removed; per the original author's notes this is ``[x, y, z]``.
    """
    # Tokenize and pad to a fixed length.
    # NOTE(review): inputs longer than 256 tokens are truncated on whichever
    # side the tokenizer's truncation_side selects; if that drops the tail,
    # the "last residue" is no longer in the input. Confirm this is intended.
    encoded = tokenizer(
        seq,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors="pt",
    )
    # Move both tensors to the model's device (author's note: shape (1, 256)).
    model_inputs = {
        key: encoded[key].to(device) for key in ("input_ids", "attention_mask")
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits  # author's note: shape (1, 3)

    # Drop the batch axis, bring the tensor back to the CPU and convert to NumPy.
    return logits.squeeze(0).cpu().numpy()

def get_rna_all_pos(sequence, max_len=1024, predict_fn=None):
    """
    Predict a 3D coordinate for every residue of an RNA sequence.

    Residue ``i`` is predicted by passing the prefix ``sequence[:i + 1]`` (with
    every "U" rewritten to "T") to the last-residue predictor, so exactly one
    predictor call is made per residue.

    Args:
        sequence: RNA sequence string.
        max_len: Maximum number of characters of each prefix handed to the
            predictor; longer prefixes keep only their last ``max_len``
            characters. Defaults to 1024, the previously hard-coded window.
        predict_fn: Optional callable that takes the prefix string and returns
            an object exposing ``.tolist()``. Defaults to ``get_rna_pos``.

    Returns:
        A list with one entry per residue, each being ``.tolist()`` of the
        predictor's result for the corresponding prefix.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")
    predict = get_rna_pos if predict_fn is None else predict_fn

    # U --> T once for the whole sequence. The replacement is per character,
    # so slicing the converted string equals converting each prefix, without
    # re-scanning an ever-growing prefix on every iteration.
    dna = sequence.replace("U", "T")

    seq_pos_list_local = []
    for end in range(1, len(dna) + 1):
        # Keep at most the last `max_len` characters of the prefix.
        start = max(0, end - max_len)
        seq_pos_list_local.append(predict(dna[start:end]).tolist())
    return seq_pos_list_local

def get_csv_data(target_id, seq):
    """
    Build the CSV rows for one target sequence.

    One row (dict) is produced per residue. Each row carries the residue ID,
    name and 1-based position, followed by the coordinate columns
    ``x_1, y_1, z_1, ..., x_5, y_5, z_5``. The single predicted coordinate of
    the residue is written into all five coordinate slots.

    Args:
        target_id: Identifier of the target; formatted into the ``ID`` column
            as ``"<target_id>_<resid>"``. Non-string values are accepted.
        seq: Sequence string; each character becomes one row's ``resname``.

    Returns:
        List of dicts, one per residue, in sequence order.
    """
    # Number of coordinate triplets written per row.
    # NOTE(review): presumably the submission format expects five candidate
    # structures per residue; confirm against the target format.
    n_structures = 5

    seq_pos_list = get_rna_all_pos(seq)
    data_list = []
    for resid, (res, pos_list) in enumerate(zip(seq, seq_pos_list), start=1):
        x, y, z = pos_list[0], pos_list[1], pos_list[2]
        data = {
            # f-string instead of `+` so a target_id that pandas parsed as a
            # non-string (e.g. a purely numeric id) does not raise TypeError.
            "ID": f"{target_id}_{resid}",
            "resname": res,
            "resid": str(resid),
        }
        # Same key order as before: x_k, y_k, z_k for k = 1..5.
        for k in range(1, n_structures + 1):
            data[f"x_{k}"] = x
            data[f"y_{k}"] = y
            data[f"z_{k}"] = z
        data_list.append(data)
    return data_list

tokenizer_config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [10]:
# Collect the per-residue rows of every target into one flat list.
csv_data_list = []
for target_id, seq in zip(df["target_id"], df["sequence"]):
    csv_data_list += get_csv_data(target_id, seq)

In [11]:
# Convert the collected row dicts into a DataFrame.
# A dedicated name is used so `df` keeps holding the input sequences; rebinding
# `df` here made the prediction cell fail on df["target_id"] when re-run.
submission_df = pd.DataFrame(csv_data_list)

# Write the CSV file without the pandas index column.
submission_df.to_csv("submission.csv", index=False)