In [1]:
import os

# 设置环境变量
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 打印环境变量以确认设置成功
print(os.environ.get('HF_ENDPOINT'))

https://hf-mirror.com


In [2]:
from transformers import AutoTokenizer, AutoModel
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

2025-03-09 20:39:08.963201: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-09 20:39:08.975167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-09 20:39:08.989704: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-09 20:39:08.994092: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-09 20:39:09.004966: I tensorflow/core/platform/cpu_feature_guar

In [3]:
#set tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v0")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
#set model
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gene_eng_gpt2_v0", num_labels=3, problem_type="regression")
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from datasets import load_dataset,DatasetDict
# 1. load ~11k samples from promoters prediction dataset
dataset = load_dataset("json", data_files="rna_pos_1024_train.jsonl")['train'].train_test_split(test_size=0.1)
# dataset_val = load_dataset("json", data_files="rna_pos_1024_val.jsonl")

# dataset = DatasetDict({
#     "train":dataset_train["train"],
#     "test":dataset_val["train"] }
# )

dataset

DatasetDict({
    train: Dataset({
        features: ['seq', 'label'],
        num_rows: 123385
    })
    test: Dataset({
        features: ['seq', 'label'],
        num_rows: 13710
    })
})

In [7]:
dataset["test"][1242]

{'seq': 'GGACGAGGAAACACTTGGGCGGCTTCGAAAGATGCCTATTTCGACCGGTGTTTGCGGGTCAAGAAGCGACATGGACCCCAAAGATCGGCTTGGTCTGACGAAGTGAATGGCGAGGTAATAAGTAGAGTCACCAAGACCCTCTTATCCACAGCTAGTGCTATTTTTGTATTTAGGTTAGCTATTTAGCTTTACGTTCCAGGATGCCTAGTGGCAGCCCCACAATATCCAGG',
 'label': [-54.77099990844727, -43.78300094604492, 19.11100006103516]}

In [8]:
token_len_list = []
for item in dataset["test"]:
    inputs = tokenizer.tokenize(item["seq"])
    token_len_list.append( len(inputs) )

mean_len = sum(token_len_list)/len(token_len_list)
min_len  = min(token_len_list)
max_len = max(token_len_list)

print("datasets ", "mean token lenght", mean_len, "min token length", min_len, "max token length", max_len)

datasets  mean token lenght 106.6472647702407 min token length 1 max token length 199


In [9]:
# 2. tokenize
def tokenize_function(examples):
    return tokenizer(examples['seq'], truncation=True, padding='max_length',max_length=256)

# 3. 对数据集应用分词函数
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=8)

# 4. 创建一个数据收集器，用于动态填充和遮蔽
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map (num_proc=8):   0%|          | 0/123385 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/13710 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import mean_squared_error


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions)
    return {"rmse": rmse}

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",            # 每个 epoch 保存一次模型
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=3,               # 最多保存 3 个 checkpoint
    load_best_model_at_end=True,      # 训练结束时加载最好的模型
    metric_for_best_model="eval_loss", # 使用 eval_loss 作为评估指标
    greater_is_better=False,          # eval_loss 越小越好
)

# 使用Trainer API进行训练（假设已有train_dataset和eval_dataset）
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rmse
1,3318.435,3583.534668,3583.526367
2,2887.7238,3022.019287,3022.016846
3,2416.6795,2614.540039,2614.532227
4,2109.736,2299.857666,2299.853271
5,1950.7916,2125.344971,2125.338623
6,1737.4627,2055.251953,2055.248291
7,1661.5099,2073.966064,2073.960938
8,1592.2287,2041.346558,2041.34375
9,1566.5035,2073.028809,2073.024658
10,1549.163,2089.218018,2089.20874


TrainOutput(global_step=61700, training_loss=2194.6143567666127, metrics={'train_runtime': 9213.6248, 'train_samples_per_second': 133.916, 'train_steps_per_second': 6.697, 'total_flos': 1.612019533676544e+17, 'train_loss': 2194.6143567666127, 'epoch': 10.0})

In [15]:
#模型测试
predictions = trainer.predict(tokenized_datasets["test"])
predictions

PredictionOutput(predictions=array([[-61.452217, -55.75406 , -26.49135 ],
       [188.73888 , 191.95325 , 224.40453 ],
       [105.570244, 102.61758 , 124.64346 ],
       ...,
       [188.68427 , 205.40616 , 187.1908  ],
       [201.12808 , 226.905   , 252.09097 ],
       [-20.086788, -12.286413,   2.185144]], dtype=float32), label_ids=array([[-55.652, -52.663, -34.913],
       [182.877, 206.286, 238.595],
       [101.682, 111.573, 118.957],
       ...,
       [185.887, 205.243, 189.923],
       [235.199, 225.855, 299.421],
       [-26.234,  -1.115,   3.715]], dtype=float32), metrics={'test_loss': 2041.3465576171875, 'test_rmse': 2041.34375, 'test_runtime': 32.623, 'test_samples_per_second': 420.256, 'test_steps_per_second': 21.028})

In [16]:
# 保存模型和 tokenizer 到本地目录
model.save_pretrained("./gene_eng_gpt2_v0_rna3d_ft_v1")  # 模型保存到 ./my_model 目录
tokenizer.save_pretrained("./gene_eng_gpt2_v0_rna3d_ft_v1")  # tokenizer 保存到 ./my_model 目录

('./gene_eng_gpt2_v0_rna3d_ft_v1/tokenizer_config.json',
 './gene_eng_gpt2_v0_rna3d_ft_v1/special_tokens_map.json',
 './gene_eng_gpt2_v0_rna3d_ft_v1/vocab.json',
 './gene_eng_gpt2_v0_rna3d_ft_v1/merges.txt',
 './gene_eng_gpt2_v0_rna3d_ft_v1/added_tokens.json')