In [1]:
import os

# 设置环境变量
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 打印环境变量以确认设置成功
print(os.environ.get('HF_ENDPOINT'))

https://hf-mirror.com


In [2]:
from transformers import AutoTokenizer, AutoModel
from tokenizers import Tokenizer
from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

In [3]:
#set tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v0")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
#set model
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gene_eng_gpt2_v0", num_labels=3, problem_type="regression")
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset,DatasetDict
# 1. load ~11k samples from promoters prediction dataset
dataset_train = load_dataset("json", data_files="rna_pos_1024_train_no_none.jsonl")
dataset_val = load_dataset("json", data_files="rna_pos_1024_val_no_none.jsonl")

dataset = DatasetDict({
    "train":dataset_train["train"],
    "test":dataset_val["train"] }
)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['seq', 'label'],
        num_rows: 130950
    })
    test: Dataset({
        features: ['seq', 'label'],
        num_rows: 2515
    })
})

In [6]:
dataset["test"][1242]

{'seq': 'GGATACGTCTACGCTCAGTGACGGACTCTCTTCGGAGAGTCTGACATCCGAACCATACACGGATGTGCCTCGCCGAACAGTCTACGGCGAGCTTAAGCGCTGGGGACGCCCAACGCATCACAAAGACTGAGTGATGAACCAGAAGTATGGACTGGTTGCGTTGGTGGAGACGGTCGGGTCCAGTTCGCTGTCGAGTAGAGTGTGGGCTCCATCGACGCCGCTTTAAGGTCCCCAATCGTGGCGTGTCGGCCTGCTTCGGCAGGCACTGGCGCCGGGACCTTGAAGAGATGAGATTTCGATCTCATCTTTGGGTGTCT',
 'label': [141.51800537109375, 172.2519989013672, 124.7959976196289]}

In [7]:
token_len_list = []
for item in dataset["test"]:
    inputs = tokenizer.tokenize(item["seq"])
    token_len_list.append( len(inputs) )

mean_len = sum(token_len_list)/len(token_len_list)
min_len  = min(token_len_list)
max_len = max(token_len_list)

print("datasets ", "mean token lenght", mean_len, "min token length", min_len, "max token length", max_len)

datasets  mean token lenght 35.09025844930417 min token length 1 max token length 133


In [9]:
# 2. tokenize
def tokenize_function(examples):
    return tokenizer(examples['seq'], truncation=True, padding='max_length',max_length=256)

# 3. 对数据集应用分词函数
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=8)

# 4. 创建一个数据收集器，用于动态填充和遮蔽
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map (num_proc=8):   0%|          | 0/130950 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2515 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import mean_squared_error


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions)
    return {"rmse": rmse}

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=3,               # 最多保存 3 个 checkpoint
)

# 使用Trainer API进行训练（假设已有train_dataset和eval_dataset）
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rmse
1,1214.0706,5.964214370031225e+33,5.964214989001246e+33
2,1294.3806,5.964214370031225e+33,5.964214989001246e+33
3,1211.346,5.964214370031225e+33,5.964214989001246e+33
4,1112.1134,5.964214370031225e+33,5.964214989001246e+33
5,1073.7946,5.964214370031225e+33,5.964214989001246e+33
6,1041.5099,5.964214370031225e+33,5.964214989001246e+33
7,971.0549,5.964214370031225e+33,5.964214989001246e+33
8,898.0403,5.964214370031225e+33,5.964214989001246e+33
9,973.293,5.964214370031225e+33,5.964214989001246e+33
10,1034.2371,5.964214370031225e+33,5.964214989001246e+33


TrainOutput(global_step=65480, training_loss=1053.222846670739, metrics={'train_runtime': 9782.5556, 'train_samples_per_second': 133.861, 'train_steps_per_second': 6.694, 'total_flos': 1.71085592199168e+17, 'train_loss': 1053.222846670739, 'epoch': 10.0})

In [14]:
#模型测试
predictions = trainer.predict(tokenized_datasets["test"])
predictions

PredictionOutput(predictions=array([[ 28.463812,  21.273907,  26.50425 ],
       [ 22.752205,  17.67276 ,  21.739912],
       [ 16.271358,  16.555983,  18.032711],
       ...,
       [ 78.604294, 101.76971 ,  74.41993 ],
       [ 86.36459 ,  83.671036,  75.93454 ],
       [ 80.91185 ,  88.14135 ,  72.40309 ]], dtype=float32), label_ids=array([[ -5.499,   8.52 ,   8.605],
       [ -5.826,  10.453,  14.01 ],
       [ -5.849,  14.768,  17.585],
       ...,
       [ 99.012, 105.749, 113.074],
       [103.861, 103.453, 114.59 ],
       [106.745,  95.313, 112.321]], dtype=float32), metrics={'test_loss': 5.964214370031226e+33, 'test_rmse': 5.964214989001246e+33, 'test_runtime': 5.9612, 'test_samples_per_second': 421.896, 'test_steps_per_second': 21.137})

In [15]:
# 保存模型和 tokenizer 到本地目录
model.save_pretrained("./gene_eng_gpt2_v0_rna3d_ft_v2")  # 模型保存到 ./my_model 目录
tokenizer.save_pretrained("./gene_eng_gpt2_v0_rna3d_ft_v2")  # tokenizer 保存到 ./my_model 目录

('./gene_eng_gpt2_v0_rna3d_ft_v2/tokenizer_config.json',
 './gene_eng_gpt2_v0_rna3d_ft_v2/special_tokens_map.json',
 './gene_eng_gpt2_v0_rna3d_ft_v2/vocab.json',
 './gene_eng_gpt2_v0_rna3d_ft_v2/merges.txt',
 './gene_eng_gpt2_v0_rna3d_ft_v2/added_tokens.json')