In [1]:
# import subprocess
# import os
# import json

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value



from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
import evaluate
import numpy as np
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
import json
import random


# 假设你的JSON文件名为 'data.json' 并且每行是一个独立的JSON对象
raw_datasets = load_dataset('paws-x', 'en')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x


#分词器
tokenizer = AutoTokenizer.from_pretrained("dnagpt/gene_eng_gpt2_v1")
tokenizer.pad_token = tokenizer.eos_token

# 修改分词器的填充方向为左侧，默认有右侧，分类问题建议左侧
#tokenizer.padding_side = "left"


#分词函数
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True,max_length=256, padding="max_length")

#构建分词后的数据集
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

#训练数据构建
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


#指标函数定义
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'accuracy': (predictions==labels).sum() / len(labels)}



# 动态生成随机种子
#seed = random.randint(0, 10000)
seed = 3314
#print(f"Generated seed: {seed}")
result = {}
result["seed"] = seed

training_args = TrainingArguments(
    output_dir="ds_job_dna_2222",
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim='adamw_torch',
    weight_decay=0.0,
    seed=seed,  # 使用动态生成的随机种子
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=4, #训练多少轮
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True
)

#模型定义，文本分类模型
model = AutoModelForSequenceClassification.from_pretrained("dnagpt/gene_eng_gpt2_v1", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train() #模型训练

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


[2025-01-16 23:51:42,196] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6225,0.382532,0.828
2,0.3535,0.31776,0.8685
3,0.2494,0.307708,0.8945
4,0.1949,0.362022,0.8895


TrainOutput(global_step=9884, training_loss=0.3550827106644103, metrics={'train_runtime': 1473.1268, 'train_samples_per_second': 134.139, 'train_steps_per_second': 6.71, 'total_flos': 2.5816641551990784e+16, 'train_loss': 0.3550827106644103, 'epoch': 4.0})

In [2]:
#模型测试，英文数据集
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["en"] = ret


#模型测试，法文数据集
raw_datasets_fr = load_dataset('paws-x', 'fr')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-x
tokenized_datasets_fr = raw_datasets_fr.map(tokenize_function, batched=True)

predictions = trainer.predict(tokenized_datasets_fr["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["fr"] = ret

#模型测试，德文数据集
raw_datasets_de = load_dataset('google-research-datasets/paws-x', 'de')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
tokenized_datasets_de = raw_datasets_de.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_de["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["de"] = ret

#模型测试，中文数据集
raw_datasets_zh = load_dataset('google-research-datasets/paws-x', 'zh')  # 或者指定特定语言如 'zh' 表示中文,https://huggingface.co/datasets/google-research-datasets/paws-
tokenized_datasets_zh = raw_datasets_zh.map(tokenize_function, batched=True)

predictions = trainer.predict(tokenized_datasets_zh["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["zh"] = ret

#模型测试 dna数据集，150 bp长度 简单版本
raw_datasets_dna =load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_simple_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_dna["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["dna_sim_pair_simple_150bp"] = ret

#模型测试 dna数据集，150长度，复杂版本 不相似
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_150bp')['train'].train_test_split(test_size=0.2) #默认已经shuffle
tokenized_datasets_dna= raw_datasets_dna.map(tokenize_function, batched=True)

predictions = trainer.predict(tokenized_datasets_dna["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["dna_sim_pair_150bp"] = ret

#模型测试 dna数据集，50长度，复杂版本 不相似
raw_datasets_dna = load_dataset('dnagpt/gene_lan_transfer', 'dna_sim_pair_50bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
tokenized_datasets_dna = raw_datasets_dna.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_dna["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["dna_sim_pair_50bp"] = ret

#模型测试 蛋白质数据集，50长度/150bp，复杂版本 不相似
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_150bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["protein_sim_pair_150bp"] = ret

#模型测试 蛋白质数据集，150长度/450bp，复杂版本 不相似

raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'protein_sim_pair_450bp')['train'].train_test_split(test_size=0.1) #默认已经shuffle
tokenized_datasets_dna_protein = raw_datasets_dna_protein.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["protein_sim_pair_450bp"] = ret

#模型测试 dna-蛋白质序列
raw_datasets_dna_protein = load_dataset('dnagpt/gene_lan_transfer', 'dna_protein_pair')['train'].train_test_split(test_size=0.1) #默认已经shuffle

# 定义翻转标签的函数
def flip_labels(example):
    # 截取 sentence1 和 sentence2 的前 50 个字符,如果dna序列过长，bert分词会产生错误，只生成unk一个token
    example["sentence1"] = example["sentence1"]
    example["sentence2"] = example["sentence2"]
    #example['label'] = 1 - example['label']
    return example

# 应用翻转标签函数
flipped_datasets_dna_protein = raw_datasets_dna_protein.map(flip_labels, batched=False)

tokenized_datasets_dna_protein = flipped_datasets_dna_protein.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_datasets_dna_protein["test"])
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
ret = metric.compute(predictions=preds, references=predictions.label_ids)
result["dna_protein_pair"] = ret

print(json.dumps(result))

{"seed": 3314, "en": {"accuracy": 0.8945, "f1": 0.8873465029364656}, "fr": {"accuracy": 0.7565, "f1": 0.7488396080453842}, "de": {"accuracy": 0.7345, "f1": 0.7048360200111173}, "zh": {"accuracy": 0.6315, "f1": 0.5030343897505057}, "dna_sim_pair_simple_150bp": {"accuracy": 0.8722222222222222, "f1": 0.8800834202294057}, "dna_sim_pair_150bp": {"accuracy": 0.8255, "f1": 0.8238263503281171}, "dna_sim_pair_50bp": {"accuracy": 0.667, "f1": 0.5506072874493927}, "protein_sim_pair_150bp": {"accuracy": 0.97, "f1": 0.97}, "protein_sim_pair_450bp": {"accuracy": 0.9822222222222222, "f1": 0.9821029082774049}, "dna_protein_pair": {"accuracy": 0.4975, "f1": 0.01951219512195122}}


In [3]:
model.save_pretrained("gene_eng_gpt2_v1_ft")
tokenizer.save_pretrained("gene_eng_gpt2_v1_ft")

('gene_eng_gpt2_v1_ft/tokenizer_config.json',
 'gene_eng_gpt2_v1_ft/special_tokens_map.json',
 'gene_eng_gpt2_v1_ft/tokenizer.json')