In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import torch
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import evaluate
import random
from transformers import AutoTokenizer, ModernBertForSequenceClassification
import wandb
import accelerate
from tokenizers import Tokenizer

In [3]:
# 设置随机种子以确保结果可复现
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True


In [4]:

set_seed(3407)

In [5]:
# 加载数据集
dataset=load_dataset('json',data_files='/root/for_may_conference/remake/data_for_train_bert_need_shuffle.json')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'score'],
        num_rows: 78200
    })
})

In [6]:
# 数据集随机打乱
shuffled_dataset = dataset["train"].shuffle(seed=3407)

In [7]:
train_test_split = shuffled_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]


In [8]:
train_test_split = train_dataset.train_test_split(test_size=0.05)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


In [9]:
model_name = "neavo/modern_bert_multilingual"  # 替换为你的模型

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# 定义数据预处理函数
def preprocess_function(examples):
    # 假设您的JSON中每个样本的文本字段名为"text"
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=8912)

In [12]:
# 预处理数据集
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
tokenized_test = eval_dataset.map(preprocess_function, batched=True)

In [13]:
tokenized_train

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 66861
})

In [14]:
tokenized_eval

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 7820
})

In [15]:
tokenized_test

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 7820
})

In [16]:
# 删除原来的 label 字段并将 score 字段改名为 label
tokenized_train = tokenized_train.map(
    lambda x: {"label": x["score"]},  # 将 score 改名为 label
    remove_columns=["label"]         # 删除旧的 label 字段
)

In [17]:
tokenized_eval = tokenized_eval.map(
    lambda x: {"label": x["score"]},  # 将 score 改名为 label
    remove_columns=["label"]         # 删除旧的 label 字段
)

In [18]:
tokenized_test = tokenized_test.map(
    lambda x: {"label": x["score"]},  # 将 score 改名为 label
    remove_columns=["label"]         # 删除旧的 label 字段
)

In [19]:
print(train_dataset[0])

{'text': '首先,孔子生活在春秋战国时期,社会动荡,奴隶制逐渐崩溃,封建制逐渐形成.孔子的思想在当时就比较先进了,表现之一为其仁的学说,不正映射了奴隶制崩溃期人的解放吗?\n其次,就是在这段过渡期同时也是封建社会的形成期,"中"这一思想为大一统的封建王朝提供了一些虽然粗略但规模宏大的政治建设蓝图,他研究了以往的政治经验,做出了一定程度的总结,又加上一些适合社会发展情况的创造,给后代封建社会的统治规模打下了一些基础,把它称为封建社会的圣人,不是偶然的.\n孔子的落后面主要还是他带有等级观念,血统观念,狭隘的地域观念,表现在他讲君君,臣臣,父父,子子,讲正名,讲礼,讲君子小人,讲天命,讲内诸夏而外夷狄等.而且更由于孔子所处的时代的过渡性,以及他的政治地位(他既当过高级官吏,而且医生主要活动除了教育事业y也是奔走做官,就是教育事业,也主要是训练弟子们做官)的关系,他的思想有许多不彻底,不明朗,对上妥协,对劳动生产和劳动人民轻视的地方.这些地方集中的表现,就是他的中庸之道,这是他的软弱处.\n圣人也是人,有进步的一面,也有落后的一面,无不局囿在他的时代里.孔子讲仁,但同时讲礼,礼就限制了仁.他普及教育,但也并非普及到所有人,限度也仍然是有的.他讲礼,礼一般来说是落后的,但他注重你的内容而轻视礼的形式,这就又是改革.他讲天命,一般来说也是落后的,但他并没有迷信鬼神,也没有全部陷入宿命论,这仍是他进步的地方.他讲中庸,但是在和而不同上.\n至于孔子对后代的影响问题,就更复杂了,有好影响也有坏影响,在坏影响中,有些是孔子本来不对,也有的是孔子一些有益的东西,因为不正确的解释与理解,就变为有害的东西了,关于这方面,责任就不能完全由孔子来负了,所以我们一方面对孔子要有总的把握,一方面对他个别言论的实质和影响加以具体分析.', 'label': '教育', 'score': 0.467021226480655}


In [20]:
type(tokenized_train[0]['label'])

float

In [21]:
# 定义评估指标计算函数
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # 确保预测是一维的
    predictions = predictions.squeeze()
    
    # 计算MSE、MAE和R²
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    
    # 计算相关系数
    corr = np.corrcoef(predictions, labels)[0, 1]
    
    # 计算R²分数
    r2 = evaluate.load("r2").compute(predictions=predictions, references=labels)["r2"]
    
    return {
        "mse": mse,
        "mae": mae,
        "r2": r2,
        "correlation": corr
    }


In [22]:
# 加载模型 - 使用回归输出方式
model = ModernBertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=1,  # 回归任务设置为1
    problem_type="regression" , # 明确指定为回归问题
).to('cuda')

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at neavo/modern_bert_multilingual and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
run = wandb.init(
    project="modern_bert_regression",  # Specify your project
    config={                        # Track hyperparameters and metadata
        "learning_rate": 0.01,
        "epochs": 3,
    },
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpansy13[0m ([33mpansy13-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [24]:
wandb.login()




True

In [25]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name="modern_bert_regression",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="steps",  # 修改评估策略为按步数进行评估
    save_strategy="steps",        # 修改保存策略为按步数保存
    save_steps=2500,               # 每训练50步保存一次模型
    eval_steps=2500,               # 每训练50步进行一次评估
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="wandb",
    logging_dir="./logs",
    logging_steps=50,
    metric_for_best_model="mse",  # 以MSE为指标选择最佳模型
    greater_is_better=False,  # MSE越小越好
)



In [26]:
# 初始化Trainer
trainer = Trainer(1
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # 早停机制
)

In [None]:
# 训练模型
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

In [None]:
# 评估模型
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")