In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
from datasets import load_dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import torch
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import evaluate
import random
from transformers import AutoTokenizer, ModernBertForSequenceClassification
import wandb
import accelerate
from tokenizers import Tokenizer
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

In [2]:
# 设置随机种子以确保结果可复现
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True


In [3]:

set_seed(3407)

In [4]:
torch.cuda.device_count()

4

In [5]:
# 加载数据集
dataset=load_dataset('json',data_files='/root/for_may_conference/remake/data_for_train_bert_need_shuffle.json')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'score'],
        num_rows: 78200
    })
})

In [6]:
# 数据集随机打乱
shuffled_dataset = dataset["train"].shuffle(seed=3407)

In [7]:
train_test_split = shuffled_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]


In [8]:
train_test_split = train_dataset.train_test_split(test_size=0.05)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


In [9]:
model_name = "neavo/modern_bert_multilingual"  # 替换为你的模型

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# 定义数据预处理函数
def preprocess_function(examples):
    # 假设您的JSON中每个样本的文本字段名为"text"
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=8912)

In [12]:
# 预处理数据集
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
tokenized_test = eval_dataset.map(preprocess_function, batched=True)

In [13]:
tokenized_train

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 66861
})

In [14]:
tokenized_eval

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 7820
})

In [15]:
tokenized_test

Dataset({
    features: ['text', 'label', 'score', 'input_ids', 'attention_mask'],
    num_rows: 7820
})

In [16]:
type(train_dataset[0]['label'])

str

In [17]:
label_mapping = {
    "其它": 0, "政务": 1, "娱乐": 2, "科学": 3, "教育": 4, "时政": 5, "新闻": 6, "农业": 7,
    "房地产": 8, "douban": 9, "法律1": 10, "经济": 11, "法律": 12, "学习强国": 13, "电力": 14,
    "政府工作报告": 15, "外交": 16, "企业": 17
}

In [18]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        label = label_mapping[item['label']]
        score = item['score']
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long),
            'score': torch.tensor(score, dtype=torch.float)
        }


In [19]:
train_dataset = CustomDataset(tokenized_train)
eval_dataset = CustomDataset(tokenized_eval)
test_dataset = CustomDataset(tokenized_test)

In [20]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [21]:
# 加载模型 - 使用回归输出方式
model = ModernBertForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=18,  
).to('cuda')

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at neavo/modern_bert_multilingual and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
model.device

device(type='cuda', index=0)

In [23]:
model = model.to('cuda')

In [24]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)


In [24]:
run = wandb.init(
    project="modern_bert_regression_only_cf",  # Specify your project
    config={                        # Track hyperparameters and metadata
        "learning_rate": 0.01,
        "epochs": 3,
    },
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mcasit205[0m ([33mcasit205-ucas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [25]:
def compute_metrics(p): # p 是 EvalPrediction 对象
    preds = np.argmax(p.predictions, axis=1) # p.predictions 是 logits
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted') # 或者 'macro'
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


In [26]:
training_args = TrainingArguments(
    output_dir="./only_cf",
    run_name="modern_bert_regression_only_cf",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="steps",  # 修改评估策略为按步数进行评估
    save_strategy="steps",        # 修改保存策略为按步数保存
    save_steps=2500,               # 每训练50步保存一次模型
    eval_steps=2500,               # 每训练50步进行一次评估
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="wandb",
    logging_dir="./logs",
    logging_steps=50,
    metric_for_best_model="accuracy",  # 或者 "f1"
    greater_is_better=True,
)
training_args.per_device_train_batch_size



2

In [27]:
model.gradient_checkpointing_enable()

In [27]:
# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # 早停机制
)

[2025-05-20 20:25:14,998] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [28]:
# 训练模型
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

In [None]:
# 评估模型
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")