In [25]:
import torch
import torch.nn as nn
from transformers import ModernBertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer,EarlyStoppingCallback
from datasets import load_dataset
import wandb
import os
import random 
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error


In [2]:
# 设置随机种子以确保结果可复现
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True

In [3]:

set_seed(3407)

In [None]:
# 自定义模型
class MultiTaskModel(ModernBertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.regressor = nn.Sequential(
            nn.Linear(config.hidden_size, 18),
            nn.Linear(18, 1))
        
    def forward(self, input_ids=None, attention_mask=None, label=None, score=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        
        # 分类任务
        _logits = self.classifier(pooled_output)
        
        # 回归任务
        _score = self.regressor(_logits).squeeze()
        
        # 计算联合损失
        loss = None
        loss_cls = loss_reg = None
        if label is not None and score is not None:
            loss_cls = nn.CrossEntropyLoss()(_logits, label)
            loss_reg = nn.MSELoss()(_score, score)
            loss = loss_cls + loss_reg

        print(loss)
            
        return {
            'loss': loss,
            'logits': _logits,
            'score': _score,
            'loss_cls': loss_cls,
            'loss_reg': loss_reg
        }

In [5]:
# 定义评估指标
def compute_metrics(eval_pred):
    logits = eval_pred.predictions['logits']
    scores_pred = eval_pred.predictions['score']
    labels = eval_pred.label_ids
    
    # 分类指标
    class_preds = np.argmax(logits, axis=-1)
    class_accuracy = accuracy_score(labels[:, 0], class_preds)
    
    # 回归指标
    mse = mean_squared_error(labels[:, 1], scores_pred)
    mae = np.mean(np.abs(labels[:, 1] - scores_pred))
    
    return {
        "accuracy": class_accuracy,
        "mse": mse,
        "mae": mae,
        "combined": (class_accuracy - mse)  # 自定义综合指标
    }

In [21]:
# 初始化wandb（先执行wandb login）
wandb.init(project="bert-modern_bert_multilingual_18_1-1")

In [7]:
# 初始化组件
model_name = "neavo/modern_bert_multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [9]:
# 加载数据集
dataset=load_dataset('json',data_files='/root/for_may_conference/remake/data_for_train_bert_need_shuffle.json')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'score'],
        num_rows: 78200
    })
})

In [10]:
label_mapping = {
    "其它": 0,
    "政务": 1,
    "娱乐": 2,
    "科学": 3,
    "教育": 4,
    "时政": 5,
    "新闻": 6,
    "农业": 7,
    "房地产": 8,
    "douban": 9,
    "法律1": 10,
    "经济": 11,
    "法律": 12,
    "学习强国": 13,
    "电力": 14,
    "政府工作报告": 15,
    "外交": 16,
    "企业": 17
}

In [11]:
dataset = dataset.map(lambda x: {
    "labels": [label_mapping[x["label"]], x["score"]]  # 合并标签
})

In [12]:
# 数据集随机打乱
shuffled_dataset = dataset["train"].shuffle(seed=3407)

In [13]:
train_test_split = shuffled_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [14]:
train_test_split = train_dataset.train_test_split(test_size=0.05)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [15]:
# 定义数据预处理函数
def preprocess_function(examples):
    # 假设您的JSON中每个样本的文本字段名为"text"
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=8912)

In [16]:
# 预处理数据集
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
tokenized_test = eval_dataset.map(preprocess_function, batched=True)

In [17]:
tokenized_train

Dataset({
    features: ['text', 'label', 'score', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 66861
})

In [18]:
tokenized_train[0]['labels']

[4.0, 0.467021226480655]

In [31]:
tokenized_train = tokenized_train.map(lambda x: {
    "label": [label_mapping[x["label"]]]  # 合并标签
})

Map: 100%|██████████| 66861/66861 [00:21<00:00, 3050.35 examples/s]


In [32]:
tokenized_train[0]['label']

[4]

In [33]:
tokenized_eval = tokenized_eval.map(lambda x: {
    "label": [label_mapping[x["label"]]]  # 合并标签
})

Map: 100%|██████████| 7820/7820 [00:03<00:00, 2178.74 examples/s]


In [47]:
model = MultiTaskModel.from_pretrained(model_name, num_labels=18)

In [48]:
# 训练参数
training_args = TrainingArguments(
    output_dir="./modern_bert_multilingual_18_1",
    evaluation_strategy="steps",  # 修改评估策略为按步数进行评估
    save_strategy="steps",        # 修改保存策略为按步数保存
    save_steps=2500,               # 每训练50步保存一次模型
    eval_steps=2500,               # 每训练50步进行一次评估
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="mse",
    greater_is_better=False,  # MSE越小越好
    logging_steps=50,
    # warmup_ratio=0.1,
    # fp16=True,                       # 混合精度训练
    report_to="wandb",
    run_name="modern_bert_multilingual_18_1-1",
)



In [49]:
model.to("cuda:3")

MultiTaskModel(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(151680, 768, padding_idx=151646)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertUnpaddedRotaryEmbedding(dim=64, base=160000.0, scale_base=None)
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, b

In [50]:
# 自定义训练器
class MultiTaskTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # 从输入中获取标签
        labels = inputs.pop("labels", None)
        scores = inputs.pop("scores", None)
        
        # 前向传播
        outputs = model(input_ids=inputs["input_ids"], 
                        attention_mask=inputs["attention_mask"], 
                        labels=labels, 
                        scores=scores)
        
        # 获取损失
        loss = outputs["loss"]
        # print(loss)
        
        return (loss, outputs) if return_outputs else loss
    
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        labels = inputs.get("labels")
        scores = inputs.get("scores")
        
        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], 
                           attention_mask=inputs["attention_mask"], 
                           labels=labels, 
                           scores=scores)
            
            loss = outputs["loss"]
            logits = outputs["logits"]
            score_preds = outputs["score"]
            
        if prediction_loss_only:
            return (loss, None, None)
        
        # 将标签合并为一个数组以便于评估
        combined_labels = torch.stack([labels, scores], dim=1) if labels is not None and scores is not None else None
        
        return (loss, (logits, score_preds), combined_labels)


In [51]:
# 初始化自定义训练器
trainer = MultiTaskTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


In [None]:

# 训练模型
trainer.train()

None
None
None
None


AttributeError: 'NoneType' object has no attribute 'mean'

: 

In [None]:
# 训练模型
trainer.train()

ValueError: too many dimensions 'str'

: 