In [7]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoConfig, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np

class MultiTaskEmotionModel(nn.Module):
    def __init__(self, model_name="bert-base-chinese", num_labels=5):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name, config=self.config)
        hidden_size = self.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()  # output range: 0 ~ 1
        )

    def forward(self, input_ids, attention_mask=None, labels=None, intensities=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]

        logits = self.classifier(pooled)
        intensity_pred = self.regressor(pooled).squeeze(-1)

        loss = None
        if labels is not None and intensities is not None:
            loss_cls = nn.CrossEntropyLoss()(logits, labels)
            loss_reg = nn.MSELoss()(intensity_pred, intensities)
            loss = loss_cls + loss_reg

        return {
            "loss": loss,
            "logits": logits,
            "intensity": intensity_pred
        }

def compute_metrics(eval_pred):
    logits, intensity_preds = eval_pred.predictions
    logits = logits[0] if isinstance(logits, tuple) else logits
    preds = np.argmax(logits, axis=1)

    labels = eval_pred.label_ids["labels"]
    intensities = eval_pred.label_ids["intensities"]

    acc = accuracy_score(labels, preds)
    mse = mean_squared_error(intensities, intensity_preds)

    return {"accuracy": acc, "mse": mse}

def tokenize(example, tokenizer):
    # print(example)
    result = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    result["labels"] = example["labels"]
    result["intensities"] = example["five_class_label"]
    return result



def load_data(tokenizer):
    # # 示例数据加载（你可以换成自己的）
    # raw = load_dataset("go_emotions", "simplified")  # or load custom dataset
    # # raw = raw.remove_columns(["id", "comments", "emotion"])

    # # 模拟强度值（真实使用你自己的数据）
    # def add_intensity(example):
    #     example["intensities"] = np.random.uniform(0.5, 1.0) if example["labels"] in [3,4] else np.random.uniform(0.0, 0.5)
    #     return example

    # raw = raw.map(add_intensity)
    # return raw.map(lambda x: tokenize(x, tokenizer), batched=True)
    from datasets import load_dataset
    from collections import Counter

    # 加载数据
    dataset = load_dataset("go_emotions", "simplified")

    # 五类情绪映射
    five_class_mapping = {
        'anger': 0, 'disgust': 0, 'fear': 0, 'sadness': 0,
        'annoyance': 1, 'disappointment': 1, 'embarrassment': 1, 'grief': 1, 'nervousness': 1,
        'neutral': 2, 'confusion': 2, 'curiosity': 2, 'realization': 2, 'desire': 2,
        'approval': 3, 'love': 3, 'joy': 3, 'optimism': 3,
        'admiration': 4, 'excitement': 4, 'gratitude': 4, 'pride': 4, 'relief': 4
    }

    id2label = ['very negative', 'negative', 'neutral', 'positive', 'very positive']
    label2id = {v: k for k, v in enumerate(id2label)}

    label_names = dataset['train'].features['labels'].feature.names

    # 多标签情感 → 单个五类情绪映射
    def map_emotion(example):
        label_ids = example['labels']
        if not label_ids:
            return {'five_class_label': 2}  # 若无情绪标签则为 neutral

        # 将所有原始标签映射为五类
        five_class_votes = []
        for i in label_ids:
            emotion = label_names[i]
            if emotion in five_class_mapping:
                five_class_votes.append(five_class_mapping[emotion])
        
        # 若所有原始标签都不在映射表，设为 neutral
        if not five_class_votes:
            return {'five_class_label': 2}

        # 选择出现次数最多的五类标签作为最终 label（也可选最“极端”的 max/min）
        count = Counter(five_class_votes)
        most_common = count.most_common(1)[0][0]
        return {'five_class_label': most_common}

    

    # 应用映射
    for split in dataset.keys():
        dataset[split] = dataset[split].map(map_emotion)

    # 示例打印
    print("Example:")
    print(dataset['train'][0]['text'])
    print("Labels:", [label_names[i] for i in dataset['train'][0]['labels']])
    print("Five-class label:", id2label[dataset['train'][0]['five_class_label']])
    return dataset.map(lambda x: tokenize(x, tokenizer), batched=True)



def main():
    base_model_path = "./go_emotions_model_1"  # 预训练模型路径
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)
    raw_dataset = load_data(tokenizer)

    train_dataset = raw_dataset["train"]
    val_dataset = raw_dataset["validation"]

    # 初始化多任务模型（不加载原分类头）
    model = MultiTaskEmotionModel(model_name=base_model_path)

    # 从旧模型中加载 encoder 权重
    pretrained = AutoModel.from_pretrained(base_model_path)
    model.bert.load_state_dict(pretrained.state_dict())

    # 定义 Trainer 所需 TrainingArguments
    training_args = TrainingArguments(
        output_dir="./multitask_results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    # HuggingFace 的 Trainer 默认不支持多输出，我们需要用 wrapper
    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.pop("labels")
            intensities = inputs.pop("intensities")
            outputs = model(**inputs, labels=labels, intensities=intensities)
            loss = outputs["loss"]
            return (loss, outputs) if return_outputs else loss

        def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
            labels = inputs["labels"]
            intensities = inputs["intensities"]
            inputs = {k: v for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}

            with torch.no_grad():
                outputs = model(**inputs)
            logits = outputs["logits"]
            preds = outputs["intensity"]
            return None, (logits.detach().cpu().numpy(), preds.detach().cpu().numpy()), {"labels": labels.cpu().numpy(), "intensities": intensities.cpu().numpy()}

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

if __name__ == "__main__":
    main()


Example:
My favourite food is anything I didn't have to cook myself.
Labels: ['neutral']
Five-class label: neutral


Map: 100%|██████████| 43410/43410 [00:06<00:00, 6642.34 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 6428.33 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 6440.90 examples/s]
  trainer = CustomTrainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).