In [108]:
!pip install transformers datasets torch pandas scikit-learn numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# 环境要求：Python 3.8+, PyTorch 2.0+, Transformers 4.30+
# 安装依赖：pip install transformers datasets torch pandas scikit-learn

import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    AutoConfig
)
# from transformers import AutoConfig, AutoModelForSequenceClassification

from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split


# 参数配置
MAX_LEN = 512
BATCH_SIZE = 32  # 4090显存充足可以适当增大
EPOCHS = 3
OLD_MODEL_PATH = "/Users/minghuayao/+++AutoDL/models/bert-base-uncased/"
NEW_MODEL_PATH = "/Users/minghuayao/+++AutoDL/models/new/"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os
from datasets import DatasetDict, Dataset
import pandas as pd

# 初始化tokenizer
tokenizer = AutoTokenizer.from_pretrained(OLD_MODEL_PATH+"tokenizer")

def load_and_preprocess_data():

    # 示例数据 - 实际应替换为真实数据加载
    # 假设原始DataFrame包含text、split、label三列
    data = {
        "text": ["This is positive example.", "Negative sentence here.", "Another neutral text."],
        "split": ["train", "train", "test"],
        "label": [1, 0, 2]
    }
    df = pd.DataFrame(data)
    
    # 创建训练测试分割
    train_df = df[df["split"] == "train"]
    
    # 创建DatasetDict并彻底清除索引
    final_dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "test": Dataset.from_pandas(
            df[df["split"] == "test"], 
            preserve_index=False
        )
    })

    # 单步处理函数
    def process_batch(examples):
        # 分词处理
        tokenized = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_token_type_ids=False  # 可选：是否返回token类型ID
        )
        
        # 直接返回结构化数据
        return {
            **tokenized,
            "labels": examples["label"]
        }

    # 应用处理并清除所有原始列
    tokenized_dataset = final_dataset.map(
        process_batch,
        batched=True,
        remove_columns=["text", "split", "label"],  # 仅删除确切存在的列
        desc="Processing text data"
    )
    
    # 显式设置张量格式
    tokenized_dataset.set_format(
        "torch",
        columns=["input_ids", "attention_mask", "labels"]
    )

    # ==== 新增部分 ====
    # 收集所有标签值并创建映射
    all_labels = (
        final_dataset["train"]["label"] 
        + final_dataset["test"]["label"]
    )
    unique_labels = sorted(np.unique(all_labels))
    label2id = {orig: new for new, orig in enumerate(unique_labels)}
    
    def process_batch(examples):
        tokenized = tokenizer(examples["text"], ...)
        
        # 转换原始标签为从0开始
        converted_labels = [label2id[lbl] for lbl in examples["label"]]
        
        return {
            **tokenized,
            "labels": converted_labels  # 使用转换后的标签
        }
    
    return tokenized_dataset, label2id


def create_model(num_labels):
    config = AutoConfig.from_pretrained(
        OLD_MODEL_PATH,
        num_labels=num_labels,  # 动态设置分类数量
        id2label={v: str(v) for v in range(num_labels)}
    )
    return AutoModelForSequenceClassification.from_pretrained(OLD_MODEL_PATH, config=config)


# 训练函数
def train_model(model, train_loader, val_loader):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    best_accuracy = 0
    for epoch in range(EPOCHS):
        # 训练阶段
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        avg_train_loss = total_loss / len(train_loader)
        
        # 验证阶段
        model.eval()
        val_preds = []
        val_labels = []
        for batch in val_loader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            with torch.no_grad():
                #outputs = model(**batch)
                # 应改为显式参数传递：
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(batch["labels"].cpu().numpy())

        accuracy = accuracy_score(val_labels, val_preds)
        print(f"Epoch {epoch+1}/{EPOCHS}")
        print(f"Train loss: {avg_train_loss:.4f}, Val accuracy: {accuracy:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy

    print(f"Training complete. Best validation accuracy: {best_accuracy:.4f}")

OSError: Can't load tokenizer for '/Users/minghuayao/+++AutoDL/models/bert-base-uncased/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/Users/minghuayao/+++AutoDL/models/bert-base-uncased/' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.

In [None]:
from torch.serialization import safe_globals, add_safe_globals
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

add_safe_globals([
    (np, 'dtype'),  # 允许dtype的构造
    (np.core.multiarray, '_reconstruct'), 
    (np, '_DType'),  # numpy的dtype元类型
])

def load_safe_model(path):
    # 加载检查点时强制使用安全模式
    checkpoint = torch.load(path, map_location='cpu', weights_only=True)

    # 重建配置参数（在运行时动态恢复dtype）
    config_params = checkpoint['config'].copy()
    if 'torch_dtype' in checkpoint['dtype_mapping']:
        config_params['torch_dtype'] = getattr(
            torch, 
            checkpoint['dtype_mapping']['torch_dtype']
        )
    
    # 使用transformers的安全配置接口
    config = AutoConfig.for_model(
        **config_params
    )
    config.num_labels = checkpoint["num_labels"]
    
    # 实例化并加载模型
    model = AutoModelForSequenceClassification.from_config(config)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model


def predict(new_model_path, text):
    """预测时应明确分割模型目录和tokenizer目录""" 
    model_dir = os.path.dirname(new_model_path)  # 提取目录路径
    model_file = os.path.basename(new_model_path)

    checkpoint = torch.load(
        new_model_path,
        map_location=DEVICE,
        weights_only=True
    )
    
    # 独立重建关键参数
    config = AutoConfig.from_pretrained(
        new_model_path,
        **{
            k: v for k, v in checkpoint["config"].items()
            if not k.startswith('_')  # 规避内部参数
        }
    )
    
    # 特殊处理分类头参数
    config.num_labels = checkpoint["num_labels"]
    config.id2label = {i: str(i) for i in range(config.num_labels)}

    # 实例化模型
    model = load_safe_model(new_model_path)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    
    # 编码与预测
    tokenizer_path = os.path.join(model_dir, "tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(DEVICE)
    
    with torch.no_grad():
        outputs = model(**inputs)
        pred_id = outputs.logits.argmax().item()
    
    # 反向标签映射
    id2label = {v: k for k, v in checkpoint["label_map"].items()}
    return id2label[pred_id]


In [None]:
from torch.utils.data.dataloader import default_collate
from pathlib import Path

# 主程序
if __name__ == "__main__":    
    dataset, label_map = load_and_preprocess_data()
    actual_labels = len(label_map)

    # 创建自定义collate函数
    def custom_collate(batch):
        # 自动转换为张量并处理嵌套字典
        processed_batch = {}
        for key in batch[0].keys():
            if key == "label":
                processed_batch[key] = torch.stack([torch.tensor(item[key]) for item in batch])
            else:
                processed_batch[key] = default_collate([item[key] for item in batch])
        return processed_batch    
    
    # 使用修正的collate_fn创建DataLoader
    train_loader = DataLoader(
        dataset["train"],
        batch_size=32,
        shuffle=True,
        collate_fn=custom_collate
    )
    val_loader = DataLoader(
        dataset["test"],
        batch_size=32,
        collate_fn=custom_collate
    )

    # 初始化模型
    model = create_model(num_labels=actual_labels)

    # 训练模型
    train_model(model, train_loader, val_loader)

    def deep_convert_dtypes(obj):
        """彻底转换所有层级的dtype为字符串"""
        if isinstance(obj, np.dtype):
            return str(obj)  # 转换为标准字符串表示如 'int64'
        elif isinstance(obj, np.generic):
            return obj.item()  # numpy标量转Python类型
        elif isinstance(obj, dict):
            return {k: deep_convert_dtypes(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            return type(obj)(deep_convert_dtypes(v) for v in obj)
        return obj

    # 处理模型配置的每个参数
    original_config = model.config.to_dict()
    sanitized_config = deep_convert_dtypes(original_config)

    # 添加二次验证（确保没有遗留dtype）
    assert not any(isinstance(v, np.dtype) for v in sanitized_config.values()), "发现未转换的dtype"

    # 保存检查点时强制所有数值类型为原生类型
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "config": {
            k: int(v) if isinstance(v, np.integer) else 
            float(v) if isinstance(v, np.floating) else
            v 
            for k, v in sanitized_config.items()
        },
        "num_labels": int(len(label_map)),
        "dtype_mapping": {  # 显式记录dtype转换关系
            "torch_dtype": "int64"  
        }
    }

    # ====== 模型保存时需同时保存tokenizer ======
    def save_model_and_tokenizer(model, tokenizer, new_model_path):
        """完整的模型保存函数"""
        # 创建目录
        os.makedirs(new_model_path, exist_ok=True)
        
        # 保存模型检查点
        torch.save(checkpoint, os.path.join(new_model_path, "final_bert_imdb_model.pt"))
        
        # 保存tokenizer到独立的子目录 (关键步骤)
        tokenizer.save_pretrained(new_model_path+"tokenizer")
        
        print(f"模型与tokenizer已保存至 {new_model_path}")

    save_model_and_tokenizer(model, tokenizer, NEW_MODEL_PATH)

Processing text data: 100%|██████████| 2/2 [00:00<00:00, 33.19 examples/s]
Processing text data: 100%|██████████| 1/1 [00:00<00:00, 437.50 examples/s]
Some weights of the model checkpoint at /Users/minghuayao/+++AutoDL/models/bert-base-uncased/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (init

Epoch 1/3
Train loss: 1.4785, Val accuracy: 1.0000
Epoch 2/3
Train loss: 1.2819, Val accuracy: 1.0000
Epoch 3/3
Train loss: 1.3463, Val accuracy: 1.0000
Training complete. Best validation accuracy: 1.0000
模型与tokenizer已保存至 /Users/minghuayao/+++AutoDL/models/bert-base-uncased/


In [None]:

# 测试加载
model = load_safe_model(NEW_MODEL_PATH+"final_bert_imdb_model.pt")
print("模型加载成功!")


模型加载成功!


In [None]:
# 使用样例
print(predict(NEW_MODEL_PATH+"final_bert_imdb_model.pt", "This movie is absolutely wonderful!"))

KeyError: 'label_map'

In [None]:
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cpu
