In [4]:
from google.colab import files

# 上传文件
uploaded = files.upload()

Saving ChnSentiCorp_htl_all.csv to ChnSentiCorp_htl_all.csv


In [2]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [5]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from torch.optim import AdamW  # 使用 PyTorch 的 AdamW

# -------------------------- 加载并预处理数据集 --------------------------
def load_data(file_path="./ChnSentiCorp_htl_all.csv"):
    """加载数据集并预处理"""
    dataset = load_dataset("csv", data_files=file_path)["train"]
    dataset = dataset.filter(lambda x: x["review"] is not None)  # 过滤空值
    return dataset

def preprocess_dataset(dataset, tokenizer):
    """对数据集进行 Tokenization"""
    def tokenize_function(examples):
        return tokenizer(examples["review"], truncation=True, max_length=128)

    # 使用 map 批量处理数据
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["review"]  # 移除原始文本列，保留 labels 和编码后的字段
    )
    return tokenized_dataset

# -------------------------- 划分数据集 --------------------------
def split_dataset(dataset, test_size=0.1):
    """划分训练集和验证集"""
    split_dataset = dataset.train_test_split(test_size=test_size, seed=42)
    trainset, validset = split_dataset["train"], split_dataset["test"]
    print(f"训练集大小: {len(trainset)}, 验证集大小: {len(validset)}")
    return trainset, validset

# -------------------------- 创建 DataLoader --------------------------
def create_dataloaders(trainset, validset, batch_size_train=32, batch_size_valid=64):
    """创建动态填充的数据加载器"""
    tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 创建 DataLoader
    trainloader = DataLoader(
        trainset,
        batch_size=batch_size_train,
        shuffle=True,
        collate_fn=data_collator
    )
    validloader = DataLoader(
        validset,
        batch_size=batch_size_valid,
        shuffle=False,
        collate_fn=data_collator
    )
    return trainloader, validloader, tokenizer

# -------------------------- 模型初始化 --------------------------
def initialize_model():
    """初始化模型和优化器"""
    model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = AdamW(model.parameters(), lr=2e-5)  # 使用 PyTorch 的 AdamW
    return model, optimizer

# -------------------------- 评估函数 --------------------------
def evaluate(model, validloader, validset):
    """计算验证集准确率"""
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc_num += (pred.long() == batch["labels"].long()).float().sum()
    return acc_num / len(validset)

# -------------------------- 训练函数 --------------------------
def train(model, optimizer, trainloader, validloader, validset, epochs=3, log_step=100):
    """训练模型"""
    global_step = 0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        running_loss = 0.0
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if global_step % log_step == 0:
                avg_loss = running_loss / (global_step + 1) if global_step > 0 else running_loss
                print(f"Step: {global_step}, Loss: {loss.item():.4f}, Avg Loss: {avg_loss:.4f}")
            global_step += 1
        acc = evaluate(model, validloader, validset)
        print(f"Epoch: {epoch}, Accuracy: {acc:.4f}")

# -------------------------- 预测函数 --------------------------
def predict(model, tokenizer, sentence):
    """单句预测"""
    model.eval()
    id2_label = {0: "差评！", 1: "好评！"}
    with torch.inference_mode():
        inputs = tokenizer(sentence, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=-1)
        print(f"输入：{sentence}\n预测结果：{id2_label.get(pred.item())}")

# -------------------------- 主程序 --------------------------
if __name__ == "__main__":
    # 加载数据
    dataset = load_data()
    # 初始化 Tokenizer
    tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
    # 预处理数据集（生成 input_ids）
    tokenized_dataset = preprocess_dataset(dataset, tokenizer)
    # 划分数据集
    trainset, validset = split_dataset(tokenized_dataset)
    # 创建 DataLoader
    trainloader, validloader, tokenizer = create_dataloaders(trainset, validset)
    # 初始化模型和优化器
    model, optimizer = initialize_model()
    # 训练模型
    train(model, optimizer, trainloader, validloader, validset)
    # 测试预测
    sen = "我觉得这家酒店不错，饭很好吃！"
    predict(model, tokenizer, sen)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/7765 [00:00<?, ? examples/s]

训练集大小: 6988, 验证集大小: 777


pytorch_model.bin:   0%|          | 0.00/156M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Step: 0, Loss: 0.8626, Avg Loss: 0.8626
Step: 100, Loss: 0.2666, Avg Loss: 0.4357
Step: 200, Loss: 0.4230, Avg Loss: 0.3787
Epoch: 0, Accuracy: 0.8970
Epoch 2/3
Step: 300, Loss: 0.1656, Avg Loss: 0.0636
Step: 400, Loss: 0.2221, Avg Loss: 0.1111
Epoch: 1, Accuracy: 0.8970
Epoch 3/3
Step: 500, Loss: 0.1995, Avg Loss: 0.0227
Step: 600, Loss: 0.2805, Avg Loss: 0.0502
Epoch: 2, Accuracy: 0.9073
输入：我觉得这家酒店不错，饭很好吃！
预测结果：好评！
