In [2]:
# Colab-only step: open the browser upload dialog so the dataset CSV can be
# copied into the runtime's working directory, where the later cells read it
# by relative path. The result is kept in `uploaded`.
# noinspection PyUnresolvedReferences
from google.colab import files
uploaded = files.upload()


Saving ChnSentiCorp_htl_all.csv to ChnSentiCorp_htl_all.csv


In [3]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import Adam
import torch

class MyDataset(Dataset):
    """Map-style dataset backed by a CSV file of labelled reviews.

    The CSV is read with pandas and every row containing a missing value is
    dropped. Items are ``(review, label)`` pairs taken from the columns of the
    same names, addressed by position among the remaining rows.
    """

    def __init__(self, file_path="./ChnSentiCorp_htl_all.csv"):
        super().__init__()
        frame = pd.read_csv(file_path)
        self.data = frame.dropna()

    def __len__(self):
        """Number of rows left after dropping incomplete ones."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair stored at position ``index``."""
        row = self.data.iloc[index]
        return row["review"], row["label"]

def load_data(file_path="./ChnSentiCorp_htl_all.csv"):
    """Build the dataset from ``file_path`` and print a short preview.

    Args:
        file_path: Path of the CSV file handed to ``MyDataset``.

    Returns:
        The constructed ``MyDataset`` instance.
    """
    dataset = MyDataset(file_path)
    print("前5个样本:")
    # Clamp the preview to the dataset size so files with fewer than five
    # usable rows do not raise IndexError.
    for i in range(min(5, len(dataset))):
        print(dataset[i])
    return dataset

def split_dataset(dataset, train_ratio=0.9, seed=None):
    """Randomly split ``dataset`` into a training and a validation subset.

    Args:
        dataset: Any sized, indexable dataset accepted by ``random_split``.
        train_ratio: Fraction of samples assigned to the training subset; the
            training size is ``int(len(dataset) * train_ratio)`` and the
            validation subset receives the remainder.
        seed: Optional integer. When given, the split is drawn from a
            dedicated ``torch.Generator`` seeded with it, so the same seed
            always yields the same partition. When ``None`` (default) the
            global PyTorch RNG is used, as before.

    Returns:
        A ``(trainset, validset)`` tuple of ``Subset`` objects.
    """
    train_size = int(len(dataset) * train_ratio)
    valid_size = len(dataset) - train_size
    lengths = [train_size, valid_size]
    if seed is None:
        trainset, validset = random_split(dataset, lengths)
    else:
        # A private generator keeps the split reproducible without touching
        # the global RNG state used elsewhere.
        generator = torch.Generator().manual_seed(seed)
        trainset, validset = random_split(dataset, lengths, generator=generator)
    print(f"训练集大小: {len(trainset)}, 验证集大小: {len(validset)}")
    return trainset, validset

def create_dataloaders(trainset, validset, batch_size_train=32, batch_size_valid=64,
                       model_name="hfl/rbt3", max_length=128):
    """Create the training and validation data loaders plus their tokenizer.

    Args:
        trainset: Dataset yielding ``(text, label)`` pairs for training.
        validset: Dataset yielding ``(text, label)`` pairs for validation.
        batch_size_train: Batch size of the (shuffled) training loader.
        batch_size_valid: Batch size of the (unshuffled) validation loader.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        max_length: Token length every sample is padded or truncated to.

    Returns:
        ``(trainloader, validloader, tokenizer)``. The tokenizer is returned so
        callers can reuse it at inference time.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def collate_func(batch):
        # Turn a list of (text, label) pairs into one tokenized batch that also
        # carries the labels.
        texts, labels = zip(*batch)
        inputs = tokenizer(texts, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
        inputs["labels"] = torch.tensor(labels)
        return inputs

    trainloader = DataLoader(trainset, batch_size=batch_size_train, shuffle=True, collate_fn=collate_func)
    validloader = DataLoader(validset, batch_size=batch_size_valid, shuffle=False, collate_fn=collate_func)
    return trainloader, validloader, tokenizer
def initialize_model(model_name="hfl/rbt3", lr=2e-5):
    """Load the sequence-classification model and build its optimizer.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        lr: Learning rate of the Adam optimizer.

    Returns:
        ``(model, optimizer)``. The model is moved to the GPU when CUDA is
        available; the optimizer is created afterwards, over the model's
        parameters.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    if torch.cuda.is_available():
        model = model.cuda()
    optimizer = Adam(model.parameters(), lr=lr)
    return model, optimizer

def evaluate(model, validloader):
    """Compute classification accuracy of ``model`` over ``validloader``.

    Args:
        model: Callable model whose output exposes ``logits``; it is switched
            to evaluation mode.
        validloader: Iterable of batches (mappings of tensors) that include a
            ``"labels"`` entry.

    Returns:
        The fraction of correctly predicted samples: a 0-dim tensor when at
        least one batch was seen, ``0.0`` when the loader is empty.
    """
    model.eval()
    acc_num = 0
    total = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            labels = batch["labels"]
            acc_num += (pred.long() == labels.long()).float().sum()
            # Count what was actually evaluated instead of relying on a
            # module-level dataset variable for the denominator.
            total += labels.numel()
    if total == 0:
        return 0.0
    return acc_num / total

def train(model, optimizer, trainloader, validloader, epochs=3, log_step=100):
    """Run the training loop and report validation accuracy after each epoch.

    For every batch the gradients are reset, the loss returned by the model is
    back-propagated and the optimizer takes one step. The loss is printed each
    time the running step counter (which carries over across epochs) is a
    multiple of ``log_step``.
    """

    def _on_device(tensors):
        # Move the batch to the GPU only when one is available.
        if not torch.cuda.is_available():
            return tensors
        return {name: value.cuda() for name, value in tensors.items()}

    step = 0
    for epoch_idx in range(epochs):
        model.train()
        for raw_batch in trainloader:
            batch = _on_device(raw_batch)
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if step % log_step == 0:
                print(f"Epoch: {epoch_idx}, Global Step: {step}, Loss: {loss.item()}")
            step += 1
        accuracy = evaluate(model, validloader)
        print(f"Epoch: {epoch_idx}, Accuracy: {accuracy}")

def predict(model, tokenizer, sentence):
    """Classify a single sentence, print the result and return it.

    Args:
        model: Callable model whose output exposes ``logits``; it is switched
            to evaluation mode.
        tokenizer: Callable that turns ``sentence`` into a mapping of tensors
            when invoked with ``return_tensors="pt"``.
        sentence: The text to classify.

    Returns:
        The label string for the predicted class id (0 or 1), or ``None`` when
        the predicted id has no entry in the label table.
    """
    model.eval()
    id2_label = {0: "差评！", 1: "好评！"}
    with torch.inference_mode():
        inputs = tokenizer(sentence, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=-1)
    label = id2_label.get(pred.item())
    print(f"输入：{sentence}\n模型预测结果:{label}")
    # Return the label as well so callers can use the prediction in code
    # rather than only reading it from stdout.
    return label

if __name__ == "__main__":
    # End-to-end run: load the data, split it, fine-tune, then classify one
    # sample sentence. All names bound below are module-level variables.

    # Load the data
    dataset = load_data()

    # Split into training and validation subsets
    trainset, validset = split_dataset(dataset)

    # Build the data loaders (the tokenizer is kept for inference below)
    trainloader, validloader, tokenizer = create_dataloaders(trainset, validset)
    # Initialize the model and the optimizer
    model, optimizer = initialize_model()

    # Train the model
    train(model, optimizer, trainloader, validloader)

    # Try a prediction on a sample sentence
    sen = "我觉得这家酒店不错，饭很好吃！"
    predict(model, tokenizer, sen)


前5个样本:
('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', 1)
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', 1)
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', 1)
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', 1)
训练集大小: 6988, 验证集大小: 777


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/156M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Global Step: 0, Loss: 0.6795675158500671
Epoch: 0, Global Step: 100, Loss: 0.7331611514091492
Epoch: 0, Global Step: 200, Loss: 0.30157509446144104
Epoch: 0, Accuracy: 0.8687258958816528
Epoch: 1, Global Step: 300, Loss: 0.19014620780944824
Epoch: 1, Global Step: 400, Loss: 0.19437932968139648
Epoch: 1, Accuracy: 0.8931788802146912
Epoch: 2, Global Step: 500, Loss: 0.17168833315372467
Epoch: 2, Global Step: 600, Loss: 0.27399080991744995
Epoch: 2, Accuracy: 0.8803088665008545
输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！
