In [9]:
import torch
from transformers import AutoTokenizer,AutoModel, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
from tqdm import tqdm
import random
from transformers import BertForSequenceClassification

In [10]:
from modelscope import snapshot_download
model_dir = snapshot_download('tiansz/bert-base-chinese')

In [11]:
# 加载预训练的BERT模型和分词器
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /mnt/workspace/.cache/modelscope/tiansz/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
class SentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['review']  # 使用标签索引
        label = 1 if self.dataframe.iloc[idx]['label'] == 1 else 0  # 使用标签索引
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [13]:
with open("train.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

data = []
for line in lines:
    label, text = line.strip().split("\t")
    data.append((int(label), text))

df = pd.DataFrame(data, columns=["label", "review"])

dataset = SentimentDataset(df, tokenizer, max_length=128)

# 划分训练集和验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [14]:
# 设置训练参数
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# 训练模型
model.train()
for epoch in range(3):
    for batch in tqdm(train_loader, desc="Epoch {}".format(epoch + 1)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Epoch 1: 100%|██████████| 1000/1000 [01:14<00:00, 13.51it/s]
Epoch 2: 100%|██████████| 1000/1000 [01:14<00:00, 13.37it/s]
Epoch 3: 100%|██████████| 1000/1000 [01:15<00:00, 13.32it/s]


In [47]:
# 验证集下评估模型
model.eval()
total_eval_accuracy = 0
for batch in tqdm(val_loader, desc="Evaluating"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        accuracy = (preds == labels).float().mean()
        total_eval_accuracy += accuracy.item()

average_eval_accuracy = total_eval_accuracy / len(val_loader)
print("Validation Accuracy:", average_eval_accuracy)


Evaluating: 100%|██████████| 250/250 [00:06<00:00, 41.61it/s]

Validation Accuracy: 0.7655





In [58]:
# 使用微调后的模型进行预测
def predict_sentiment(sentence):
    inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    positive_prob = probs[0][1].item()  # 1表示正面
    return positive_prob

def predict(sentence):
    positive_prob = predict_sentiment(sentence)
    threshold = 0.5  # 设置阈值
    if positive_prob > threshold:
        print("正面")
    else:
        print("负面")

predict("这家菜真好吃！")

正面


In [49]:
import torch
from torch.utils.data import Dataset

# 定义用于加载测试数据集的类
class TestDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.data = []
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                label, text = line.strip().split("\t")
                self.data.append((int(label), text))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label, text = self.data[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 加载测试数据集
test_dataset = TestDataset("test.txt", tokenizer, max_length=128)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

# 评估模型
model.eval()
total_eval_accuracy = 0
for batch in tqdm(test_loader, desc="Evaluating"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    
    logits = outputs.logits
    preds = torch.argmax(logits, dim=1)
    accuracy = (preds == labels).float().mean()
    total_eval_accuracy += accuracy.item()

average_eval_accuracy = total_eval_accuracy / len(test_loader)
print("Test Accuracy:", average_eval_accuracy)


Evaluating: 100%|██████████| 63/63 [00:01<00:00, 43.83it/s]

Test Accuracy: 0.9792857142857143





```mermaid
sequenceDiagram
    participant 数据准备
    participant 模型微调
    participant 模型评估
    participant 代码实现
    participant 自动生成测试数据

    数据准备 -> 模型微调: 准备数据集
    模型微调 -> 模型评估: 微调模型
    模型评估 -> 代码实现: 评估模型性能
    代码实现 -> 自动生成测试数据: 编写评估代码
    自动生成测试数据 --> 代码实现: 生成测试数据
    代码实现 -> 模型评估: 使用测试数据评估模型性能
```
